// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *      NET3    Protocol independent device support routines.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
* Paul Rusty Russell : SIOCSIFNAME * Pekka Riikonen : Netdev boot-time settings code * Andrew Morton : Make unregister_netdevice wait * indefinitely on dev->refcnt * J Hadi Salim : - Backlog queue sampling * - netif_rx() feedback */ #include <linux/uaccess.h> #include <linux/bitmap.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/skbuff.h> #include <linux/kthread.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> #include <net/dsa.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/gro.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> #include <net/tcx.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netpoll.h> #include <linux/rcupdate.h> #include <linux/delay.h> #include <net/iw_handler.h> #include <asm/current.h> #include <linux/audit.h> #include <linux/dmaengine.h> #include <linux/err.h> #include <linux/ctype.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> #include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> #include <trace/events/napi.h> #include <trace/events/net.h> #include <trace/events/skb.h> #include <trace/events/qdisc.h> #include <trace/events/xdp.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> #include <linux/hashtable.h> #include <linux/vmalloc.h> #include <linux/if_macvlan.h> #include <linux/errqueue.h> #include <linux/hrtimer.h> #include <linux/netfilter_netdev.h> #include <linux/crash_dump.h> #include <linux/sctp.h> #include <net/udp_tunnel.h> #include <linux/net_namespace.h> #include <linux/indirect_call_wrapper.h> #include <net/devlink.h> #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> #include <net/netdev_rx_queue.h> #include "dev.h" #include "net-sysfs.h" static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all __read_mostly; /* Taps */ static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. * * Pure readers hold dev_base_lock for reading, or rcu_read_lock() * * Writers must hold the rtnl semaphore while they loop through the * dev_base_head list, and hold dev_base_lock for writing when they do the * actual updates. This allows pure readers to access the list even * while a writer is preparing to update it. * * To put it another way, dev_base_lock is held for writing only to * protect against pure readers; the rtnl semaphore provides the * protection against other writers. 
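 *
 * For illustration only (a minimal sketch of the rules above, not an
 * exact copy of a caller in this file): a pure reader may do
 *
 *	read_lock(&dev_base_lock);
 *	dev = __dev_get_by_index(net, ifindex);
 *	...
 *	read_unlock(&dev_base_lock);
 *
 * or use rcu_read_lock() instead, while a writer holds the rtnl
 * semaphore and takes dev_base_lock only around the actual list update:
 *
 *	ASSERT_RTNL();
 *	write_lock(&dev_base_lock);
 *	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 *	write_unlock(&dev_base_lock);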
* * See, for example usages, register_netdevice() and * unregister_netdevice(), which must be called with the rtnl * semaphore held. */ DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static DECLARE_RWSEM(devnet_rename_sem); static inline void dev_base_seq_inc(struct net *net) { while (++net->dev_base_seq == 0) ; } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) { return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } static inline void rps_lock_irqsave(struct softnet_data *sd, unsigned long *flags) { if (IS_ENABLED(CONFIG_RPS)) spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_save(*flags); } static inline void rps_lock_irq_disable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS)) spin_lock_irq(&sd->input_pkt_queue.lock); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_disable(); } static inline void rps_unlock_irq_restore(struct softnet_data *sd, unsigned long *flags) { if (IS_ENABLED(CONFIG_RPS)) spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_restore(*flags); } static inline void rps_unlock_irq_enable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS)) spin_unlock_irq(&sd->input_pkt_queue.lock); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_enable(); } static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); if (!name_node) return NULL; INIT_HLIST_NODE(&name_node->hlist); name_node->dev = dev; name_node->name = name; return name_node; } static struct netdev_name_node * netdev_name_node_head_alloc(struct net_device *dev) { struct netdev_name_node *name_node; name_node = netdev_name_node_alloc(dev, dev->name); if (!name_node) return NULL; INIT_LIST_HEAD(&name_node->list); return name_node; } static void netdev_name_node_free(struct netdev_name_node *name_node) { kfree(name_node); } static void netdev_name_node_add(struct net *net, struct netdev_name_node *name_node) { hlist_add_head_rcu(&name_node->hlist, dev_name_hash(net, name_node->name)); } static void netdev_name_node_del(struct netdev_name_node *name_node) { hlist_del_rcu(&name_node->hlist); } static struct netdev_name_node *netdev_name_node_lookup(struct net *net, const char *name) { struct hlist_head *head = dev_name_hash(net, name); struct netdev_name_node *name_node; hlist_for_each_entry(name_node, head, hlist) if (!strcmp(name_node->name, name)) return name_node; return NULL; } static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, const char *name) { struct hlist_head *head = dev_name_hash(net, name); struct netdev_name_node *name_node; hlist_for_each_entry_rcu(name_node, head, hlist) if (!strcmp(name_node->name, name)) return name_node; return NULL; } bool netdev_name_in_use(struct net *net, const char *name) { return netdev_name_node_lookup(net, name); } EXPORT_SYMBOL(netdev_name_in_use); int netdev_name_node_alt_create(struct net_device *dev, const 
char *name) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); name_node = netdev_name_node_lookup(net, name); if (name_node) return -EEXIST; name_node = netdev_name_node_alloc(dev, name); if (!name_node) return -ENOMEM; netdev_name_node_add(net, name_node); /* The node that holds dev->name acts as a head of per-device list. */ list_add_tail(&name_node->list, &dev->name_node->list); return 0; } static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) { list_del(&name_node->list); kfree(name_node->name); netdev_name_node_free(name_node); } int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); name_node = netdev_name_node_lookup(net, name); if (!name_node) return -ENOENT; /* lookup might have found our primary name or a name belonging * to another device. */ if (name_node == dev->name_node || name_node->dev != dev) return -EINVAL; netdev_name_node_del(name_node); synchronize_rcu(); __netdev_name_node_alt_destroy(name_node); return 0; } static void netdev_name_node_alt_flush(struct net_device *dev) { struct netdev_name_node *name_node, *tmp; list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) __netdev_name_node_alt_destroy(name_node); } /* Device list insertion */ static void list_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock(&dev_base_lock); netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); /* We reserved the ifindex, this can't fail */ WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ static void unlist_netdevice(struct net_device *dev, bool lock) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); xa_erase(&net->dev_by_index, dev->ifindex); netdev_for_each_altname(dev, name_node) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ if (lock) write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); if (lock) write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } /* * Our notifier list */ static RAW_NOTIFIER_HEAD(netdev_chain); /* * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. 
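 *
 * (Illustrative sketch only: a typical driver receive path feeds this
 * queue via something like
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *
 * and the queued packets are drained later in softirq context.)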
*/ DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ static const unsigned short netdev_lock_type[] = { ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; static const char *const netdev_lock_name[] = { "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; static inline unsigned short netdev_lock_pos(unsigned short dev_type) { int i; for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) if (netdev_lock_type[i] == dev_type) return i; /* the last key is used by default */ return ARRAY_SIZE(netdev_lock_type) - 1; } static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { int i; i = netdev_lock_pos(dev_type); lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], netdev_lock_name[i]); } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { int i; i = netdev_lock_pos(dev->type); lockdep_set_class_and_name(&dev->addr_list_lock, &netdev_addr_lock_key[i], netdev_lock_name[i]); } #else static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { } #endif /******************************************************************************* * * Protocol management and registration routines * *******************************************************************************/ /* * Add a protocol ID to the list. Now that the input handler is * smarter we can dispense with all the messy stuff that used to be * here. * * BEWARE!!! 
Protocol handlers, mangling input packets, * MUST BE last in hash buckets and checking protocol handlers * MUST start from promiscuous ptype_all chain in net_bh. * It is true now, do not change it. * Explanation follows: if protocol handler, mangling packet, will * be the first on list, it is not able to sense, that packet * is cloned and should be copied-on-write, so that it will * change it and subsequent readers will get broken packet. * --ANK (980803) */ static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) return pt->dev ? &pt->dev->ptype_all : &ptype_all; else return pt->dev ? &pt->dev->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** * dev_add_pack - add packet handler * @pt: packet type declaration * * Add a protocol handler to the networking stack. The passed &packet_type * is linked into kernel lists and may not be freed until it has been * removed from the kernel lists. * * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new packet type (until the next received packet). */ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); } EXPORT_SYMBOL(dev_add_pack); /** * __dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * The packet type might still be in use by receivers * and must not be freed until after all the CPU's have gone * through a quiescent state. */ void __dev_remove_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); struct packet_type *pt1; spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { if (pt == pt1) { list_del_rcu(&pt->list); goto out; } } pr_warn("dev_remove_pack: %p not found\n", pt); out: spin_unlock(&ptype_lock); } EXPORT_SYMBOL(__dev_remove_pack); /** * dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * This call sleeps to guarantee that no CPU is looking at the packet * type after return. */ void dev_remove_pack(struct packet_type *pt) { __dev_remove_pack(pt); synchronize_net(); } EXPORT_SYMBOL(dev_remove_pack); /******************************************************************************* * * Device Interface Subroutines * *******************************************************************************/ /** * dev_get_iflink - get 'iflink' value of a interface * @dev: targeted interface * * Indicates the ifindex the interface is linked to. * Physical interfaces have the same 'ifindex' and 'iflink' values. */ int dev_get_iflink(const struct net_device *dev) { if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev); return dev->ifindex; } EXPORT_SYMBOL(dev_get_iflink); /** * dev_fill_metadata_dst - Retrieve tunnel egress information. * @dev: targeted interface * @skb: The packet. * * For better visibility of tunnel traffic OVS needs to retrieve * egress tunnel information for a packet. Following API allows * user to get this info. 
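 *
 * A minimal sketch of a hypothetical caller (the drop label is an
 * assumption, not code from this file):
 *
 *	struct ip_tunnel_info *info;
 *
 *	if (dev_fill_metadata_dst(dev, skb))
 *		goto drop;
 *	info = skb_tunnel_info(skb);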
*/ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info; if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) return -EINVAL; info = skb_tunnel_info_unclone(skb); if (!info) return -ENOMEM; if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) return -EINVAL; return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); } EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) { int k = stack->num_paths++; if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) return NULL; return &stack->path[k]; } int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, struct net_device_path_stack *stack) { const struct net_device *last_dev; struct net_device_path_ctx ctx = { .dev = dev, }; struct net_device_path *path; int ret = 0; memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); stack->num_paths = 0; while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { last_dev = ctx.dev; path = dev_fwd_path(stack); if (!path) return -1; memset(path, 0, sizeof(struct net_device_path)); ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); if (ret < 0) return -1; if (WARN_ON_ONCE(last_dev == ctx.dev)) return -1; } if (!ctx.dev) return ret; path = dev_fwd_path(stack); if (!path) return -1; path->type = DEV_PATH_ETHERNET; path->dev = ctx.dev; return ret; } EXPORT_SYMBOL_GPL(dev_fill_forward_path); /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. Must be called under RTNL semaphore * or @dev_base_lock. If the name is found a pointer to the device * is returned. If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ struct net_device *__dev_get_by_name(struct net *net, const char *name) { struct netdev_name_node *node_name; node_name = netdev_name_node_lookup(net, name); return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); /** * dev_get_by_name_rcu - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. */ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { struct netdev_name_node *node_name; node_name = netdev_name_node_lookup_rcu(net, name); return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); /* Deprecated for new users, call netdev_get_by_name() instead */ struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_name); /** * netdev_get_by_name() - find a device by its name * @net: the applicable net namespace * @name: name to find * @tracker: tracking object for the acquired reference * @gfp: allocation flags for the tracker * * Find an interface by name. This can be called from any * context and does its own locking. The returned handle has * the usage count incremented and the caller must use netdev_put() to * release it when it is no longer needed. %NULL is returned if no * matching device is found. 
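 *
 * A minimal usage sketch (the caller, "eth0" and the error handling are
 * hypothetical):
 *
 *	netdevice_tracker tracker;
 *	struct net_device *dev;
 *
 *	dev = netdev_get_by_name(net, "eth0", &tracker, GFP_KERNEL);
 *	if (!dev)
 *		return -ENODEV;
 *	...
 *	netdev_put(dev, &tracker);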
*/ struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; dev = dev_get_by_name(net, name); if (dev) netdev_tracker_alloc(dev, tracker, gfp); return dev; } EXPORT_SYMBOL(netdev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold either the RTNL semaphore * or @dev_base_lock. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(__dev_get_by_index); /** * dev_get_by_index_rcu - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry_rcu(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(dev_get_by_index_rcu); /* Deprecated for new users, call netdev_get_by_index() instead */ struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_index); /** * netdev_get_by_index() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * @tracker: tracking object for the acquired reference * @gfp: allocation flags for the tracker * * Search for an interface by index. Returns NULL if the device * is not found or a pointer to the device. The device returned has * had a reference added and the pointer is safe until the user calls * netdev_put() to indicate they have finished with it. */ struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; dev = dev_get_by_index(net, ifindex); if (dev) netdev_tracker_alloc(dev, tracker, gfp); return dev; } EXPORT_SYMBOL(netdev_get_by_index); /** * dev_get_by_napi_id - find a device by napi_id * @napi_id: ID of the NAPI struct * * Search for an interface by NAPI ID. Returns %NULL if the device * is not found or a pointer to the device. The device has not had * its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ struct net_device *dev_get_by_napi_id(unsigned int napi_id) { struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); if (napi_id < MIN_NAPI_ID) return NULL; napi = napi_by_id(napi_id); return napi ? napi->dev : NULL; } EXPORT_SYMBOL(dev_get_by_napi_id); /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @ifindex: the ifindex of the interface to get the name from. 
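 *
 * A minimal sketch of a hypothetical caller:
 *
 *	char name[IFNAMSIZ];
 *	int err;
 *
 *	err = netdev_get_name(net, name, ifindex);
 *	if (err)
 *		return err;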
*/ int netdev_get_name(struct net *net, char *name, int ifindex) { struct net_device *dev; int ret; down_read(&devnet_rename_sem); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { ret = -ENODEV; goto out; } strcpy(name, dev->name); ret = 0; out: rcu_read_unlock(); up_read(&devnet_rename_sem); return ret; } /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. * The caller must hold RCU or RTNL. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * */ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *ha) { struct net_device *dev; for_each_netdev_rcu(net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; rcu_read_lock(); for_each_netdev_rcu(net, dev) if (dev->type == type) { dev_hold(dev); ret = dev; break; } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(dev_getfirstbyhwtype); /** * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside * rtnl_lock(), and result refcount is unchanged. */ struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) { struct net_device *dev, *ret; ASSERT_RTNL(); ret = NULL; for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; } } return ret; } EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device * @name: name string * * Network device names need to be valid file names to * allow sysfs to work. We also disallow any kind of * whitespace. */ bool dev_valid_name(const char *name) { if (*name == '\0') return false; if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) return false; if (!strcmp(name, ".") || !strcmp(name, "..")) return false; while (*name) { if (*name == '/' || *name == ':' || isspace(*name)) return false; name++; } return true; } EXPORT_SYMBOL(dev_valid_name); /** * __dev_alloc_name - allocate a name for a device * @net: network namespace to allocate the device name in * @name: name format string * @res: result name string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. */ static int __dev_alloc_name(struct net *net, const char *name, char *res) { int i = 0; const char *p; const int max_netdevices = 8*PAGE_SIZE; unsigned long *inuse; struct net_device *d; char buf[IFNAMSIZ]; /* Verify the string as this thing may have come from the user. * There must be one "%d" and no other "%" characters. 
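	 * e.g. "eth%d" is accepted here, while "eth", "eth%x" and "eth%d%d"
	 * are rejected by the check below.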
*/ p = strchr(name, '%'); if (!p || p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; /* Use one page as a bit array of possible slots */ inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC); if (!inuse) return -ENOMEM; for_each_netdev(net, d) { struct netdev_name_node *name_node; netdev_for_each_altname(d, name_node) { if (!sscanf(name_node->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, name_node->name, IFNAMSIZ)) __set_bit(i, inuse); } if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ)) __set_bit(i, inuse); } i = find_first_zero_bit(inuse, max_netdevices); bitmap_free(inuse); if (i == max_netdevices) return -ENFILE; /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ strscpy(buf, name, IFNAMSIZ); snprintf(res, IFNAMSIZ, buf, i); return i; } /* Returns negative errno or allocated unit id (see __dev_alloc_name()) */ static int dev_prep_valid_name(struct net *net, struct net_device *dev, const char *want_name, char *out_name, int dup_errno) { if (!dev_valid_name(want_name)) return -EINVAL; if (strchr(want_name, '%')) return __dev_alloc_name(net, want_name, out_name); if (netdev_name_in_use(net, want_name)) return -dup_errno; if (out_name != want_name) strscpy(out_name, want_name, IFNAMSIZ); return 0; } /** * dev_alloc_name - allocate a name for a device * @dev: device * @name: name format string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. */ int dev_alloc_name(struct net_device *dev, const char *name) { return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE); } EXPORT_SYMBOL(dev_alloc_name); static int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { int ret; ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST); return ret < 0 ? ret : 0; } /** * dev_change_name - change name of a device * @dev: device * @newname: name (or format string) must be at least IFNAMSIZ * * Change name of a device, can pass format strings "eth%d". * for wildcarding. */ int dev_change_name(struct net_device *dev, const char *newname) { unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; struct net *net; ASSERT_RTNL(); BUG_ON(!dev_net(dev)); net = dev_net(dev); down_write(&devnet_rename_sem); if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { up_write(&devnet_rename_sem); return 0; } memcpy(oldname, dev->name, IFNAMSIZ); err = dev_get_valid_name(net, dev, newname); if (err < 0) { up_write(&devnet_rename_sem); return err; } if (oldname[0] && !strchr(oldname, '%')) netdev_info(dev, "renamed from %s%s\n", oldname, dev->flags & IFF_UP ? 
" (while UP)" : ""); old_assign_type = dev->name_assign_type; dev->name_assign_type = NET_NAME_RENAMED; rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); dev->name_assign_type = old_assign_type; up_write(&devnet_rename_sem); return ret; } up_write(&devnet_rename_sem); netdev_adjacent_rename_links(dev, oldname); write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); write_unlock(&dev_base_lock); synchronize_rcu(); write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); if (ret) { /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; down_write(&devnet_rename_sem); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); dev->name_assign_type = old_assign_type; old_assign_type = NET_NAME_RENAMED; goto rollback; } else { netdev_err(dev, "name change rollback failed: %d\n", ret); } } return err; } /** * dev_set_alias - change ifalias of a device * @dev: device * @alias: name up to IFALIASZ * @len: limit of bytes to copy from info * * Set ifalias for a device, */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; if (len) { new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); if (!new_alias) return -ENOMEM; memcpy(new_alias->ifalias, alias, len); new_alias->ifalias[len] = 0; } mutex_lock(&ifalias_mutex); new_alias = rcu_replace_pointer(dev->ifalias, new_alias, mutex_is_locked(&ifalias_mutex)); mutex_unlock(&ifalias_mutex); if (new_alias) kfree_rcu(new_alias, rcuhead); return len; } EXPORT_SYMBOL(dev_set_alias); /** * dev_get_alias - get ifalias of a device * @dev: device * @name: buffer to store name of ifalias * @len: size of buffer * * get ifalias for a device. Caller must make sure dev cannot go * away, e.g. rcu read lock or own a reference count to device. */ int dev_get_alias(const struct net_device *dev, char *name, size_t len) { const struct dev_ifalias *alias; int ret = 0; rcu_read_lock(); alias = rcu_dereference(dev->ifalias); if (alias) ret = snprintf(name, len, "%s", alias->ifalias); rcu_read_unlock(); return ret; } /** * netdev_features_change - device changes features * @dev: device to cause notification * * Called to indicate a device has changed features. */ void netdev_features_change(struct net_device *dev) { call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); } EXPORT_SYMBOL(netdev_features_change); /** * netdev_state_change - device changes state * @dev: device to cause notification * * Called to indicate a device has changed state. This function calls * the notifier chains for netdev_chain and sends a NEWLINK message * to the routing socket. */ void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { struct netdev_notifier_change_info change_info = { .info.dev = dev, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); } } EXPORT_SYMBOL(netdev_state_change); /** * __netdev_notify_peers - notify network peers about existence of @dev, * to be called when rtnl lock is already held. * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. 
This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. */ void __netdev_notify_peers(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); } EXPORT_SYMBOL(__netdev_notify_peers); /** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. */ void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); static int napi_threaded_poll(void *data); static int napi_kthread_create(struct napi_struct *n) { int err = 0; /* Create and wake up the kthread once to put it in * TASK_INTERRUPTIBLE mode to avoid the blocked task * warning and work with loadavg. */ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", n->dev->name, n->napi_id); if (IS_ERR(n->thread)) { err = PTR_ERR(n->thread); pr_err("kthread_run failed with err %d\n", err); n->thread = NULL; } return err; } static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; ASSERT_RTNL(); dev_addr_check(dev); if (!netif_device_present(dev)) { /* may be detached because parent is runtime-suspended */ if (dev->dev.parent) pm_runtime_resume(dev->dev.parent); if (!netif_device_present(dev)) return -ENODEV; } /* Block netpoll from trying to do any rx path servicing. * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ netpoll_poll_disable(dev); ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; set_bit(__LINK_STATE_START, &dev->state); if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); netpoll_poll_enable(dev); if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { dev->flags |= IFF_UP; dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); } return ret; } /** * dev_open - prepare an interface for use. * @dev: device to open * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally * the device is moved into the up state and a %NETDEV_UP message is * sent to the netdev notifier chain. * * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. 
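 *
 * A minimal sketch of a hypothetical caller (the RTNL lock must be held
 * around the call):
 *
 *	rtnl_lock();
 *	err = dev_open(dev, NULL);
 *	rtnl_unlock();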
*/ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; ret = __dev_open(dev, extack); if (ret < 0) return ret; rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; } EXPORT_SYMBOL(dev_open); static void __dev_close_many(struct list_head *head) { struct net_device *dev; ASSERT_RTNL(); might_sleep(); list_for_each_entry(dev, head, close_list) { /* Temporarily disable netpoll until the interface is down */ netpoll_poll_disable(dev); call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); /* Synchronize to scheduled poll. We cannot touch poll list, it * can be even on different cpu. So just clear netif_running(). * * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head); list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* * Call the device specific close. This cannot fail. * Only if device is UP * * We allow it to be called even after a DETACH hot-plug * event. */ if (ops->ndo_stop) ops->ndo_stop(dev); dev->flags &= ~IFF_UP; netpoll_poll_enable(dev); } } static void __dev_close(struct net_device *dev) { LIST_HEAD(single); list_add(&dev->close_list, &single); __dev_close_many(&single); list_del(&single); } void dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; /* Remove the devices that don't need to be closed */ list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) list_del_init(&dev->close_list); __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_DOWN, dev); if (unlink) list_del_init(&dev->close_list); } } EXPORT_SYMBOL(dev_close_many); /** * dev_close - shutdown an interface. * @dev: device to shutdown * * This function moves an active device into down state. A * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier * chain. */ void dev_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); list_add(&dev->close_list, &single); dev_close_many(&single, true); list_del(&single); } } EXPORT_SYMBOL(dev_close); /** * dev_disable_lro - disable Large Receive Offload on a device * @dev: device * * Disable Large Receive Offload (LRO) on a net device. Must be * called under RTNL. This is needed if received packets may be * forwarded to another interface. */ void dev_disable_lro(struct net_device *dev) { struct net_device *lower_dev; struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); netdev_for_each_lower_dev(dev, lower_dev, iter) dev_disable_lro(lower_dev); } EXPORT_SYMBOL(dev_disable_lro); /** * dev_disable_gro_hw - disable HW Generic Receive Offload on a device * @dev: device * * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be * called under RTNL. This is needed if Generic XDP is installed on * the device. 
*/ static void dev_disable_gro_hw(struct net_device *dev) { dev->wanted_features &= ~NETIF_F_GRO_HW; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_GRO_HW)) netdev_WARN(dev, "failed to disable GRO_HW!\n"); } const char *netdev_cmd_to_name(enum netdev_cmd cmd) { #define N(val) \ case NETDEV_##val: \ return "NETDEV_" __stringify(val); switch (cmd) { N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) N(XDP_FEAT_CHANGE) } #undef N return "UNKNOWN_NETDEV_EVENT"; } EXPORT_SYMBOL_GPL(netdev_cmd_to_name); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { struct netdev_notifier_info info = { .dev = dev, }; return nb->notifier_call(nb, val, &info); } static int call_netdevice_register_notifiers(struct notifier_block *nb, struct net_device *dev) { int err; err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) return err; if (!(dev->flags & IFF_UP)) return 0; call_netdevice_notifier(nb, NETDEV_UP, dev); return 0; } static void call_netdevice_unregister_notifiers(struct notifier_block *nb, struct net_device *dev) { if (dev->flags & IFF_UP) { call_netdevice_notifier(nb, NETDEV_GOING_DOWN, dev); call_netdevice_notifier(nb, NETDEV_DOWN, dev); } call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } static int call_netdevice_register_net_notifiers(struct notifier_block *nb, struct net *net) { struct net_device *dev; int err; for_each_netdev(net, dev) { err = call_netdevice_register_notifiers(nb, dev); if (err) goto rollback; } return 0; rollback: for_each_netdev_continue_reverse(net, dev) call_netdevice_unregister_notifiers(nb, dev); return err; } static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, struct net *net) { struct net_device *dev; for_each_netdev(net, dev) call_netdevice_unregister_notifiers(nb, dev); } static int dev_boot_phase = 1; /** * register_netdevice_notifier - register a network notifier block * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. 
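 *
 * A minimal sketch of a hypothetical user (names are illustrative only):
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	err = register_netdevice_notifier(&my_nb);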
*/ int register_netdevice_notifier(struct notifier_block *nb) { struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { err = call_netdevice_register_net_notifiers(nb, net); if (err) goto rollback; } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; rollback: for_each_net_continue_reverse(net) call_netdevice_unregister_net_notifiers(nb, net); raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } EXPORT_SYMBOL(register_netdevice_notifier); /** * unregister_netdevice_notifier - unregister a network notifier block * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier(). The notifier is unlinked into the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. */ int unregister_netdevice_notifier(struct notifier_block *nb) { struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) goto unlock; for_each_net(net) call_netdevice_unregister_net_notifiers(nb, net); unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier); static int __register_netdevice_notifier_net(struct net *net, struct notifier_block *nb, bool ignore_call_fail) { int err; err = raw_notifier_chain_register(&net->netdev_chain, nb); if (err) return err; if (dev_boot_phase) return 0; err = call_netdevice_register_net_notifiers(nb, net); if (err && !ignore_call_fail) goto chain_unregister; return 0; chain_unregister: raw_notifier_chain_unregister(&net->netdev_chain, nb); return err; } static int __unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; err = raw_notifier_chain_unregister(&net->netdev_chain, nb); if (err) return err; call_netdevice_unregister_net_notifiers(nb, net); return 0; } /** * register_netdevice_notifier_net - register a per-netns network notifier block * @net: network namespace * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; rtnl_lock(); err = __register_netdevice_notifier_net(net, nb, false); rtnl_unlock(); return err; } EXPORT_SYMBOL(register_netdevice_notifier_net); /** * unregister_netdevice_notifier_net - unregister a per-netns * network notifier block * @net: network namespace * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier_net(). The notifier is unlinked from the * kernel structures and may then be reused. A negative errno code * is returned on a failure. 
* * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. */ int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; rtnl_lock(); err = __unregister_netdevice_notifier_net(net, nb); rtnl_unlock(); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_net); static void __move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, struct notifier_block *nb) { __unregister_netdevice_notifier_net(src_net, nb); __register_netdevice_notifier_net(dst_net, nb, true); } int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { int err; rtnl_lock(); err = __register_netdevice_notifier_net(dev_net(dev), nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } rtnl_unlock(); return err; } EXPORT_SYMBOL(register_netdevice_notifier_dev_net); int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { int err; rtnl_lock(); list_del(&nn->list); err = __unregister_netdevice_notifier_net(dev_net(dev), nb); rtnl_unlock(); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); static void move_netdevice_notifiers_dev_net(struct net_device *dev, struct net *net) { struct netdev_net_notifier *nn; list_for_each_entry(nn, &dev->net_notifier_list, list) __move_netdevice_notifier_net(dev_net(dev), net, nn->nb); } /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); int ret; ASSERT_RTNL(); /* Run per-netns notifier block chain first, then run the global one. * Hopefully, one day, the global one is going to be removed after * all notifier block registrators get converted to be per-netns. */ ret = raw_notifier_call_chain(&net->netdev_chain, val, info); if (ret & NOTIFY_STOP_MASK) return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } /** * call_netdevice_notifiers_info_robust - call per-netns notifier blocks * for and rollback on error * @val_up: value passed unmodified to notifier function * @val_down: value passed unmodified to the notifier function when * recovering from an error on @val_up * @info: notifier information data * * Call all per-netns network notifier blocks, but not notifier blocks on * the global notifier chain. Parameters and return value are as for * raw_notifier_call_chain_robust(). 
*/ static int call_netdevice_notifiers_info_robust(unsigned long val_up, unsigned long val_down, struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); ASSERT_RTNL(); return raw_notifier_call_chain_robust(&net->netdev_chain, val_up, val_down, info); } static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_notifier_info info = { .dev = dev, .extack = extack, }; return call_netdevice_notifiers_info(val, &info); } /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); /** * call_netdevice_notifiers_mtu - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * @arg: additional u32 argument passed to the notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ static int call_netdevice_notifiers_mtu(unsigned long val, struct net_device *dev, u32 arg) { struct netdev_notifier_info_ext info = { .info.dev = dev, .ext.mtu = arg, }; BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); return call_netdevice_notifiers_info(val, &info.info); } #ifdef CONFIG_NET_INGRESS static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); void net_inc_ingress_queue(void) { static_branch_inc(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_ingress_queue); void net_dec_ingress_queue(void) { static_branch_dec(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif #ifdef CONFIG_NET_EGRESS static DEFINE_STATIC_KEY_FALSE(egress_needed_key); void net_inc_egress_queue(void) { static_branch_inc(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_egress_queue); void net_dec_egress_queue(void) { static_branch_dec(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; static void netstamp_clear(struct work_struct *work) { int deferred = atomic_xchg(&netstamp_needed_deferred, 0); int wanted; wanted = atomic_add_return(deferred, &netstamp_wanted); if (wanted > 0) static_branch_enable(&netstamp_needed_key); else static_branch_disable(&netstamp_needed_key); } static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted = atomic_read(&netstamp_wanted); while (wanted > 0) { if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1)) return; } atomic_inc(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_inc(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted = atomic_read(&netstamp_wanted); while (wanted > 1) { if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1)) return; } atomic_dec(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_dec(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_disable_timestamp); static inline void 
net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; skb->mono_delivery_time = 0; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); } #define net_timestamp_check(COND, SKB) \ if (static_branch_unlikely(&netstamp_needed_key)) { \ if ((COND) && !(SKB)->tstamp) \ (SKB)->tstamp = ktime_get_real(); \ } \ bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { return __is_skb_forwardable(dev, skb, true); } EXPORT_SYMBOL_GPL(is_skb_forwardable); static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, bool check_mtu) { int ret = ____dev_forward_skb(dev, skb, check_mtu); if (likely(!ret)) { skb->protocol = eth_type_trans(skb, dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } return ret; } int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb2(dev, skb, true); } EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif * * @dev: destination network device * @skb: buffer to forward * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped, but freed) * * dev_forward_skb can be used for injecting an skb from the * start_xmit function of one device into the receive queue * of another device. * * The receiving device may be in another namespace, so * we have to clear all information in the skb that could * impact namespace isolation. */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); } static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } static inline void deliver_ptype_list_skb(struct sk_buff *skb, struct packet_type **pt, struct net_device *orig_dev, __be16 type, struct list_head *ptype_list) { struct packet_type *ptype, *pt_prev = *pt; list_for_each_entry_rcu(ptype, ptype_list, list) { if (ptype->type != type) continue; if (pt_prev) deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } *pt = pt_prev; } static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) return false; if (ptype->id_match) return ptype->id_match(ptype, skb->sk); else if ((struct sock *)ptype->af_packet_priv == skb->sk) return true; return false; } /** * dev_nit_active - return true if any network interface taps are in use * * @dev: network device to check for the presence of taps */ bool dev_nit_active(struct net_device *dev) { return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); } EXPORT_SYMBOL_GPL(dev_nit_active); /* * Support routine. Sends outgoing frames to any network * taps currently in use. 
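 *
 * A "tap" here is any packet_type registered for ETH_P_ALL, globally or
 * (with .dev set) bound to a single device. Minimal sketch, assuming a
 * hypothetical handler not defined in this file; outgoing clones reach it
 * with skb->pkt_type set to PACKET_OUTGOING:
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt,
 *			      struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return 0;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_ALL),
 *		.func	= my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_tap);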
*/ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; struct sk_buff *skb2 = NULL; struct packet_type *pt_prev = NULL; struct list_head *ptype_list = &ptype_all; rcu_read_lock(); again: list_for_each_entry_rcu(ptype, ptype_list, list) { if (ptype->ignore_outgoing) continue; /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ if (skb_loop_sk(ptype, skb)) continue; if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; continue; } /* need to clone skb, done only once */ skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) goto out_unlock; net_timestamp_set(skb2); /* skb->nh should be correctly * set by sender, so that the second statement is * just protection against buggy protocols. */ skb_reset_mac_header(skb2); if (skb_network_header(skb2) < skb2->data || skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ntohs(skb2->protocol), dev->name); skb_reset_network_header(skb2); } skb2->transport_header = skb2->network_header; skb2->pkt_type = PACKET_OUTGOING; pt_prev = ptype; } if (ptype_list == &ptype_all) { ptype_list = &dev->ptype_all; goto again; } out_unlock: if (pt_prev) { if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); else kfree_skb(skb2); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); /** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change * @dev: Network device * @txq: number of queues available * * If real_num_tx_queues is changed the tc mappings may no longer be * valid. To resolve this, verify the tc mapping remains valid and, if * not, NULL the mapping. With no priorities mapping to this * offset/count pair it will no longer be used. In the worst case TC0 * is invalid and nothing can be done, so disable priority mappings. It is * expected that drivers will fix this mapping if they can before * calling netif_set_real_num_tx_queues. */ static void netif_setup_tc(struct net_device *dev, unsigned int txq) { int i; struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; /* If TC0 is invalidated disable TC mapping */ if (tc->offset + tc->count > txq) { netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); dev->num_tc = 0; return; } /* Invalidated prio to tc mappings set to TC0 */ for (i = 1; i < TC_BITMASK + 1; i++) { int q = netdev_get_prio_tc_map(dev, i); tc = &dev->tc_to_txq[q]; if (tc->offset + tc->count > txq) { netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. 
Setting map to 0\n", i, q); netdev_set_prio_tc_map(dev, i, 0); } } } int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) { if (dev->num_tc) { struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i; /* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i; } /* didn't find it, just return -1 to indicate no match */ return -1; } return 0; } EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS static struct static_key xps_needed __read_mostly; static struct static_key xps_rxqs_needed __read_mostly; static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_dev_maps *old_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; for (pos = map->len; pos--;) { if (map->queues[pos] != index) continue; if (map->len > 1) { map->queues[pos] = map->queues[--map->len]; break; } if (old_maps) RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; } return true; } static bool remove_xps_queue_cpu(struct net_device *dev, struct xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { int num_tc = dev_maps->num_tc; bool active = false; int tci; for (tci = cpu * num_tc; num_tc--; tci++) { int i, j; for (i = count, j = offset; i--; j++) { if (!remove_xps_queue(dev_maps, NULL, tci, j)) break; } active |= i < 0; } return active; } static void reset_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, enum xps_map_type type) { static_key_slow_dec_cpuslocked(&xps_needed); if (type == XPS_RXQS) static_key_slow_dec_cpuslocked(&xps_rxqs_needed); RCU_INIT_POINTER(dev->xps_maps[type], NULL); kfree_rcu(dev_maps, rcu); } static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, u16 offset, u16 count) { struct xps_dev_maps *dev_maps; bool active = false; int i, j; dev_maps = xmap_dereference(dev->xps_maps[type]); if (!dev_maps) return; for (j = 0; j < dev_maps->nr_ids; j++) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) reset_xps_maps(dev, dev_maps, type); if (type == XPS_CPUS) { for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } } static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { if (!static_key_false(&xps_needed)) return; cpus_read_lock(); mutex_lock(&xps_map_mutex); if (static_key_false(&xps_rxqs_needed)) clean_xps_maps(dev, XPS_RXQS, offset, count); clean_xps_maps(dev, XPS_CPUS, offset, count); mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) { netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); } static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; int i, pos; for (pos = 0; map && pos < map->len; pos++) { if (map->queues[pos] != index) continue; return map; } /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; alloc_len = map->alloc_len * 2; } /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's * map */ if (is_rxqs_map) new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); else 
new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, cpu_to_node(attr_index)); if (!new_map) return NULL; for (i = 0; i < pos; i++) new_map->queues[i] = map->queues[i]; new_map->alloc_len = alloc_len; new_map->len = pos; return new_map; } /* Copy xps maps at a given index */ static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, struct xps_dev_maps *new_dev_maps, int index, int tc, bool skip_tc) { int i, tci = index * dev_maps->num_tc; struct xps_map *map; /* copy maps belonging to foreign traffic classes */ for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc && skip_tc) continue; /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } } /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) { struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; const unsigned long *online_mask = NULL; bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; unsigned int nr_ids; WARN_ON_ONCE(index >= dev->num_tx_queues); if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; if (num_tc < 0) return -EINVAL; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; } mutex_lock(&xps_map_mutex); dev_maps = xmap_dereference(dev->xps_maps[type]); if (type == XPS_RXQS) { maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); nr_ids = nr_cpu_ids; } if (maps_sz < L1_CACHE_BYTES) maps_sz = L1_CACHE_BYTES; /* The old dev_maps could be larger or smaller than the one we're * setting up now, as dev->num_tc or nr_ids could have been updated in * between. We could try to be smart, but let's be safe instead and only * copy foreign traffic classes if the two map sizes match. */ if (dev_maps && dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) copy = true; /* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), j < nr_ids;) { if (!new_dev_maps) { new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { mutex_unlock(&xps_map_mutex); return -ENOMEM; } new_dev_maps->nr_ids = nr_ids; new_dev_maps->num_tc = num_tc; } tci = j * num_tc + tc; map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; if (!dev_maps) { /* Increment static keys at most once per type */ static_key_slow_inc_cpuslocked(&xps_needed); if (type == XPS_RXQS) static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } for (j = 0; j < nr_ids; j++) { bool skip_tc = false; tci = j * num_tc + tc; if (netif_attr_test_mask(j, mask, nr_ids) && netif_attr_test_online(j, online_mask, nr_ids)) { /* add tx-queue to CPU/rx-queue maps */ int pos = 0; skip_tc = true; map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA if (type == XPS_CPUS) { if (numa_node_id == -2) numa_node_id = cpu_to_node(j); else if (numa_node_id != cpu_to_node(j)) numa_node_id = -1; } #endif } if (copy) xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, skip_tc); } rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; for (j = 0; j < dev_maps->nr_ids; j++) { for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) continue; if (copy) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); if (map == new_map) continue; } RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); } } old_dev_maps = dev_maps; out_no_old_maps: dev_maps = new_dev_maps; active = true; out_no_new_maps: if (type == XPS_CPUS) /* update Tx queue numa node */ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); if (!dev_maps) goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ for (j = 0; j < dev_maps->nr_ids; j++) { tci = j * dev_maps->num_tc; for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc && netif_attr_test_mask(j, mask, dev_maps->nr_ids) && netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) continue; active |= remove_xps_queue(dev_maps, copy ? old_dev_maps : NULL, tci, index); } } if (old_dev_maps) kfree_rcu(old_dev_maps, rcu); /* free map if not active */ if (!active) reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); return 0; error: /* remove any maps that we added */ for (j = 0; j < nr_ids; j++) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) kfree(new_map); } } mutex_unlock(&xps_map_mutex); kfree(new_dev_maps); return -ENOMEM; } EXPORT_SYMBOL_GPL(__netif_set_xps_queue); int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { int ret; cpus_read_lock(); ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); cpus_read_unlock(); return ret; } EXPORT_SYMBOL(netif_set_xps_queue); #endif static void netdev_unbind_all_sb_channels(struct net_device *dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; /* Unbind any subordinate channels */ while (txq-- != &dev->_tx[0]) { if (txq->sb_dev) netdev_unbind_sb_channel(dev, txq->sb_dev); } } void netdev_reset_tc(struct net_device *dev) { #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); /* Reset TC configuration of device */ dev->num_tc = 0; memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); } EXPORT_SYMBOL(netdev_reset_tc); int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) { if (tc >= dev->num_tc) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues(dev, offset, count); #endif dev->tc_to_txq[tc].count = count; dev->tc_to_txq[tc].offset = offset; return 0; } EXPORT_SYMBOL(netdev_set_tc_queue); int netdev_set_num_tc(struct net_device *dev, u8 num_tc) { if (num_tc > TC_MAX_QUEUE) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); dev->num_tc = num_tc; return 0; } EXPORT_SYMBOL(netdev_set_num_tc); void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(sb_dev, 0); #endif memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); while (txq-- != &dev->_tx[0]) { if (txq->sb_dev == sb_dev) txq->sb_dev = NULL; } } EXPORT_SYMBOL(netdev_unbind_sb_channel); int netdev_bind_sb_channel_queue(struct net_device *dev, struct net_device *sb_dev, u8 tc, u16 count, u16 offset) { /* Make certain the sb_dev and dev are already configured */ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) return -EINVAL; /* We cannot hand out queues we don't have */ if ((offset + count) > dev->real_num_tx_queues) return -EINVAL; /* Record the mapping */ sb_dev->tc_to_txq[tc].count = count; sb_dev->tc_to_txq[tc].offset = offset; /* Provide a way for Tx queue to find the tc_to_txq map or * XPS map for itself. */ while (count--) netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; return 0; } EXPORT_SYMBOL(netdev_bind_sb_channel_queue); int netdev_set_sb_channel(struct net_device *dev, u16 channel) { /* Do not use a multiqueue device to represent a subordinate channel */ if (netif_is_multiqueue(dev)) return -ENODEV; /* We allow channels 1 - 32767 to be used for subordinate channels. * Channel 0 is meant to be "native" mode and used only to represent * the main root device. We allow writing 0 to reset the device back * to normal mode after being used as a subordinate channel. */ if (channel > S16_MAX) return -EINVAL; dev->num_tc = -channel; return 0; } EXPORT_SYMBOL(netdev_set_sb_channel); /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. 
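 *
 * Driver-side sketch (hypothetical names; new_txq is whatever queue count
 * the driver just provisioned). After registration the caller must hold
 * RTNL, which ethtool/ndo callbacks already do:
 *
 *	err = netif_set_real_num_tx_queues(netdev, new_txq);
 *	if (err)
 *		return err;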
*/ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { bool disabling; int rc; disabling = txq < dev->real_num_tx_queues; if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); if (rc) return rc; if (dev->num_tc) netif_setup_tc(dev, txq); dev_qdisc_change_real_num_tx(dev, txq); dev->real_num_tx_queues = txq; if (disabling) { synchronize_net(); qdisc_reset_all_tx_gt(dev, txq); #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, txq); #endif } } else { dev->real_num_tx_queues = txq; } return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); #ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device * @rxq: Actual number of RX queues * * This must be called either with the rtnl_lock held or before * registration of the net device. Returns 0 on success, or a * negative error code. If called before registration, it always * succeeds. */ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { int rc; if (rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); if (rc) return rc; } dev->real_num_rx_queues = rxq; return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif /** * netif_set_real_num_queues - set actual number of RX and TX queues used * @dev: Network device * @txq: Actual number of TX queues * @rxq: Actual number of RX queues * * Set the real number of both TX and RX queues. * Does nothing if the number of queues is already correct. */ int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq) { unsigned int old_rxq = dev->real_num_rx_queues; int err; if (txq < 1 || txq > dev->num_tx_queues || rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; /* Start from increases, so the error path only does decreases - * decreases can't fail. */ if (rxq > dev->real_num_rx_queues) { err = netif_set_real_num_rx_queues(dev, rxq); if (err) return err; } if (txq > dev->real_num_tx_queues) { err = netif_set_real_num_tx_queues(dev, txq); if (err) goto undo_rx; } if (rxq < dev->real_num_rx_queues) WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); if (txq < dev->real_num_tx_queues) WARN_ON(netif_set_real_num_tx_queues(dev, txq)); return 0; undo_rx: WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); return err; } EXPORT_SYMBOL(netif_set_real_num_queues); /** * netif_set_tso_max_size() - set the max size of TSO frames supported * @dev: netdev to update * @size: max skb->len of a TSO frame * * Set the limit on the size of TSO super-frames the device can handle. * Unless explicitly set the stack will assume the value of * %GSO_LEGACY_MAX_SIZE. */ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) { dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); if (size < READ_ONCE(dev->gso_ipv4_max_size)) netif_set_gso_ipv4_max_size(dev, size); } EXPORT_SYMBOL(netif_set_tso_max_size); /** * netif_set_tso_max_segs() - set the max number of segs supported for TSO * @dev: netdev to update * @segs: max number of TCP segments * * Set the limit on the number of TCP segments the device can generate from * a single TSO super-frame. * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. 
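 *
 * Illustrative probe-time sketch (the limits below are hypothetical
 * hardware caps, not recommended values):
 *
 *	netif_set_tso_max_size(netdev, 16 * 1024);
 *	netif_set_tso_max_segs(netdev, 64);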
*/ void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) { dev->tso_max_segs = segs; if (segs < READ_ONCE(dev->gso_max_segs)) netif_set_gso_max_segs(dev, segs); } EXPORT_SYMBOL(netif_set_tso_max_segs); /** * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper * @to: netdev to update * @from: netdev from which to copy the limits */ void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) { netif_set_tso_max_size(to, from->tso_max_size); netif_set_tso_max_segs(to, from->tso_max_segs); } EXPORT_SYMBOL(netif_inherit_tso_max); /** * netif_get_num_default_rss_queues - default number of RSS queues * * Default value is the number of physical cores if there are only 1 or 2, or * divided by 2 if there are more. */ int netif_get_num_default_rss_queues(void) { cpumask_var_t cpus; int cpu, count = 0; if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) return 1; cpumask_copy(cpus, cpu_online_mask); for_each_cpu(cpu, cpus) { ++count; cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); } free_cpumask_var(cpus); return count > 2 ? DIV_ROUND_UP(count, 2) : count; } EXPORT_SYMBOL(netif_get_num_default_rss_queues); static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } void __netif_schedule(struct Qdisc *q) { if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } EXPORT_SYMBOL(__netif_schedule); struct dev_kfree_skb_cb { enum skb_drop_reason reason; }; static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) { return (struct dev_kfree_skb_cb *)skb->cb; } void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); if (!netif_xmit_stopped(txq)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); } rcu_read_unlock(); } EXPORT_SYMBOL(netif_schedule_queue); void netif_tx_wake_queue(struct netdev_queue *dev_queue) { if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { struct Qdisc *q; rcu_read_lock(); q = rcu_dereference(dev_queue->qdisc); __netif_schedule(q); rcu_read_unlock(); } } EXPORT_SYMBOL(netif_tx_wake_queue); void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason) { unsigned long flags; if (unlikely(!skb)) return; if (likely(refcount_read(&skb->users) == 1)) { smp_rmb(); refcount_set(&skb->users, 0); } else if (likely(!refcount_dec_and_test(&skb->users))) { return; } get_kfree_skb_cb(skb)->reason = reason; local_irq_save(flags); skb->next = __this_cpu_read(softnet_data.completion_queue); __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(dev_kfree_skb_irq_reason); void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason) { if (in_hardirq() || irqs_disabled()) dev_kfree_skb_irq_reason(skb, reason); else kfree_skb_reason(skb, reason); } EXPORT_SYMBOL(dev_kfree_skb_any_reason); /** * netif_device_detach - mark device as removed * @dev: network device * * Mark device as removed from system and therefore no longer available. 
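 *
 * Typically paired with netif_device_attach() across suspend/resume.
 * Minimal sketch with hypothetical driver callbacks:
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		struct net_device *netdev = dev_get_drvdata(d);
 *
 *		netif_device_detach(netdev);
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		struct net_device *netdev = dev_get_drvdata(d);
 *
 *		netif_device_attach(netdev);
 *		return 0;
 *	}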
*/ void netif_device_detach(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_stop_all_queues(dev); } } EXPORT_SYMBOL(netif_device_detach); /** * netif_device_attach - mark device as attached * @dev: network device * * Mark device as attached from system and restart if needed. */ void netif_device_attach(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); __netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); /* * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. */ static u16 skb_tx_hash(const struct net_device *dev, const struct net_device *sb_dev, struct sk_buff *skb) { u32 hash; u16 qoffset = 0; u16 qcount = dev->real_num_tx_queues; if (dev->num_tc) { u8 tc = netdev_get_prio_tc_map(dev, skb->priority); qoffset = sb_dev->tc_to_txq[tc].offset; qcount = sb_dev->tc_to_txq[tc].count; if (unlikely(!qcount)) { net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", sb_dev->name, qoffset, tc); qoffset = 0; qcount = dev->real_num_tx_queues; } } if (skb_rx_queue_recorded(skb)) { DEBUG_NET_WARN_ON_ONCE(qcount == 0); hash = skb_get_rx_queue(skb); if (hash >= qoffset) hash -= qoffset; while (unlikely(hash >= qcount)) hash -= qcount; return hash + qoffset; } return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features; struct net_device *dev = skb->dev; const char *name = ""; if (!net_ratelimit()) return; if (dev) { if (dev->dev.parent) name = dev_driver_string(dev->dev.parent); else name = netdev_name(dev); } skb_dump(KERN_WARNING, skb, false); WARN(1, "%s: caps=(%pNF, %pNF)\n", name, dev ? &dev->features : &null_features, skb->sk ? &skb->sk->sk_route_caps : &null_features); } /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. */ int skb_checksum_help(struct sk_buff *skb) { __wsum csum; int ret = 0, offset; if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. 
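 * (__skb_linearize() below pulls the paged data into the skb head, so the
 * checksum is then computed over a private, stable copy.)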
*/ if (skb_has_shared_frag(skb)) { ret = __skb_linearize(skb); if (ret) goto out; } offset = skb_checksum_start_offset(skb); ret = -EINVAL; if (unlikely(offset >= skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n", offset, skb_headlen(skb)); goto out; } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n", offset + sizeof(__sum16), skb_headlen(skb)); goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); if (ret) goto out; *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: return ret; } EXPORT_SYMBOL(skb_checksum_help); int skb_crc32c_csum_help(struct sk_buff *skb) { __le32 crc32c_csum; int ret = 0, offset, start; if (skb->ip_summed != CHECKSUM_PARTIAL) goto out; if (unlikely(skb_is_gso(skb))) goto out; /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. */ if (unlikely(skb_has_shared_frag(skb))) { ret = __skb_linearize(skb); if (ret) goto out; } start = skb_checksum_start_offset(skb); offset = start + offsetof(struct sctphdr, checksum); if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ret = -EINVAL; goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__le32)); if (ret) goto out; crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, skb->len - start, ~(__u32)0, crc32c_csum_stub)); *(__le32 *)(skb->data + offset) = crc32c_csum; skb_reset_csum_not_inet(skb); out: return ret; } __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; /* Tunnel gso handlers can set protocol to ethernet. */ if (type == htons(ETH_P_TEB)) { struct ethhdr *eth; if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) return 0; eth = (struct ethhdr *)skb->data; type = eth->h_proto; } return vlan_get_protocol_and_depth(skb, type, depth); } /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { netdev_err(dev, "hw csum failure\n"); skb_dump(KERN_ERR, skb, true); dump_stack(); } void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); } EXPORT_SYMBOL(netdev_rx_csum_fault); #endif /* XXX: check that highmem exists at all on the given machine. */ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; if (PageHighMem(skb_frag_page(frag))) return 1; } } #endif return 0; } /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. 
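 * (dev->mpls_features is the subset of offloads the device also supports
 * for MPLS-encapsulated frames; for any other protocol the feature set
 * passed in is returned unchanged.)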
*/ #if IS_ENABLED(CONFIG_NET_MPLS_GSO) static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; } #else static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { return features; } #endif static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t features) { __be16 type; type = skb_network_protocol(skb, NULL); features = net_mpls_features(skb, features, type); if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } if (illegal_highdma(skb->dev, skb)) features &= ~NETIF_F_SG; return features; } netdev_features_t passthru_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return features; } EXPORT_SYMBOL(passthru_features_check); static netdev_features_t dflt_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return vlan_features_check(skb, features); } static netdev_features_t gso_features_check(const struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { u16 gso_segs = skb_shinfo(skb)->gso_segs; if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size))) return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { skb_warn_bad_offload(skb); return features & ~NETIF_F_GSO_MASK; } /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now * and we can pull them back in after we have partially * segmented the frame. */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; /* Make sure to clear the IPv4 ID mangling feature if the * IPv4 header has the potential to be fragmented. */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? 
inner_ip_hdr(skb) : ip_hdr(skb); if (!(iph->frag_off & htons(IP_DF))) features &= ~NETIF_F_TSO_MANGLEID; } return features; } netdev_features_t netif_skb_features(struct sk_buff *skb) { struct net_device *dev = skb->dev; netdev_features_t features = dev->features; if (skb_is_gso(skb)) features = gso_features_check(skb, dev, features); /* If encapsulation offload request, verify we are testing * hardware encapsulation features instead of standard * features for the netdev */ if (skb->encapsulation) features &= dev->hw_enc_features; if (skb_vlan_tagged(skb)) features = netdev_intersect_features(features, dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (dev->netdev_ops->ndo_features_check) features &= dev->netdev_ops->ndo_features_check(skb, dev, features); else features &= dflt_features_check(skb, dev, features); return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); static int xmit_one(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { unsigned int len; int rc; if (dev_nit_active(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; trace_net_dev_start_xmit(skb, dev); rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); return rc; } struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, struct netdev_queue *txq, int *ret) { struct sk_buff *skb = first; int rc = NETDEV_TX_OK; while (skb) { struct sk_buff *next = skb->next; skb_mark_not_on_list(skb); rc = xmit_one(skb, dev, txq, next != NULL); if (unlikely(!dev_xmit_complete(rc))) { skb->next = next; goto out; } skb = next; if (netif_tx_queue_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } } out: *ret = rc; return skb; } static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) skb = __vlan_hwaccel_push_inside(skb); return skb; } int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features) { if (unlikely(skb_csum_is_sctp(skb))) return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb); if (features & NETIF_F_HW_CSUM) return 0; if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): return 0; } } return skb_checksum_help(skb); } EXPORT_SYMBOL(skb_csum_hwoffload_help); static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) goto out_null; skb = sk_validate_xmit_skb(skb, dev); if (unlikely(!skb)) goto out_null; if (netif_needs_gso(skb, features)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); if (IS_ERR(segs)) { goto out_kfree_skb; } else if (segs) { consume_skb(skb); skb = segs; } } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. 
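 * ("Not checksummed" means ip_summed == CHECKSUM_PARTIAL, i.e. the stack
 *  deferred the work; skb_csum_hwoffload_help() keeps the offload when the
 *  device can do it and otherwise falls back to skb_checksum_help() or
 *  skb_crc32c_csum_help().)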
*/ if (skb->ip_summed == CHECKSUM_PARTIAL) { if (skb->encapsulation) skb_set_inner_transport_header(skb, skb_checksum_start_offset(skb)); else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); if (skb_csum_hwoffload_help(skb, features)) goto out_kfree_skb; } } skb = validate_xmit_xfrm(skb, features, again); return skb; out_kfree_skb: kfree_skb(skb); out_null: dev_core_stats_tx_dropped_inc(dev); return NULL; } struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) { struct sk_buff *next, *head = NULL, *tail; for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); /* in case skb wont be segmented, point to itself */ skb->prev = skb; skb = validate_xmit_skb(skb, dev, again); if (!skb) continue; if (!head) head = skb; else tail->next = skb; /* If skb was segmented, skb->prev points to * the last segment. If not, it still contains skb. */ tail = skb->prev; } return head; } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); static void qdisc_pkt_len_init(struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ if (shinfo->gso_size && skb_transport_header_was_set(skb)) { u16 gso_segs = shinfo->gso_segs; unsigned int hdr_len; /* mac layer + network layer */ hdr_len = skb_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { const struct tcphdr *th; struct tcphdr _tcphdr; th = skb_header_pointer(skb, hdr_len, sizeof(_tcphdr), &_tcphdr); if (likely(th)) hdr_len += __tcp_hdrlen(th); } else { struct udphdr _udphdr; if (skb_header_pointer(skb, hdr_len, sizeof(_udphdr), &_udphdr)) hdr_len += sizeof(struct udphdr); } if (shinfo->gso_type & SKB_GSO_DODGY) gso_segs = DIV_ROUND_UP(skb->len - hdr_len, shinfo->gso_size); qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } } static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, struct sk_buff **to_free, struct netdev_queue *txq) { int rc; rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; if (rc == NET_XMIT_SUCCESS) trace_qdisc_enqueue(q, txq, skb); return rc; } static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); struct sk_buff *to_free = NULL; bool contended; int rc; qdisc_calculate_pkt_len(skb, q); tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP); if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && qdisc_run_begin(q)) { /* Retest nolock_qdisc_is_empty() within the protection * of q->seqlock to protect from racing with requeuing. */ if (unlikely(!nolock_qdisc_is_empty(q))) { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); qdisc_run_end(q); goto no_lock_out; } qdisc_bstats_cpu_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && !nolock_qdisc_is_empty(q)) __qdisc_run(q); qdisc_run_end(q); return NET_XMIT_SUCCESS; } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); qdisc_run(q); no_lock_out: if (unlikely(to_free)) kfree_skb_list_reason(to_free, tcf_get_drop_reason(to_free)); return rc; } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. 
* On PREEMPT_RT it is possible to preempt the qdisc owner during xmit * and then other tasks will only enqueue packets. The packets will be * sent after the qdisc owner is scheduled again. To prevent this * scenario the task always serialize on the lock. */ contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); if (unlikely(contended)) spin_lock(&q->busylock); spin_lock(root_lock); if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ qdisc_bstats_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; } __qdisc_run(q); } qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; } __qdisc_run(q); qdisc_run_end(q); } } spin_unlock(root_lock); if (unlikely(to_free)) kfree_skb_list_reason(to_free, tcf_get_drop_reason(to_free)); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; } #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { const struct netprio_map *map; const struct sock *sk; unsigned int prioidx; if (skb->priority) return; map = rcu_dereference_bh(skb->dev->priomap); if (!map) return; sk = skb_to_full_sk(skb); if (!sk) return; prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; } #else #define skb_update_prio(skb) #endif /** * dev_loopback_xmit - loop back @skb * @net: network namespace this loopback is happening in * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; if (skb->ip_summed == CHECKSUM_NONE) skb->ip_summed = CHECKSUM_UNNECESSARY; DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); skb_dst_force(skb); netif_rx(skb); return 0; } EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS static struct netdev_queue * netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) { int qm = skb_get_queue_mapping(skb); return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); } static bool netdev_xmit_txqueue_skipped(void) { return __this_cpu_read(softnet_data.xmit.skip_txqueue); } void netdev_xmit_skip_txqueue(bool skip) { __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_NET_XGRESS static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, enum skb_drop_reason *drop_reason) { int ret = TC_ACT_UNSPEC; #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); struct tcf_result res; if (!miniq) return ret; tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); /* Only tcf related quirks below. 
*/ switch (ret) { case TC_ACT_SHOT: *drop_reason = tcf_get_drop_reason(skb); mini_qdisc_qstats_cpu_drop(miniq); break; case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(res.classid); break; } #endif /* CONFIG_NET_CLS_ACT */ return ret; } static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); void tcx_inc(void) { static_branch_inc(&tcx_needed_key); } void tcx_dec(void) { static_branch_dec(&tcx_needed_key); } static __always_inline enum tcx_action_base tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, const bool needs_mac) { const struct bpf_mprog_fp *fp; const struct bpf_prog *prog; int ret = TCX_NEXT; if (needs_mac) __skb_push(skb, skb->mac_len); bpf_mprog_foreach_prog(entry, fp, prog) { bpf_compute_data_pointers(skb); ret = bpf_prog_run(prog, skb); if (ret != TCX_NEXT) break; } if (needs_mac) __skb_pull(skb, skb->mac_len); return tcx_action_code(skb, ret); } static __always_inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; int sch_ret; if (!entry) return skb; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } qdisc_skb_cb(skb)->pkt_len = skb->len; tcx_set_ingress(skb, true); if (static_branch_unlikely(&tcx_needed_key)) { sch_ret = tcx_run(entry, skb, true); if (sch_ret != TC_ACT_UNSPEC) goto ingress_verdict; } sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); ingress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* skb_mac_header check was done by BPF, so we can safely * push the L2 header back before redirecting to another * netdev. */ __skb_push(skb, skb->mac_len); if (skb_do_redirect(skb) == -EAGAIN) { __skb_pull(skb, skb->mac_len); *another = true; break; } *ret = NET_RX_SUCCESS; return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_RX_DROP; return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); fallthrough; case TC_ACT_CONSUMED: *ret = NET_RX_SUCCESS; return NULL; } return skb; } static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; int sch_ret; if (!entry) return skb; /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was * already set by the caller. */ if (static_branch_unlikely(&tcx_needed_key)) { sch_ret = tcx_run(entry, skb, false); if (sch_ret != TC_ACT_UNSPEC) goto egress_verdict; } sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); egress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! 
*/ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_XMIT_DROP; return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); fallthrough; case TC_ACT_CONSUMED: *ret = NET_XMIT_SUCCESS; return NULL; } return skb; } #else static __always_inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { return skb; } static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { return skb; } #endif /* CONFIG_NET_XGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) { int tc = netdev_get_prio_tc_map(dev, skb->priority); struct xps_map *map; int queue_index = -1; if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) return queue_index; tci *= dev_maps->num_tc; tci += tc; map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { if (map->len == 1) queue_index = map->queues[0]; else queue_index = map->queues[reciprocal_scale( skb_get_hash(skb), map->len)]; if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } return queue_index; } #endif static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; struct sock *sk = skb->sk; int queue_index = -1; if (!static_key_false(&xps_needed)) return -1; rcu_read_lock(); if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); if (dev_maps) { int tci = sk_rx_queue_get(sk); if (tci >= 0) queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } get_cpus_map: if (queue_index < 0) { dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } } rcu_read_unlock(); return queue_index; #else return -1; #endif } u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { return 0; } EXPORT_SYMBOL(dev_pick_tx_zero); u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; } EXPORT_SYMBOL(dev_pick_tx_cpu_id); u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); sb_dev = sb_dev ? 
: dev; if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { int new_index = get_xps_queue(dev, sb_dev, skb); if (new_index < 0) new_index = skb_tx_hash(dev, sb_dev, skb); if (queue_index != new_index && sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); queue_index = new_index; } return queue_index; } EXPORT_SYMBOL(netdev_pick_tx); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { int queue_index = 0; #ifdef CONFIG_XPS u32 sender_cpu = skb->sender_cpu - 1; if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) queue_index = ops->ndo_select_queue(dev, skb, sb_dev); else queue_index = netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); } /** * __dev_queue_xmit() - transmit a buffer * @skb: buffer to transmit * @sb_dev: subordinate device used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling * this function. The function can be called from an interrupt. * * When calling this method, interrupts MUST be enabled. This is because * the BH enable code must have IRQs enabled so that it will not deadlock. * * Regardless of the return value, the skb is consumed, so it is currently * difficult to retry a send to this method. (You can bump the ref count * before sending to hold a reference for retry if you are careful.) * * Return: * * 0 - buffer successfully transmitted * * positive qdisc return code - NET_XMIT_DROP etc. * * negative errno - other errors */ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq = NULL; struct Qdisc *q; int rc = -ENOMEM; bool again = false; skb_reset_mac_header(skb); skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); skb_update_prio(skb); qdisc_pkt_len_init(skb); tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { skb = nf_hook_egress(skb, &rc, dev); if (!skb) goto out; } netdev_xmit_skip_txqueue(false); nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; nf_skip_egress(skb, false); if (netdev_xmit_txqueue_skipped()) txq = netdev_tx_queue_mapping(dev, skb); } #endif /* If device/qdisc don't need skb->dst, release it right now while * it's hot in this cpu cache. */ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); else skb_dst_force(skb); if (!txq) txq = netdev_core_pick_tx(dev, skb, sb_dev); q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); goto out; } /* The device has no queue. Common case for software devices: * loopback, all the sorts of tunnels... * Really, it is unlikely that netif_tx_lock protection is necessary * here. (e.g. loopback and IP tunnels are clean ignoring statistics * counters.) * However, it is possible that they rely on protection * made by us here.
* Check this and take the lock. It is not prone to deadlocks. * Or just use the noqueue qdisc, which is even simpler 8) */ if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ /* Other cpus might concurrently change txq->xmit_lock_owner * to -1 or to their cpu id, but not to our id. */ if (READ_ONCE(txq->xmit_lock_owner) != cpu) { if (dev_xmit_recursion()) goto recursion_alert; skb = validate_xmit_skb(skb, dev, &again); if (!skb) goto out; HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { dev_xmit_recursion_inc(); skb = dev_hard_start_xmit(skb, dev, txq, &rc); dev_xmit_recursion_dec(); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; } } HARD_TX_UNLOCK(dev, txq); net_crit_ratelimited("Virtual device %s asks to queue packet!\n", dev->name); } else { /* Recursion is detected! It is possible, * unfortunately */ recursion_alert: net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", dev->name); } } rc = -ENETDOWN; rcu_read_unlock_bh(); dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); return rc; } EXPORT_SYMBOL(__dev_queue_xmit); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; bool again = false; if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) goto drop; skb = validate_xmit_skb_list(skb, dev, &again); if (skb != orig_skb) goto drop; skb_set_queue_mapping(skb, queue_id); txq = skb_get_tx_queue(dev, skb); local_bh_disable(); dev_xmit_recursion_inc(); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); dev_xmit_recursion_dec(); local_bh_enable(); return ret; drop: dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return NET_XMIT_DROP; } EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ int netdev_max_backlog __read_mostly = 1000; EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; unsigned int sysctl_skb_defer_max __read_mostly = 64; int netdev_budget __read_mostly = 300; /* Must be at least 2 jiffies to guarantee 1 jiffy timeout */ unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ int dev_rx_weight __read_mostly = 64; int dev_tx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { struct task_struct *thread; lockdep_assert_irqs_disabled(); if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in * napi_enable()/dev_set_threaded(). * Use READ_ONCE() to guarantee a complete * read on napi->thread. Only call * wake_up_process() when it's not NULL. */ thread = READ_ONCE(napi->thread); if (thread) { /* Avoid doing set_bit() if the thread is in * INTERRUPTIBLE state, because napi_thread_wait() * makes sure to proceed with napi polling * if the thread is explicitly woken from here.
*/ if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } } list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() * we have to raise NET_RX_SOFTIRQ. */ if (!sd->in_net_rx_action) __raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS /* One global table that all flow-based protocols share. */ struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); u32 rps_cpu_mask __read_mostly; EXPORT_SYMBOL(rps_cpu_mask); struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { if (next_cpu < nr_cpu_ids) { #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; u32 flow_id; u16 rxq_index; int rc; /* Should we steer this flow to a different hardware queue? */ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || !(dev->features & NETIF_F_NTUPLE)) goto out; rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); if (rxq_index == skb_get_rx_queue(skb)) goto out; rxqueue = dev->_rx + rxq_index; flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; flow_id = skb_get_hash(skb) & flow_table->mask; rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; old_rflow = rflow; rflow = &flow_table->flows[flow_id]; rflow->filter = rc; if (old_rflow->filter == rflow->filter) old_rflow->filter = RPS_NO_FILTER; out: #endif rflow->last_qtail = per_cpu(softnet_data, next_cpu).input_queue_head; } rflow->cpu = next_cpu; return rflow; } /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. * rcu_read_lock must be held on entry. */ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { const struct rps_sock_flow_table *sock_flow_table; struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; struct rps_map *map; int cpu = -1; u32 tcpu; u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); goto done; } rxqueue += index; } /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ flow_table = rcu_dereference(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); if (!flow_table && !map) goto done; skb_reset_network_header(skb); hash = skb_get_hash(skb); if (!hash) goto done; sock_flow_table = rcu_dereference(rps_sock_flow_table); if (flow_table && sock_flow_table) { struct rps_dev_flow *rflow; u32 next_cpu; u32 ident; /* First check into global flow table if there is a match. * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). 
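 * (rps_record_sock_flow() packs the flow hash into the upper bits and the
 *  last recvmsg CPU into the low rps_cpu_mask bits; that is exactly what
 *  the masking below unpacks.)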
*/ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); if ((ident ^ hash) & ~rps_cpu_mask) goto try_rps; next_cpu = ident & rps_cpu_mask; /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ rflow = &flow_table->flows[hash & flow_table->mask]; tcpu = rflow->cpu; /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. * This guarantees that all previous packets for the flow * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; } } try_rps: if (map) { tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; } } done: return cpu; } #ifdef CONFIG_RFS_ACCEL /** * rps_may_expire_flow - check whether an RFS hardware filter may be removed * @dev: Device on which the filter was set * @rxq_index: RX queue index * @flow_id: Flow ID passed to ndo_rx_flow_steer() * @filter_id: Filter ID returned by ndo_rx_flow_steer() * * Drivers that implement ndo_rx_flow_steer() should periodically call * this function for each installed filter and remove the filters for * which it returns %true. */ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id) { struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - rflow->last_qtail) < (int)(10 * flow_table->mask))) expire = false; } rcu_read_unlock(); return expire; } EXPORT_SYMBOL(rps_may_expire_flow); #endif /* CONFIG_RFS_ACCEL */ /* Called from hardirq (IPI) context */ static void rps_trigger_softirq(void *data) { struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); sd->received_rps++; } #endif /* CONFIG_RPS */ /* Called from hardirq (IPI) context */ static void trigger_rx_softirq(void *data) { struct softnet_data *sd = data; __raise_softirq_irqoff(NET_RX_SOFTIRQ); smp_store_release(&sd->defer_ipi_scheduled, 0); } /* * After we queued a packet into sd->input_pkt_queue, * we need to make sure this queue is serviced soon. * * - If this is another cpu queue, link it to our rps_ipi_list, * and make sure we will process rps_ipi_list from net_rx_action(). * * - If this is our own queue, NAPI schedule our backlog. * Note that this also raises NET_RX_SOFTIRQ. */ static void napi_schedule_rps(struct softnet_data *sd) { struct softnet_data *mysd = this_cpu_ptr(&softnet_data); #ifdef CONFIG_RPS if (sd != mysd) { sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; /* If not called from net_rx_action() or napi_threaded_poll() * we have to raise NET_RX_SOFTIRQ. 
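 * In those two contexts the pending rps_ipi_list is flushed by
 * net_rps_action_and_irq_enable() before the caller is done, so
 * raising the softirq here would only cause a spurious wakeup.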
*/ if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll) __raise_softirq_irqoff(NET_RX_SOFTIRQ); return; } #endif /* CONFIG_RPS */ __napi_schedule_irqoff(&mysd->backlog); } #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) { #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit *fl; struct softnet_data *sd; unsigned int old_flow, new_flow; if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; fl->history_head++; fl->history_head &= FLOW_LIMIT_HISTORY - 1; if (likely(fl->buckets[old_flow])) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { fl->count++; rcu_read_unlock(); return true; } } rcu_read_unlock(); #endif return false; } /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). */ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { enum skb_drop_reason reason; struct softnet_data *sd; unsigned long flags; unsigned int qlen; reason = SKB_DROP_REASON_NOT_SPECIFIED; sd = &per_cpu(softnet_data, cpu); rps_lock_irqsave(sd, &flags); if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); rps_unlock_irq_restore(sd, &flags); return NET_RX_SUCCESS; } /* Schedule NAPI for backlog device * We can use non atomic operation since we own the queue lock */ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) napi_schedule_rps(sd); goto enqueue; } reason = SKB_DROP_REASON_CPU_BACKLOG; drop: sd->dropped++; rps_unlock_irq_restore(sd, &flags); dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; } static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct netdev_rx_queue *rxqueue; rxqueue = dev->_rx; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); return rxqueue; /* Return first rxqueue */ } rxqueue += index; } return rxqueue; } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; u32 metalen, act; int off; /* The XDP program wants to see the packet starting at the MAC * header. 
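 * After xdp_prepare_buff() below, xdp->data points at the MAC header
 * and xdp->data_end at the end of the linear data, leaving the skb
 * headroom available to bpf_xdp_adjust_head().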
*/ mac_len = skb->data - skb_mac_header(skb); hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ frame_sz = (void *)skb_end_pointer(skb) - hard_start; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); rxqueue = netif_get_rxqueue(skb); xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; act = bpf_prog_run_xdp(xdp_prog, xdp); /* check if bpf_xdp_adjust_head was used */ off = xdp->data - orig_data; if (off) { if (off > 0) __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); skb->mac_header += off; skb_reset_network_header(skb); } /* check if bpf_xdp_adjust_tail was used */ off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); skb->len += off; /* positive on grow, negative on shrink */ } /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || (orig_host != ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); } /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull * before calling us again on redirect path. We do not call do_redirect * as we leave that up to the caller. * * Caller is responsible for managing lifetime of skb (i.e. calling * kfree_skb in response to actions it cannot handle/XDP_DROP). */ switch (act) { case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); break; case XDP_PASS: metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); break; } return act; } static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { u32 act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ if (skb_is_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also * native XDP provides, thus we need to do it here as well. */ if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); int troom = skb->tail + skb->data_len - skb->end; /* In case we have to go down the path and also linearize, * then lets do the pskb_expand_head() work just once here. */ if (pskb_expand_head(skb, hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) goto do_drop; if (skb_linearize(skb)) goto do_drop; } act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); switch (act) { case XDP_REDIRECT: case XDP_TX: case XDP_PASS: break; default: bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(skb->dev, xdp_prog, act); fallthrough; case XDP_DROP: do_drop: kfree_skb(skb); break; } return act; } /* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. 
This also means * that XDP packets are able to starve other packets going through a qdisc, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue. */ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; bool free_skb = true; int cpu, rc; txq = netdev_core_pick_tx(dev, skb, NULL); cpu = smp_processor_id(); HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_frozen_or_drv_stopped(txq)) { rc = netdev_start_xmit(skb, dev, txq, 0); if (dev_xmit_complete(rc)) free_skb = false; } HARD_TX_UNLOCK(dev, txq); if (free_skb) { trace_xdp_exception(dev, xdp_prog, XDP_TX); dev_core_stats_tx_dropped_inc(dev); kfree_skb(skb); } } static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: err = xdp_do_generic_redirect(skb->dev, skb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: generic_xdp_tx(skb, xdp_prog); break; } return XDP_DROP; } } return XDP_PASS; out_redir: kfree_skb_reason(skb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); static int netif_rx_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_rx(skb); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu < 0) cpu = smp_processor_id(); ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); } else #endif { unsigned int qtail; ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } /** * __netif_rx - Slightly optimized version of netif_rx * @skb: buffer to post * * This behaves as netif_rx except that it does not disable bottom halves. * As a result this function may only be invoked from the interrupt context * (either hard or soft interrupt). */ int __netif_rx(struct sk_buff *skb) { int ret; lockdep_assert_once(hardirq_count() | softirq_count()); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); return ret; } EXPORT_SYMBOL(__netif_rx); /** * netif_rx - post buffer to the network code * @skb: buffer to post * * This function receives a packet from a device driver and queues it for * the upper (protocol) levels to process via the backlog NAPI device. It * always succeeds. The buffer may be dropped during processing for * congestion control or by the protocol layers. * The network buffer is passed via the backlog NAPI device. Modern NIC * driver should use NAPI and GRO. * This function can used from interrupt and from process context. The * caller from process context must not disable interrupts before invoking * this function. 
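 *	A minimal sketch of the classic non-NAPI receive path (illustrative
 *	only, not a real driver):
 *
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);
 *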
* * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped) * */ int netif_rx(struct sk_buff *skb) { bool need_bh_off = !(hardirq_count() | softirq_count()); int ret; if (need_bh_off) local_bh_disable(); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); if (need_bh_off) local_bh_enable(); return ret; } EXPORT_SYMBOL(netif_rx); static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { struct sk_buff *clist; local_irq_disable(); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_enable(); while (clist) { struct sk_buff *skb = clist; clist = clist->next; WARN_ON(refcount_read(&skb->users)); if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED)) trace_consume_skb(skb, net_tx_action); else trace_kfree_skb(skb, net_tx_action, get_kfree_skb_cb(skb)->reason); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else __napi_kfree_skb(skb, get_kfree_skb_cb(skb)->reason); } } if (sd->output_queue) { struct Qdisc *head; local_irq_disable(); head = sd->output_queue; sd->output_queue = NULL; sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); rcu_read_lock(); while (head) { struct Qdisc *q = head; spinlock_t *root_lock = NULL; head = head->next_sched; /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); if (!(q->flags & TCQ_F_NOLOCK)) { root_lock = qdisc_lock(q); spin_lock(root_lock); } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { /* There is a synchronize_net() between * STATE_DEACTIVATED flag being set and * qdisc_reset()/some_qdisc_is_busy() in * dev_deactivate(), so we can safely bail out * early here to avoid data race between * qdisc_deactivate() and some_qdisc_is_busy() * for lockless qdisc. */ clear_bit(__QDISC_STATE_SCHED, &q->state); continue; } clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); if (root_lock) spin_unlock(root_lock); } rcu_read_unlock(); } xfrm_dev_backlog(sd); } #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check * * Check if a receive handler is already registered for a given device. * Return true if there one. * * The caller must hold the rtnl_mutex. */ bool netdev_is_rx_handler_busy(struct net_device *dev) { ASSERT_RTNL(); return dev && rtnl_dereference(dev->rx_handler); } EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * * The caller must hold the rtnl_mutex. * * For a general description of rx_handler, see enum rx_handler_result. 
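 * A sketch of the usual pattern (my_handle_frame and my_port are
 * hypothetical names):
 *
 *	err = netdev_rx_handler_register(port_dev, my_handle_frame, my_port);
 *
 * where my_handle_frame() returns one of the RX_HANDLER_* values that
 * __netif_receive_skb_core() acts on below.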
*/ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { if (netdev_is_rx_handler_busy(dev)) return -EBUSY; if (dev->priv_flags & IFF_NO_RX_HANDLER) return -EINVAL; /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); return 0; } EXPORT_SYMBOL_GPL(netdev_rx_handler_register); /** * netdev_rx_handler_unregister - unregister receive handler * @dev: device to unregister a handler from * * Unregister a receive handler from a device. * * The caller must hold the rtnl_mutex. */ void netdev_rx_handler_unregister(struct net_device *dev) { ASSERT_RTNL(); RCU_INIT_POINTER(dev->rx_handler, NULL); /* a reader seeing a non NULL rx_handler in a rcu_read_lock() * section has a guarantee to see a non NULL rx_handler_data * as well. */ synchronize_net(); RCU_INIT_POINTER(dev->rx_handler_data, NULL); } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); /* * Limit the use of PFMEMALLOC reserves to those protocols that implement * the special handling of PFMEMALLOC skbs. */ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_ARP): case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): return true; default: return false; } } static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { if (nf_hook_ingress_active(skb)) { int ingress_retval; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } rcu_read_lock(); ingress_retval = nf_hook_ingress(skb); rcu_read_unlock(); return ingress_retval; } return 0; } static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, struct packet_type **ppt_prev) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct sk_buff *skb = *pskb; struct net_device *orig_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_receive_skb(skb); orig_dev = skb->dev; skb_reset_network_header(skb); if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); skb_reset_mac_len(skb); pt_prev = NULL; another_round: skb->skb_iif = skb->dev->ifindex; __this_cpu_inc(softnet_data.processed); if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret2; migrate_disable(); ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); migrate_enable(); if (ret2 != XDP_PASS) { ret = NET_RX_DROP; goto out; } } if (eth_type_vlan(skb->protocol)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; } if (skb_skip_tc_classify(skb)) goto skip_classify; if (pfmemalloc) goto skip_taps; list_for_each_entry_rcu(ptype, &ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { bool another = false; nf_skip_egress(skb, true); skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, &another); if (another) goto another_round; if (!skb) goto out; nf_skip_egress(skb, false); if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) goto out; } #endif skb_reset_redirect(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; if (skb_vlan_tag_present(skb)) { if 
(pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto out; } rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; break; case RX_HANDLER_PASS: break; default: BUG(); } } if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { check_vlan_id: if (skb_vlan_tag_get_id(skb)) { /* Vlan id is non 0 and vlan_do_receive() above couldn't * find vlan device. */ skb->pkt_type = PACKET_OTHERHOST; } else if (eth_type_vlan(skb->protocol)) { /* Outer header is 802.1P with vlan 0, inner header is * 802.1Q or 802.1AD and vlan_do_receive() above could * not find vlan dev for vlan id 0. */ __vlan_hwaccel_clear_tag(skb); skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; if (vlan_do_receive(&skb)) /* After stripping off 802.1P header with vlan 0 * vlan dev is found for inner header. */ goto another_round; else if (unlikely(!skb)) goto out; else /* We have stripped outer 802.1P vlan 0 header. * But could not find vlan dev. * check again for vlan id to set OTHERHOST. */ goto check_vlan_id; } /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; /* deliver only exact match when indicated */ if (likely(!deliver_exact)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &orig_dev->ptype_specific); if (unlikely(skb->dev != orig_dev)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &skb->dev->ptype_specific); } if (pt_prev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; *ppt_prev = pt_prev; } else { drop: if (!deliver_exact) dev_core_stats_rx_dropped_inc(skb->dev); else dev_core_stats_rx_nohandler_inc(skb->dev); kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ ret = NET_RX_DROP; } out: /* The invariant here is that if *ppt_prev is not NULL * then skb should also be non-NULL. * * Apparently *ppt_prev assignment above holds this invariant due to * skb dereferencing near it. */ *pskb = skb; return ret; } static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; int ret; ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (pt_prev) ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, skb->dev, pt_prev, orig_dev); return ret; } /** * netif_receive_skb_core - special purpose version of netif_receive_skb * @skb: buffer to process * * More direct receive version of netif_receive_skb(). It should * only be used by callers that have a need to skip RPS and Generic XDP. * Caller must also take care of handling if ``(page_is_)pfmemalloc``. * * This function may only be called from softirq context and interrupts * should be enabled. 
* * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb_core(struct sk_buff *skb) { int ret; rcu_read_lock(); ret = __netif_receive_skb_one_core(skb, false); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(netif_receive_skb_core); static inline void __netif_receive_skb_list_ptype(struct list_head *head, struct packet_type *pt_prev, struct net_device *orig_dev) { struct sk_buff *skb, *next; if (!pt_prev) return; if (list_empty(head)) return; if (pt_prev->list_func != NULL) INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, ip_list_rcv, head, pt_prev, orig_dev); else list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } } static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) { /* Fast-path assumptions: * - There is no RX handler. * - Only one packet_type matches. * If either of these fails, we will end up doing some per-packet * processing in-line, then handling the 'last ptype' for the whole * sublist. This can't cause out-of-order delivery to any single ptype, * because the 'last ptype' must be constant across the sublist, and all * other ptypes are handled per-packet. */ /* Current (common) ptype of sublist */ struct packet_type *pt_curr = NULL; /* Current (common) orig_dev of sublist */ struct net_device *od_curr = NULL; struct list_head sublist; struct sk_buff *skb, *next; INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; skb_list_del_init(skb); __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; if (pt_curr != pt_prev || od_curr != orig_dev) { /* dispatch old sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); /* start new sublist */ INIT_LIST_HEAD(&sublist); pt_curr = pt_prev; od_curr = orig_dev; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); } static int __netif_receive_skb(struct sk_buff *skb) { int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should * - be delivered to SOCK_MEMALLOC sockets only * - stay away from userspace * - have bounded memory usage * * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ noreclaim_flag = memalloc_noreclaim_save(); ret = __netif_receive_skb_one_core(skb, true); memalloc_noreclaim_restore(noreclaim_flag); } else ret = __netif_receive_skb_one_core(skb, false); return ret; } static void __netif_receive_skb_list(struct list_head *head) { unsigned long noreclaim_flag = 0; struct sk_buff *skb, *next; bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? 
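 * The loop below cuts the list each time an skb's pfmemalloc status
 * differs from the current sublist, so every chunk is processed under
 * one consistent memalloc_noreclaim context.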
*/ list_for_each_entry_safe(skb, next, head, list) { if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { struct list_head sublist; /* Handle the previous sublist */ list_cut_before(&sublist, head, &skb->list); if (!list_empty(&sublist)) __netif_receive_skb_list_core(&sublist, pfmemalloc); pfmemalloc = !pfmemalloc; /* See comments in __netif_receive_skb */ if (pfmemalloc) noreclaim_flag = memalloc_noreclaim_save(); else memalloc_noreclaim_restore(noreclaim_flag); } } /* Handle the remaining sublist */ if (!list_empty(head)) __netif_receive_skb_list_core(head, pfmemalloc); /* Restore pflags */ if (pfmemalloc) memalloc_noreclaim_restore(noreclaim_flag); } static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; int ret = 0; switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); if (old) bpf_prog_put(old); if (old && !new) { static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { static_branch_inc(&generic_xdp_needed_key); dev_disable_lro(dev); dev_disable_gro_hw(dev); } break; default: ret = -EINVAL; break; } return ret; } static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); return ret; } } #endif ret = __netif_receive_skb(skb); rcu_read_unlock(); return ret; } void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; struct list_head sublist; INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); } list_splice_init(&sublist, head); rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { list_for_each_entry_safe(skb, next, head, list) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { /* Will be handled, remove from list */ skb_list_del_init(skb); enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } } #endif __netif_receive_skb_list(head); rcu_read_unlock(); } /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process * * netif_receive_skb() is the main receive data processing function. * It always succeeds. The buffer may be dropped during processing * for congestion control or by the protocol layers. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb(struct sk_buff *skb) { int ret; trace_netif_receive_skb_entry(skb); ret = netif_receive_skb_internal(skb); trace_netif_receive_skb_exit(ret); return ret; } EXPORT_SYMBOL(netif_receive_skb); /** * netif_receive_skb_list - process many receive buffers from network * @head: list of skbs to process. * * Since return value of netif_receive_skb() is normally ignored, and * wouldn't be meaningful for a list, this function returns void. 
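 * Callers typically batch buffers like this (sketch only):
 *
 *	LIST_HEAD(head);
 *	list_add_tail(&skb->list, &head);	(once per received skb)
 *	netif_receive_skb_list(&head);
 *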
* * This function may only be called from softirq context and interrupts * should be enabled. */ void netif_receive_skb_list(struct list_head *head) { struct sk_buff *skb; if (list_empty(head)) return; if (trace_netif_receive_skb_list_entry_enabled()) { list_for_each_entry(skb, head, list) trace_netif_receive_skb_list_entry(skb); } netif_receive_skb_list_internal(head); trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); static DEFINE_PER_CPU(struct work_struct, flush_works); /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; struct softnet_data *sd; local_bh_disable(); sd = this_cpu_ptr(&softnet_data); rps_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); dev_kfree_skb_irq(skb); input_queue_head_incr(sd); } } rps_unlock_irq_enable(sd); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); input_queue_head_incr(sd); } } local_bh_enable(); } static bool flush_required(int cpu) { #if IS_ENABLED(CONFIG_RPS) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; rps_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); rps_unlock_irq_enable(sd); return do_flush; #endif /* without RPS we can't safely check input_pkt_queue: during a * concurrent remote skb_queue_splice() we can detect as empty both * input_pkt_queue and process_queue even if the latter could end-up * containing a lot of packets. */ return true; } static void flush_all_backlogs(void) { static cpumask_t flush_cpus; unsigned int cpu; /* since we are under rtnl lock protection we can use static data * for the cpumask and avoid allocating on stack the possibly * large mask */ ASSERT_RTNL(); cpus_read_lock(); cpumask_clear(&flush_cpus); for_each_online_cpu(cpu) { if (flush_required(cpu)) { queue_work_on(cpu, system_highpri_wq, per_cpu_ptr(&flush_works, cpu)); cpumask_set_cpu(cpu, &flush_cpus); } } /* we can have in flight packet[s] on the cpus we are not flushing, * synchronize_net() in unregister_netdevice_many() will take care of * them */ for_each_cpu(cpu, &flush_cpus) flush_work(per_cpu_ptr(&flush_works, cpu)); cpus_read_unlock(); } static void net_rps_send_ipi(struct softnet_data *remsd) { #ifdef CONFIG_RPS while (remsd) { struct softnet_data *next = remsd->rps_ipi_next; if (cpu_online(remsd->cpu)) smp_call_function_single_async(remsd->cpu, &remsd->csd); remsd = next; } #endif } /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. */ static void net_rps_action_and_irq_enable(struct softnet_data *sd) { #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; if (remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); /* Send pending IPI's to kick RPS processing on remote cpus. 
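 * Each IPI lands in rps_trigger_softirq() on the target cpu, which
 * schedules that cpu's backlog napi and raises NET_RX_SOFTIRQ there.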
*/ net_rps_send_ipi(remsd); } else #endif local_irq_enable(); } static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS return sd->rps_ipi_list != NULL; #else return false; #endif } static int process_backlog(struct napi_struct *napi, int quota) { struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); bool again = true; int work = 0; /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } napi->weight = READ_ONCE(dev_rx_weight); while (again) { struct sk_buff *skb; while ((skb = __skb_dequeue(&sd->process_queue))) { rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); input_queue_head_incr(sd); if (++work >= quota) return work; } rps_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, * and NAPI_STATE_SCHED is the only possible flag set * on backlog. * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ napi->state = 0; again = false; } else { skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); } rps_unlock_irq_enable(sd); } return work; } /** * __napi_schedule - schedule for receive * @n: entry to schedule * * The entry's receive function will be scheduled to run. * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { unsigned long flags; local_irq_save(flags); ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); /** * napi_schedule_prep - check if napi can be scheduled * @n: napi context * * Test if NAPI routine is already running, and if not mark * it as running. This is used as a condition variable to * insure only one NAPI poll instance runs. We also make * sure there is no pending NAPI disable. */ bool napi_schedule_prep(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); do { if (unlikely(val & NAPIF_STATE_DISABLE)) return false; new = val | NAPIF_STATE_SCHED; /* Sets STATE_MISSED bit if STATE_SCHED was already set * This was suggested by Alexander Duyck, as compiler * emits better code than : * if (val & NAPIF_STATE_SCHED) * new |= NAPIF_STATE_MISSED; */ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED; } while (!try_cmpxchg(&n->state, &val, new)); return !(val & NAPIF_STATE_SCHED); } EXPORT_SYMBOL(napi_schedule_prep); /** * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * * Variant of __napi_schedule() assuming hard irqs are masked. * * On PREEMPT_RT enabled kernels this maps to __napi_schedule() * because the interrupt disabled assumption might not be true * due to force-threaded interrupts and spinlock substitution. */ void __napi_schedule_irqoff(struct napi_struct *n) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ____napi_schedule(this_cpu_ptr(&softnet_data), n); else __napi_schedule(n); } EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags, val, new, timeout = 0; bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list * just in case its running on a different cpu. * 2) If we are busy polling, do nothing here, we have * the guarantee we will be called later. 
*/ if (unlikely(n->state & (NAPIF_STATE_NPSVC | NAPIF_STATE_IN_BUSY_POLL))) return false; if (work_done) { if (n->gro_bitmask) timeout = READ_ONCE(n->dev->gro_flush_timeout); n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; timeout = READ_ONCE(n->dev->gro_flush_timeout); if (timeout) ret = false; } if (n->gro_bitmask) { /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); } gro_normal_list(n); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ local_irq_save(flags); list_del_init(&n->poll_list); local_irq_restore(flags); } WRITE_ONCE(n->list_owner, -1); val = READ_ONCE(n->state); do { WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | NAPIF_STATE_SCHED_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. * This C code was suggested by Alexander Duyck to help gcc. */ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * NAPIF_STATE_SCHED; } while (!try_cmpxchg(&n->state, &val, new)); if (unlikely(val & NAPIF_STATE_MISSED)) { __napi_schedule(n); return false; } if (timeout) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); return ret; } EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) if (napi->napi_id == napi_id) return napi; return NULL; } #if defined(CONFIG_NET_RX_BUSY_POLL) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { gro_normal_list(napi); __napi_schedule(napi); return; } if (napi->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. */ napi_gro_flush(napi, HZ >= 1000); } gro_normal_list(napi); clear_bit(NAPI_STATE_SCHED, &napi->state); } static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, u16 budget) { bool skip_schedule = false; unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was * set in napi_schedule_prep(). * Since we are about to call napi->poll() once more, we can safely * clear NAPI_STATE_MISSED. * * Note: x86 could use a single "lock and ..." instruction * to perform these two clear_bit() */ clear_bit(NAPI_STATE_MISSED, &napi->state); clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); if (prefer_busy_poll) { napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); timeout = READ_ONCE(napi->dev->gro_flush_timeout); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); skip_schedule = true; } } /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. 
*/ trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); if (rc == budget) __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; restart: napi_poll = NULL; rcu_read_lock(); napi = napi_by_id(napi_id); if (!napi) goto out; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); for (;;) { int work = 0; local_bh_disable(); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); /* If multiple threads are competing for this napi, * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) { if (prefer_busy_poll) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) { if (prefer_busy_poll) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } work = napi_poll(napi, budget); trace_napi_poll(napi, work, budget); gro_normal_list(napi); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { if (napi_poll) busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); rcu_read_unlock(); cond_resched(); if (loop_end(loop_end_arg, start_time)) return; goto restart; } cpu_relax(); } if (napi_poll) busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); out: rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); #endif /* CONFIG_NET_RX_BUSY_POLL */ static void napi_hash_add(struct napi_struct *napi) { if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return; spin_lock(&napi_hash_lock); /* 0..NR_CPUS range is reserved for sender_cpu use */ do { if (unlikely(++napi_gen_id < MIN_NAPI_ID)) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); napi->napi_id = napi_gen_id; hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); spin_unlock(&napi_hash_lock); } /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ static void napi_hash_del(struct napi_struct *napi) { spin_lock(&napi_hash_lock); hlist_del_init_rcu(&napi->napi_hash_node); spin_unlock(&napi_hash_lock); } static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) { struct napi_struct *napi; napi = container_of(timer, struct napi_struct, timer); /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. 
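 * The timer is armed by napi_complete_done()/busy_poll_stop() using
 * dev->gro_flush_timeout; when it fires we just reschedule the napi so
 * any deferred work (e.g. packets held in GRO) gets flushed.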
*/ if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); } return HRTIMER_NORESTART; } static void init_gro_hash(struct napi_struct *napi) { int i; for (i = 0; i < GRO_HASH_BUCKETS; i++) { INIT_LIST_HEAD(&napi->gro_hash[i].list); napi->gro_hash[i].count = 0; } napi->gro_bitmask = 0; } int dev_set_threaded(struct net_device *dev, bool threaded) { struct napi_struct *napi; int err = 0; if (dev->threaded == threaded) return 0; if (threaded) { list_for_each_entry(napi, &dev->napi_list, dev_list) { if (!napi->thread) { err = napi_kthread_create(napi); if (err) { threaded = false; break; } } } } dev->threaded = threaded; /* Make sure kthread is created before THREADED bit * is set. */ smp_mb__before_atomic(); /* Setting/unsetting threaded mode on a napi might not immediately * take effect, if the current napi instance is actively being * polled. In this case, the switch between threaded mode and * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. */ list_for_each_entry(napi, &dev->napi_list, dev_list) assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); return err; } EXPORT_SYMBOL(dev_set_threaded); /** * netif_queue_set_napi - Associate queue with the napi * @dev: device to which NAPI and queue belong * @queue_index: Index of queue * @type: queue type as RX or TX * @napi: NAPI context, pass NULL to clear previously set NAPI * * Set queue with its corresponding napi context. This should be done after * registering the NAPI handler for the queue-vector and the queues have been * mapped to the corresponding interrupt vector. */ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi) { struct netdev_rx_queue *rxq; struct netdev_queue *txq; if (WARN_ON_ONCE(napi && !napi->dev)) return; if (dev->reg_state >= NETREG_REGISTERED) ASSERT_RTNL(); switch (type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(dev, queue_index); rxq->napi = napi; return; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(dev, queue_index); txq->napi = napi; return; default: return; } } EXPORT_SYMBOL(netif_queue_set_napi); void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; INIT_LIST_HEAD(&napi->poll_list); INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; init_gro_hash(napi); napi->skb = NULL; INIT_LIST_HEAD(&napi->rx_list); napi->rx_count = 0; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, weight); napi->weight = weight; napi->dev = dev; #ifdef CONFIG_NETPOLL napi->poll_owner = -1; #endif napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); napi_hash_add(napi); napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that * threaded mode will not be enabled in napi_enable(). 
*/ if (dev->threaded && napi_kthread_create(napi)) dev->threaded = 0; netif_napi_set_irq(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight); void napi_disable(struct napi_struct *n) { unsigned long val, new; might_sleep(); set_bit(NAPI_STATE_DISABLE, &n->state); val = READ_ONCE(n->state); do { while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); val = READ_ONCE(n->state); } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable); /** * napi_enable - enable NAPI scheduling * @n: NAPI context * * Resume NAPI from being scheduled on this context. * Must be paired with napi_disable. */ void napi_enable(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); do { BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); if (n->dev->threaded && n->thread) new |= NAPIF_STATE_THREADED; } while (!try_cmpxchg(&n->state, &val, new)); } EXPORT_SYMBOL(napi_enable); static void flush_gro_hash(struct napi_struct *napi) { int i; for (i = 0; i < GRO_HASH_BUCKETS; i++) { struct sk_buff *skb, *n; list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) kfree_skb(skb); napi->gro_hash[i].count = 0; } } /* Must be called in process context */ void __netif_napi_del(struct napi_struct *napi) { if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; napi_hash_del(napi); list_del_rcu(&napi->dev_list); napi_free_frags(napi); flush_gro_hash(napi); napi->gro_bitmask = 0; if (napi->thread) { kthread_stop(napi->thread); napi->thread = NULL; } } EXPORT_SYMBOL(__netif_napi_del); static int __napi_poll(struct napi_struct *n, bool *repoll) { int work, weight; weight = n->weight; /* This NAPI_STATE_SCHED test is for avoiding a race * with netpoll's poll_napi(). Only the entity which * obtains the lock and sees NAPI_STATE_SCHED set will * actually make the ->poll() call. Therefore we avoid * accidentally calling ->poll() when NAPI is not scheduled. */ work = 0; if (napi_is_scheduled(n)) { work = n->poll(n, weight); trace_napi_poll(n, work, weight); xdp_do_check_flushed(n); } if (unlikely(work > weight)) netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", n->poll, work, weight); if (likely(work < weight)) return work; /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code * still "owns" the NAPI instance and therefore can * move the instance around on the list at-will. */ if (unlikely(napi_disable_pending(n))) { napi_complete(n); return work; } /* The NAPI context has more processing work, but busy-polling * is preferred. Exit early. */ if (napi_prefer_busy_poll(n)) { if (napi_complete_done(n, work)) { /* If timeout is not set, we need to make sure * that the NAPI is re-scheduled. */ napi_schedule(n); } return work; } if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. */ napi_gro_flush(n, HZ >= 1000); } gro_normal_list(n); /* Some drivers may have called napi_schedule * prior to exhausting their budget. */ if (unlikely(!list_empty(&n->poll_list))) { pr_warn_once("%s: Budget exhausted after napi rescheduled\n", n->dev ? 
n->dev->name : "backlog"); return work; } *repoll = true; return work; } static int napi_poll(struct napi_struct *n, struct list_head *repoll) { bool do_repoll = false; void *have; int work; list_del_init(&n->poll_list); have = netpoll_poll_lock(n); work = __napi_poll(n, &do_repoll); if (do_repoll) list_add_tail(&n->poll_list, repoll); netpoll_poll_unlock(have); return work; } static int napi_thread_wait(struct napi_struct *napi) { bool woken = false; set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { /* Testing SCHED_THREADED bit here to make sure the current * kthread owns this napi and could poll on this napi. * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable(). */ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); /* woken being true indicates this thread owns this napi. */ woken = true; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return -1; } static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ if (!READ_ONCE(sd->defer_list)) return; spin_lock(&sd->defer_lock); skb = sd->defer_list; sd->defer_list = NULL; sd->defer_count = 0; spin_unlock(&sd->defer_lock); while (skb != NULL) { next = skb->next; napi_consume_skb(skb, 1); skb = next; } } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; struct softnet_data *sd; void *have; while (!napi_thread_wait(napi)) { for (;;) { bool repoll = false; local_bh_disable(); sd = this_cpu_ptr(&softnet_data); sd->in_napi_threaded_poll = true; have = netpoll_poll_lock(napi); __napi_poll(napi, &repoll); netpoll_poll_unlock(have); sd->in_napi_threaded_poll = false; barrier(); if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } skb_defer_free_flush(sd); local_bh_enable(); if (!repoll) break; cond_resched(); } } return 0; } static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); int budget = READ_ONCE(netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); start: sd->in_net_rx_action = true; local_irq_disable(); list_splice_init(&sd->poll_list, &list); local_irq_enable(); for (;;) { struct napi_struct *n; skb_defer_free_flush(sd); if (list_empty(&list)) { if (list_empty(&repoll)) { sd->in_net_rx_action = false; barrier(); /* We need to check if ____napi_schedule() * had refilled poll_list while * sd->in_net_rx_action was true. */ if (!list_empty(&sd->poll_list)) goto start; if (!sd_has_rps_ipi_waiting(sd)) goto end; } break; } n = list_first_entry(&list, struct napi_struct, poll_list); budget -= napi_poll(n, &repoll); /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. 
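 * Both limits are tunable via the net.core.netdev_budget and
 * net.core.netdev_budget_usecs sysctls; hitting either one is counted
 * as a time_squeeze event in /proc/net/softnet_stat.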
*/ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { sd->time_squeeze++; break; } } local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); list_splice_tail(&repoll, &list); list_splice(&list, &sd->poll_list); if (!list_empty(&sd->poll_list)) __raise_softirq_irqoff(NET_RX_SOFTIRQ); else sd->in_net_rx_action = false; net_rps_action_and_irq_enable(sd); end:; } struct netdev_adjacent { struct net_device *dev; netdevice_tracker dev_tracker; /* upper master flag, there can only be one master device per list */ bool master; /* lookup ignore flag */ bool ignore; /* counter for the number of times this device was added to us */ u16 ref_nr; /* private field for the users */ void *private; struct list_head list; struct rcu_head rcu; }; static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, struct list_head *adj_list) { struct netdev_adjacent *adj; list_for_each_entry(adj, adj_list, list) { if (adj->dev == adj_dev) return adj; } return NULL; } static int ____netdev_has_upper_dev(struct net_device *upper_dev, struct netdev_nested_priv *priv) { struct net_device *dev = (struct net_device *)priv->data; return upper_dev == dev; } /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks only immediate upper device, * not through a complete stack of devices. The caller must hold the RTNL lock. */ bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .data = (void *)upper_dev, }; ASSERT_RTNL(); return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, &priv); } EXPORT_SYMBOL(netdev_has_upper_dev); /** * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks the entire upper device chain. * The caller must hold rcu lock. */ bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .data = (void *)upper_dev, }; return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, &priv); } EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); /** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.upper); } EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RTNL lock. 
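 * For example, for a bonding or bridge port this returns the bond or
 * bridge device itself (the one adjacency flagged as ->master).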
*/ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get); static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master) && !upper->ignore) return upper->dev; return NULL; } /** * netdev_has_any_lower_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to a lower device and return true in case * it is. The caller must hold the RTNL lock. */ static bool netdev_has_any_lower_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.lower); } void *netdev_adjacent_get_private(struct list_head *adj_list) { struct netdev_adjacent *adj; adj = list_entry(adj_list, struct netdev_adjacent, list); return adj->private; } EXPORT_SYMBOL(netdev_adjacent_get_private); /** * netdev_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * * Gets the next device from the dev's upper list, starting from iter * position. The caller must hold RCU read lock. */ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); static struct net_device *__netdev_next_upper_dev(struct net_device *dev, struct list_head **iter, bool *ignore) { struct netdev_adjacent *upper; upper = list_entry((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; *ignore = upper->ignore; return upper->dev; } static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } static int __netdev_walk_all_upper_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; bool ignore; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { udev = __netdev_next_upper_dev(now, &iter, &ignore); if (!udev) break; if (ignore) continue; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { 
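	/* Iterative DFS over the upper-device graph: dev_stack[] and
	 * iter_stack[] replace recursion and bound the nesting depth to
	 * MAX_NEST_DEV levels.
	 */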
struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { udev = netdev_next_upper_dev_rcu(now, &iter); if (!udev) break; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); static bool __netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .flags = 0, .data = (void *)upper_dev, }; ASSERT_RTNL(); return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, &priv); } /** * netdev_lower_get_next_private - Get the next ->private from the * lower neighbour list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold either hold the * RTNL lock or its own locking that guarantees that the neighbour lower * list will remain unchanged. */ void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private); /** * netdev_lower_get_next_private_rcu - Get the next ->private from the * lower neighbour list, RCU * variant * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold RCU read lock. */ void *netdev_lower_get_next_private_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); /** * netdev_lower_get_next - Get the next device from the lower neighbour * list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent from the dev's lower neighbour * list, starting from iter position. The caller must hold RTNL lock or * its own locking that guarantees that the neighbour lower * list will remain unchanged. 
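 *
 * A minimal iteration sketch, assuming RTNL is already held; in-tree
 * callers normally use the netdev_for_each_lower_dev() helper that is
 * built on top of this function:
 *
 *	struct list_head *iter = dev->adj_list.lower.next;
 *	struct net_device *ldev;
 *
 *	while ((ldev = netdev_lower_get_next(dev, &iter)))
 *		pr_info("%s is a lower dev of %s\n", ldev->name, dev->name);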
*/ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->dev; } EXPORT_SYMBOL(netdev_lower_get_next); static struct net_device *netdev_next_lower_dev(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } static struct net_device *__netdev_next_lower_dev(struct net_device *dev, struct list_head **iter, bool *ignore) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; *ignore = lower->ignore; return lower->dev; } int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); static int __netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; bool ignore; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = __netdev_next_lower_dev(now, &iter, &ignore); if (!ldev) break; if (ignore) continue; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } EXPORT_SYMBOL(netdev_next_lower_dev_rcu); static u8 __netdev_upper_depth(struct net_device *dev) { struct net_device *udev; struct list_head *iter; u8 max_depth = 0; bool ignore; for (iter = &dev->adj_list.upper, udev = __netdev_next_upper_dev(dev, &iter, &ignore); udev; udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { if (ignore) continue; if (max_depth < udev->upper_level) max_depth = udev->upper_level; } return max_depth; } static u8 __netdev_lower_depth(struct net_device *dev) { struct net_device *ldev; struct list_head *iter; u8 max_depth = 0; bool ignore; for (iter = &dev->adj_list.lower, ldev = __netdev_next_lower_dev(dev, &iter, &ignore); ldev; ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { if (ignore) continue; if (max_depth < 
ldev->lower_level) max_depth = ldev->lower_level; } return max_depth; } static int __netdev_update_upper_level(struct net_device *dev, struct netdev_nested_priv *__unused) { dev->upper_level = __netdev_upper_depth(dev) + 1; return 0; } #ifdef CONFIG_LOCKDEP static LIST_HEAD(net_unlink_list); static void net_unlink_todo(struct net_device *dev) { if (list_empty(&dev->unlink_list)) list_add_tail(&dev->unlink_list, &net_unlink_list); } #endif static int __netdev_update_lower_level(struct net_device *dev, struct netdev_nested_priv *priv) { dev->lower_level = __netdev_lower_depth(dev) + 1; #ifdef CONFIG_LOCKDEP if (!priv) return 0; if (priv->flags & NESTED_SYNC_IMM) dev->nested_level = dev->lower_level - 1; if (priv->flags & NESTED_SYNC_TODO) net_unlink_todo(dev); #endif return 0; } int netdev_walk_all_lower_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev_rcu(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); /** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU * variant * @dev: device * * Gets the first netdev_adjacent->private from the dev's lower neighbour * list. The caller must hold RCU read lock. */ void *netdev_lower_get_first_private_rcu(struct net_device *dev) { struct netdev_adjacent *lower; lower = list_first_or_null_rcu(&dev->adj_list.lower, struct netdev_adjacent, list); if (lower) return lower->private; return NULL; } EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RCU read lock. */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { struct netdev_adjacent *upper; upper = list_first_or_null_rcu(&dev->adj_list.upper, struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int netdev_adjacent_sysfs_add(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", adj_dev->name); return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), linkname); } static void netdev_adjacent_sysfs_del(struct net_device *dev, char *name, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? 
"upper_%s" : "lower_%s", name); sysfs_remove_link(&(dev->dev.kobj), linkname); } static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { return (dev_list == &dev->adj_list.upper || dev_list == &dev->adj_list.lower) && net_eq(dev_net(dev), dev_net(adj_dev)); } static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list, void *private, bool master) { struct netdev_adjacent *adj; int ret; adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { adj->ref_nr += 1; pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", dev->name, adj_dev->name, adj->ref_nr); return 0; } adj = kmalloc(sizeof(*adj), GFP_KERNEL); if (!adj) return -ENOMEM; adj->dev = adj_dev; adj->master = master; adj->ref_nr = 1; adj->private = private; adj->ignore = false; netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); if (ret) goto free_adj; } /* Ensure that master link is always the first item in list. */ if (master) { ret = sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), "master"); if (ret) goto remove_symlinks; list_add_rcu(&adj->list, dev_list); } else { list_add_tail_rcu(&adj->list, dev_list); } return 0; remove_symlinks: if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: netdev_put(adj_dev, &adj->dev_tracker); kfree(adj); return ret; } static void __netdev_adjacent_dev_remove(struct net_device *dev, struct net_device *adj_dev, u16 ref_nr, struct list_head *dev_list) { struct netdev_adjacent *adj; pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", dev->name, adj_dev->name, ref_nr); adj = __netdev_find_adj(adj_dev, dev_list); if (!adj) { pr_err("Adjacency does not exist for device %s from %s\n", dev->name, adj_dev->name); WARN_ON(1); return; } if (adj->ref_nr > ref_nr) { pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", dev->name, adj_dev->name, ref_nr, adj->ref_nr - ref_nr); adj->ref_nr -= ref_nr; return; } if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); netdev_put(adj_dev, &adj->dev_tracker); kfree_rcu(adj, rcu); } static int __netdev_adjacent_dev_link_lists(struct net_device *dev, struct net_device *upper_dev, struct list_head *up_list, struct list_head *down_list, void *private, bool master) { int ret; ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, master); if (ret) return ret; ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, false); if (ret) { __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); return ret; } return 0; } static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, struct net_device *upper_dev, u16 ref_nr, struct list_head *up_list, struct list_head *down_list) { __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); } static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device 
*upper_dev, void *private, bool master) { return __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->adj_list.upper, &upper_dev->adj_list.lower, private, master); } static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, struct net_device *upper_dev) { __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower); } static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info, struct netdev_nested_priv *priv, struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, .extack = extack, }, .upper_dev = upper_dev, .master = master, .linking = true, .upper_info = upper_info, }; struct net_device *master_dev; int ret = 0; ASSERT_RTNL(); if (dev == upper_dev) return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ if (__netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) return -EMLINK; if (!master) { if (__netdev_has_upper_dev(dev, upper_dev)) return -EEXIST; } else { master_dev = __netdev_master_upper_dev_get(dev); if (master_dev) return master_dev == upper_dev ? -EEXIST : -EBUSY; } ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) return ret; ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, master); if (ret) return ret; ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) goto rollback; __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, priv); return 0; rollback: __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; } /** * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. * On success the reference counts are adjusted and the function * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL, &priv, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); /** * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device * @upper_priv: upper device private * @upper_info: upper info to be passed down via notifier * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices * might be linked as well. The caller must hold the RTNL lock. * On a failure a negative errno code is returned. On success the reference * counts are adjusted and the function returns zero. 
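 *
 * Illustrative sketch (device names are hypothetical): a LAG-style driver
 * enslaving slave_dev under master_dev while holding RTNL; upper_priv and
 * upper_info are driver-defined and may be NULL:
 *
 *	err = netdev_master_upper_dev_link(slave_dev, master_dev,
 *					   NULL, NULL, extack);
 *	if (err)
 *		return err;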
*/ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv, upper_info, &priv, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); static void __netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev, struct netdev_nested_priv *priv) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, }, .upper_dev = upper_dev, .linking = false, }; ASSERT_RTNL(); changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, priv); } /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device * @upper_dev: new upper device * * Removes a link to device which is upper to this one. The caller must hold * the RTNL lock. */ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_TODO, .data = NULL, }; __netdev_upper_dev_unlink(dev, upper_dev, &priv); } EXPORT_SYMBOL(netdev_upper_dev_unlink); static void __netdev_adjacent_dev_set(struct net_device *upper_dev, struct net_device *lower_dev, bool val) { struct netdev_adjacent *adj; adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); if (adj) adj->ignore = val; adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); if (adj) adj->ignore = val; } static void netdev_adjacent_dev_disable(struct net_device *upper_dev, struct net_device *lower_dev) { __netdev_adjacent_dev_set(upper_dev, lower_dev, true); } static void netdev_adjacent_dev_enable(struct net_device *upper_dev, struct net_device *lower_dev) { __netdev_adjacent_dev_set(upper_dev, lower_dev, false); } int netdev_adjacent_change_prepare(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = 0, .data = NULL, }; int err; if (!new_dev) return 0; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_disable(dev, old_dev); err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, extack); if (err) { if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); return err; } return 0; } EXPORT_SYMBOL(netdev_adjacent_change_prepare); void netdev_adjacent_change_commit(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; if (!new_dev || !old_dev) return; if (new_dev == old_dev) return; netdev_adjacent_dev_enable(dev, old_dev); __netdev_upper_dev_unlink(old_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_commit); void netdev_adjacent_change_abort(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { struct netdev_nested_priv priv = { .flags = 0, .data = NULL, }; if (!new_dev) return; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); 
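	/* Tear down the new_dev -> dev link that was set up in
	 * netdev_adjacent_change_prepare(); the old adjacency, if any, was
	 * re-enabled just above.
	 */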
__netdev_upper_dev_unlink(new_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_abort); /** * netdev_bonding_info_change - Dispatch event about slave change * @dev: device * @bonding_info: info to dispatch * * Send NETDEV_BONDING_INFO to netdev notifiers with info. * The caller must hold the RTNL lock. */ void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { struct netdev_notifier_bonding_info info = { .info.dev = dev, }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); static int netdev_offload_xstats_enable_l3(struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, }; int err; int rc; dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), GFP_KERNEL); if (!dev->offload_xstats_l3) return -ENOMEM; rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, NETDEV_OFFLOAD_XSTATS_DISABLE, &info.info); err = notifier_to_errno(rc); if (err) goto free_stats; return 0; free_stats: kfree(dev->offload_xstats_l3); dev->offload_xstats_l3 = NULL; return err; } int netdev_offload_xstats_enable(struct net_device *dev, enum netdev_offload_xstats_type type, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (netdev_offload_xstats_enabled(dev, type)) return -EALREADY; switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return netdev_offload_xstats_enable_l3(dev, extack); } WARN_ON(1); return -EINVAL; } EXPORT_SYMBOL(netdev_offload_xstats_enable); static void netdev_offload_xstats_disable_l3(struct net_device *dev) { struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, }; call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, &info.info); kfree(dev->offload_xstats_l3); dev->offload_xstats_l3 = NULL; } int netdev_offload_xstats_disable(struct net_device *dev, enum netdev_offload_xstats_type type) { ASSERT_RTNL(); if (!netdev_offload_xstats_enabled(dev, type)) return -EALREADY; switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: netdev_offload_xstats_disable_l3(dev); return 0; } WARN_ON(1); return -EINVAL; } EXPORT_SYMBOL(netdev_offload_xstats_disable); static void netdev_offload_xstats_disable_all(struct net_device *dev) { netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); } static struct rtnl_hw_stats64 * netdev_offload_xstats_get_ptr(const struct net_device *dev, enum netdev_offload_xstats_type type) { switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return dev->offload_xstats_l3; } WARN_ON(1); return NULL; } bool netdev_offload_xstats_enabled(const struct net_device *dev, enum netdev_offload_xstats_type type) { ASSERT_RTNL(); return netdev_offload_xstats_get_ptr(dev, type); } EXPORT_SYMBOL(netdev_offload_xstats_enabled); struct netdev_notifier_offload_xstats_ru { bool used; }; struct netdev_notifier_offload_xstats_rd { struct rtnl_hw_stats64 stats; bool used; }; static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, const struct rtnl_hw_stats64 *src) { dest->rx_packets += src->rx_packets; dest->tx_packets += src->tx_packets; dest->rx_bytes += src->rx_bytes; dest->tx_bytes += src->tx_bytes; dest->rx_errors += src->rx_errors; dest->tx_errors += src->tx_errors; dest->rx_dropped += src->rx_dropped; dest->tx_dropped += src->tx_dropped; dest->multicast += src->multicast; } static 
int netdev_offload_xstats_get_used(struct net_device *dev, enum netdev_offload_xstats_type type, bool *p_used, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_ru report_used = {}; struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = type, .report_used = &report_used, }; int rc; WARN_ON(!netdev_offload_xstats_enabled(dev, type)); rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, &info.info); *p_used = report_used.used; return notifier_to_errno(rc); } static int netdev_offload_xstats_get_stats(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *p_stats, bool *p_used, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_rd report_delta = {}; struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = type, .report_delta = &report_delta, }; struct rtnl_hw_stats64 *stats; int rc; stats = netdev_offload_xstats_get_ptr(dev, type); if (WARN_ON(!stats)) return -EINVAL; rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, &info.info); /* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost. */ netdev_hw_stats64_add(stats, &report_delta.stats); if (p_stats) *p_stats = *stats; *p_used = report_delta.used; return notifier_to_errno(rc); } int netdev_offload_xstats_get(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *p_stats, bool *p_used, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (p_stats) return netdev_offload_xstats_get_stats(dev, type, p_stats, p_used, extack); else return netdev_offload_xstats_get_used(dev, type, p_used, extack); } EXPORT_SYMBOL(netdev_offload_xstats_get); void netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, const struct rtnl_hw_stats64 *stats) { report_delta->used = true; netdev_hw_stats64_add(&report_delta->stats, stats); } EXPORT_SYMBOL(netdev_offload_xstats_report_delta); void netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) { report_used->used = true; } EXPORT_SYMBOL(netdev_offload_xstats_report_used); void netdev_offload_xstats_push_delta(struct net_device *dev, enum netdev_offload_xstats_type type, const struct rtnl_hw_stats64 *p_stats) { struct rtnl_hw_stats64 *stats; ASSERT_RTNL(); stats = netdev_offload_xstats_get_ptr(dev, type); if (WARN_ON(!stats)) return; netdev_hw_stats64_add(stats, p_stats); } EXPORT_SYMBOL(netdev_offload_xstats_push_delta); /** * netdev_get_xmit_slave - Get the xmit slave of master device * @dev: device * @skb: The packet * @all_slaves: assume all the slaves are active * * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. * %NULL is returned if no slave is found. 
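 *
 * Illustrative sketch (names are hypothetical): resolving the slave a
 * bond-like master would use to transmit @skb:
 *
 *	rcu_read_lock();
 *	slave = netdev_get_xmit_slave(master_dev, skb, false);
 *	if (slave)
 *		netdev_dbg(master_dev, "xmit via %s\n", slave->name);
 *	rcu_read_unlock();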
*/ struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_xmit_slave) return NULL; return ops->ndo_get_xmit_slave(dev, skb, all_slaves); } EXPORT_SYMBOL(netdev_get_xmit_slave); static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, struct sock *sk) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_sk_get_lower_dev) return NULL; return ops->ndo_sk_get_lower_dev(dev, sk); } /** * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket * @dev: device * @sk: the socket * * %NULL is returned if no lower device is found. */ struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, struct sock *sk) { struct net_device *lower; lower = netdev_sk_get_lower_dev(dev, sk); while (lower) { dev = lower; lower = netdev_sk_get_lower_dev(dev, sk); } return dev; } EXPORT_SYMBOL(netdev_sk_get_lowest_dev); static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.lower); } } static void netdev_adjacent_del_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.lower); } } void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); } } void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev) { struct netdev_adjacent *lower; if (!lower_dev) return NULL; lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); if (!lower) return NULL; return lower->private; } EXPORT_SYMBOL(netdev_lower_dev_get_private); /** * netdev_lower_state_changed - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. * The caller must hold the RTNL lock. 
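 *
 * Sketch of a typical caller (names are hypothetical): a LAG driver
 * reporting the state of one of its slaves, where the payload is
 * usually a struct netdev_lag_lower_state_info:
 *
 *	struct netdev_lag_lower_state_info info = {
 *		.link_up    = true,
 *		.tx_enabled = true,
 *	};
 *
 *	netdev_lower_state_changed(slave_dev, &info);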
*/ void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info) { struct netdev_notifier_changelowerstate_info changelowerstate_info = { .info.dev = lower_dev, }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_rx_flags) ops->ndo_change_rx_flags(dev, flags); } static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); dev->flags |= IFF_PROMISC; dev->promiscuity += inc; if (dev->promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. */ if (inc < 0) dev->flags &= ~IFF_PROMISC; else { dev->promiscuity -= inc; netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } } if (dev->flags != old_flags) { netdev_info(dev, "%s promiscuous mode\n", dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { current_uid_gid(&uid, &gid); audit_log(audit_context(), GFP_ATOMIC, AUDIT_ANOM_PROMISCUOUS, "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", dev->name, (dev->flags & IFF_PROMISC), (old_flags & IFF_PROMISC), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, uid), from_kgid(&init_user_ns, gid), audit_get_sessionid(current)); } dev_change_rx_flags(dev, IFF_PROMISC); } if (notify) __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); return 0; } /** * dev_set_promiscuity - update promiscuity count on a device * @dev: device * @inc: modifier * * Add or remove promiscuity from a device. While the count in the device * remains above zero the interface remains promiscuous. Once it hits zero * the device reverts back to normal filtering operation. A negative inc * value is used to drop promiscuity on the device. * Return 0 if successful or a negative errno code on error. */ int dev_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); return err; } EXPORT_SYMBOL(dev_set_promiscuity); static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; ASSERT_RTNL(); dev->flags |= IFF_ALLMULTI; dev->allmulti += inc; if (dev->allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ if (inc < 0) dev->flags &= ~IFF_ALLMULTI; else { dev->allmulti -= inc; netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } } if (dev->flags ^ old_flags) { netdev_info(dev, "%s allmulticast mode\n", dev->flags & IFF_ALLMULTI ? "entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); if (notify) __dev_notify_flags(dev, old_flags, dev->gflags ^ old_gflags, 0, NULL); } return 0; } /** * dev_set_allmulti - update allmulti count on a device * @dev: device * @inc: modifier * * Add or remove reception of all multicast frames to a device. While the * count in the device remains above zero the interface remains listening * to all interfaces. 
Once it hits zero the device reverts back to normal * filtering operation. A negative @inc value is used to drop the counter * when releasing a resource needing all multicasts. * Return 0 if successful or a negative errno code on error. */ int dev_set_allmulti(struct net_device *dev, int inc) { return __dev_set_allmulti(dev, inc, true); } EXPORT_SYMBOL(dev_set_allmulti); /* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast * filtering it is put in promiscuous mode while unicast addresses * are present. */ void __dev_set_rx_mode(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; /* dev_open will call this function so the list will stay sane. */ if (!(dev->flags&IFF_UP)) return; if (!netif_device_present(dev)) return; if (!(dev->priv_flags & IFF_UNICAST_FLT)) { /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } if (ops->ndo_set_rx_mode) ops->ndo_set_rx_mode(dev); } void dev_set_rx_mode(struct net_device *dev) { netif_addr_lock_bh(dev); __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); } /** * dev_get_flags - get flags reported to userspace * @dev: device * * Get the combination of flag bits exported through APIs to userspace. */ unsigned int dev_get_flags(const struct net_device *dev) { unsigned int flags; flags = (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { if (netif_oper_up(dev)) flags |= IFF_RUNNING; if (netif_carrier_ok(dev)) flags |= IFF_LOWER_UP; if (netif_dormant(dev)) flags |= IFF_DORMANT; } return flags; } EXPORT_SYMBOL(dev_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; ASSERT_RTNL(); /* * Set the flags on our device. */ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | IFF_AUTOMEDIA)) | (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | IFF_ALLMULTI)); /* * Load in the correct multicast list now the flags have changed. */ if ((old_flags ^ flags) & IFF_MULTICAST) dev_change_rx_flags(dev, IFF_MULTICAST); dev_set_rx_mode(dev); /* * Have we downed the interface. We handle IFF_UP ourselves * according to user attempts to set it, rather than blindly * setting it. */ ret = 0; if ((old_flags ^ flags) & IFF_UP) { if (old_flags & IFF_UP) __dev_close(dev); else ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; unsigned int old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; if (__dev_set_promiscuity(dev, inc, false) >= 0) if (dev->flags != old_flags) dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI * is important. Some (broken) drivers set IFF_PROMISC, when * IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 
1 : -1; dev->gflags ^= IFF_ALLMULTI; __dev_set_allmulti(dev, inc, false); } return ret; } void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int gchanges, u32 portid, const struct nlmsghdr *nlh) { unsigned int changes = dev->flags ^ old_flags; if (gchanges) rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh); if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); else call_netdevice_notifiers(NETDEV_DOWN, dev); } if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { struct netdev_notifier_change_info change_info = { .info = { .dev = dev, }, .flags_changed = changes, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } /** * dev_change_flags - change device settings * @dev: device * @flags: device state flags * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. */ int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } EXPORT_SYMBOL(dev_change_flags); int __dev_set_mtu(struct net_device *dev, int new_mtu) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_mtu) return ops->ndo_change_mtu(dev, new_mtu); /* Pairs with all the lockless reads of dev->mtu in the stack */ WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(__dev_set_mtu); int dev_validate_mtu(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { /* MTU must be positive, and in range */ if (new_mtu < 0 || new_mtu < dev->min_mtu) { NL_SET_ERR_MSG(extack, "mtu less than device minimum"); return -EINVAL; } if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); return -EINVAL; } return 0; } /** * dev_set_mtu_ext - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit * @extack: netlink extended ack * * Change the maximum transfer size of the network device. */ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { int err, orig_mtu; if (new_mtu == dev->mtu) return 0; err = dev_validate_mtu(dev, new_mtu, extack); if (err) return err; if (!netif_device_present(dev)) return -ENODEV; err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); err = notifier_to_errno(err); if (err) return err; orig_mtu = dev->mtu; err = __dev_set_mtu(dev, new_mtu); if (!err) { err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, orig_mtu); err = notifier_to_errno(err); if (err) { /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes. 
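			 * The revert notification passes new_mtu as the
			 * "previous" value, so listeners observe the change
			 * back to orig_mtu as an ordinary MTU update.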
*/ __dev_set_mtu(dev, orig_mtu); call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu); } } return err; } int dev_set_mtu(struct net_device *dev, int new_mtu) { struct netlink_ext_ack extack; int err; memset(&extack, 0, sizeof(extack)); err = dev_set_mtu_ext(dev, new_mtu, &extack); if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } EXPORT_SYMBOL(dev_set_mtu); /** * dev_change_tx_queue_len - Change TX queue length of a netdevice * @dev: device * @new_len: new tx queue length */ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) { unsigned int orig_len = dev->tx_queue_len; int res; if (new_len != (unsigned int)new_len) return -ERANGE; if (new_len != orig_len) { dev->tx_queue_len = new_len; res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); res = notifier_to_errno(res); if (res) goto err_rollback; res = dev_qdisc_change_tx_queue_len(dev); if (res) goto err_rollback; } return 0; err_rollback: netdev_err(dev, "refused to change device tx_queue_len\n"); dev->tx_queue_len = orig_len; return res; } /** * dev_set_group - Change group this device belongs to * @dev: device * @new_group: group this device should belong to */ void dev_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } /** * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. * @dev: device * @addr: new address * @extack: netlink extended ack */ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack) { struct netdev_notifier_pre_changeaddr_info info = { .info.dev = dev, .info.extack = extack, .dev_addr = addr, }; int rc; rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); return notifier_to_errno(rc); } EXPORT_SYMBOL(dev_pre_changeaddr_notify); /** * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address * @extack: netlink extended ack * * Change the hardware (MAC) address of the device */ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; if (sa->sa_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); if (err) return err; if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) { err = ops->ndo_set_mac_address(dev, sa); if (err) return err; } dev->addr_assign_type = NET_ADDR_SET; call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } EXPORT_SYMBOL(dev_set_mac_address); static DECLARE_RWSEM(dev_addr_sem); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) { int ret; down_write(&dev_addr_sem); ret = dev_set_mac_address(dev, sa, extack); up_write(&dev_addr_sem); return ret; } EXPORT_SYMBOL(dev_set_mac_address_user); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { size_t size = sizeof(sa->sa_data_min); struct net_device *dev; int ret = 0; down_read(&dev_addr_sem); rcu_read_lock(); dev = dev_get_by_name_rcu(net, dev_name); if (!dev) { ret = -ENODEV; goto unlock; } if (!dev->addr_len) memset(sa->sa_data, 0, size); else memcpy(sa->sa_data, dev->dev_addr, min_t(size_t, size, dev->addr_len)); sa->sa_family = dev->type; unlock: rcu_read_unlock(); up_read(&dev_addr_sem); return ret; } EXPORT_SYMBOL(dev_get_mac_address); /** * 
dev_change_carrier - Change device carrier * @dev: device * @new_carrier: new value * * Change device carrier */ int dev_change_carrier(struct net_device *dev, bool new_carrier) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_change_carrier) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; return ops->ndo_change_carrier(dev, new_carrier); } /** * dev_get_phys_port_id - Get device physical port ID * @dev: device * @ppid: port ID * * Get device physical port ID */ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_phys_port_id) return -EOPNOTSUPP; return ops->ndo_get_phys_port_id(dev, ppid); } /** * dev_get_phys_port_name - Get device physical port name * @dev: device * @name: port name * @len: limit of bytes to copy to name * * Get device physical port name */ int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (ops->ndo_get_phys_port_name) { err = ops->ndo_get_phys_port_name(dev, name, len); if (err != -EOPNOTSUPP) return err; } return devlink_compat_phys_port_name_get(dev, name, len); } /** * dev_get_port_parent_id - Get the device's port parent identifier * @dev: network device * @ppid: pointer to a storage for the port's parent identifier * @recurse: allow/disallow recursion to lower devices * * Get the devices's port parent identifier */ int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse) { const struct net_device_ops *ops = dev->netdev_ops; struct netdev_phys_item_id first = { }; struct net_device *lower_dev; struct list_head *iter; int err; if (ops->ndo_get_port_parent_id) { err = ops->ndo_get_port_parent_id(dev, ppid); if (err != -EOPNOTSUPP) return err; } err = devlink_compat_switch_id_get(dev, ppid); if (!recurse || err != -EOPNOTSUPP) return err; netdev_for_each_lower_dev(dev, lower_dev, iter) { err = dev_get_port_parent_id(lower_dev, ppid, true); if (err) break; if (!first.id_len) first = *ppid; else if (memcmp(&first, ppid, sizeof(*ppid))) return -EOPNOTSUPP; } return err; } EXPORT_SYMBOL(dev_get_port_parent_id); /** * netdev_port_same_parent_id - Indicate if two network devices have * the same port parent identifier * @a: first network device * @b: second network device */ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) { struct netdev_phys_item_id a_id = { }; struct netdev_phys_item_id b_id = { }; if (dev_get_port_parent_id(a, &a_id, true) || dev_get_port_parent_id(b, &b_id, true)) return false; return netdev_phys_item_id_same(&a_id, &b_id); } EXPORT_SYMBOL(netdev_port_same_parent_id); static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin) { #if IS_ENABLED(CONFIG_DPLL) rtnl_lock(); dev->dpll_pin = dpll_pin; rtnl_unlock(); #endif } void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin) { WARN_ON(!dpll_pin); netdev_dpll_pin_assign(dev, dpll_pin); } EXPORT_SYMBOL(netdev_dpll_pin_set); void netdev_dpll_pin_clear(struct net_device *dev) { netdev_dpll_pin_assign(dev, NULL); } EXPORT_SYMBOL(netdev_dpll_pin_clear); /** * dev_change_proto_down - set carrier according to proto_down. 
* * @dev: device * @proto_down: new value */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; if (proto_down) netif_carrier_off(dev); else netif_carrier_on(dev); dev->proto_down = proto_down; return 0; } /** * dev_change_proto_down_reason - proto down reason * * @dev: device * @mask: proto down mask * @value: proto down value */ void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value) { int b; if (!mask) { dev->proto_down_reason = value; } else { for_each_set_bit(b, &mask, 32) { if (value & (1 << b)) dev->proto_down_reason |= BIT(b); else dev->proto_down_reason &= ~BIT(b); } } } struct bpf_xdp_link { struct bpf_link link; struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ int flags; }; static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) { if (flags & XDP_FLAGS_HW_MODE) return XDP_MODE_HW; if (flags & XDP_FLAGS_DRV_MODE) return XDP_MODE_DRV; if (flags & XDP_FLAGS_SKB_MODE) return XDP_MODE_SKB; return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB; } static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) { switch (mode) { case XDP_MODE_SKB: return generic_xdp_install; case XDP_MODE_DRV: case XDP_MODE_HW: return dev->netdev_ops->ndo_bpf; default: return NULL; } } static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, enum bpf_xdp_mode mode) { return dev->xdp_state[mode].link; } static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_xdp_link *link = dev_xdp_link(dev, mode); if (link) return link->link.prog; return dev->xdp_state[mode].prog; } u8 dev_xdp_prog_count(struct net_device *dev) { u8 count = 0; int i; for (i = 0; i < __MAX_XDP_MODE; i++) if (dev->xdp_state[i].prog || dev->xdp_state[i].link) count++; return count; } EXPORT_SYMBOL_GPL(dev_xdp_prog_count); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); return prog ? prog->aux->id : 0; } static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_xdp_link *link) { dev->xdp_state[mode].link = link; dev->xdp_state[mode].prog = NULL; } static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_prog *prog) { dev->xdp_state[mode].link = NULL; dev->xdp_state[mode].prog = prog; } static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { struct netdev_bpf xdp; int err; memset(&xdp, 0, sizeof(xdp)); xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; xdp.flags = flags; xdp.prog = prog; /* Drivers assume refcnt is already incremented (i.e, prog pointer is * "moved" into driver), so they don't increment it on their own, but * they do decrement refcnt when program is detached or replaced. * Given net_device also owns link/prog, we need to bump refcnt here * to prevent drivers from underflowing it. 
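	 * The extra reference taken below is dropped again in the error
	 * path if the driver rejects the program.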
*/ if (prog) bpf_prog_inc(prog); err = bpf_op(dev, &xdp); if (err) { if (prog) bpf_prog_put(prog); return err; } if (mode != XDP_MODE_HW) bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); return 0; } static void dev_xdp_uninstall(struct net_device *dev) { struct bpf_xdp_link *link; struct bpf_prog *prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; ASSERT_RTNL(); for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { prog = dev_xdp_prog(dev, mode); if (!prog) continue; bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) continue; WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); /* auto-detach link from net device */ link = dev_xdp_link(dev, mode); if (link) link->dev = NULL; else bpf_prog_put(prog); dev_xdp_set_link(dev, mode, NULL); } } static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog, u32 flags) { unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; struct net_device *upper; struct list_head *iter; enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err; ASSERT_RTNL(); /* either link or prog attachment, never both */ if (link && (new_prog || old_prog)) return -EINVAL; /* link supports only XDP mode flags */ if (link && (flags & ~XDP_FLAGS_MODES)) { NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); return -EINVAL; } /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ if (num_modes > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); return -EINVAL; } /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ if (!num_modes && dev_xdp_prog_count(dev) > 1) { NL_SET_ERR_MSG(extack, "More than one program loaded, unset mode is ambiguous"); return -EINVAL; } /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); return -EINVAL; } mode = dev_xdp_mode(dev, flags); /* can't replace attached link */ if (dev_xdp_link(dev, mode)) { NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); return -EBUSY; } /* don't allow if an upper device already has a program */ netdev_for_each_upper_dev_rcu(dev, upper, iter) { if (dev_xdp_prog_count(upper) > 0) { NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); return -EEXIST; } } cur_prog = dev_xdp_prog(dev, mode); /* can't replace attached prog with link */ if (link && cur_prog) { NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); return -EBUSY; } if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; } /* put effective new program into new_prog */ if (link) new_prog = link->link.prog; if (new_prog) { bool offload = mode == XDP_MODE_HW; enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB ? 
XDP_MODE_DRV : XDP_MODE_SKB; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { NL_SET_ERR_MSG(extack, "XDP program already attached"); return -EBUSY; } if (!offload && dev_xdp_prog(dev, other_mode)) { NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); return -EINVAL; } if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); return -EINVAL; } } /* don't call drivers if the effective program didn't change */ if (new_prog != cur_prog) { bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) { NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); return -EOPNOTSUPP; } err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); if (err) return err; } if (link) dev_xdp_set_link(dev, mode, link); else dev_xdp_set_prog(dev, mode, new_prog); if (cur_prog) bpf_prog_put(cur_prog); return 0; } static int dev_xdp_attach_link(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link) { return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); } static int dev_xdp_detach_link(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link) { enum bpf_xdp_mode mode; bpf_op_t bpf_op; ASSERT_RTNL(); mode = dev_xdp_mode(dev, link->flags); if (dev_xdp_link(dev, mode) != link) return -EINVAL; bpf_op = dev_xdp_bpf_op(dev, mode); WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); dev_xdp_set_link(dev, mode, NULL); return 0; } static void bpf_xdp_link_release(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); rtnl_lock(); /* if racing with net_device's tear down, xdp_link->dev might be * already NULL, in which case link was already auto-detached */ if (xdp_link->dev) { WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); xdp_link->dev = NULL; } rtnl_unlock(); } static int bpf_xdp_link_detach(struct bpf_link *link) { bpf_xdp_link_release(link); return 0; } static void bpf_xdp_link_dealloc(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); kfree(xdp_link); } static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); u32 ifindex = 0; rtnl_lock(); if (xdp_link->dev) ifindex = xdp_link->dev->ifindex; rtnl_unlock(); seq_printf(seq, "ifindex:\t%u\n", ifindex); } static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); u32 ifindex = 0; rtnl_lock(); if (xdp_link->dev) ifindex = xdp_link->dev->ifindex; rtnl_unlock(); info->xdp.ifindex = ifindex; return 0; } static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err = 0; 
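	/* Swap the program behind an already-attached XDP link in place;
	 * everything below runs under rtnl_lock(), so the update cannot
	 * race with link detach or netdev teardown.
	 */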
rtnl_lock(); /* link might have been auto-released already, so fail */ if (!xdp_link->dev) { err = -ENOLINK; goto out_unlock; } if (old_prog && link->prog != old_prog) { err = -EPERM; goto out_unlock; } old_prog = link->prog; if (old_prog->type != new_prog->type || old_prog->expected_attach_type != new_prog->expected_attach_type) { err = -EINVAL; goto out_unlock; } if (old_prog == new_prog) { /* no-op, don't disturb drivers */ bpf_prog_put(new_prog); goto out_unlock; } mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, xdp_link->flags, new_prog); if (err) goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: rtnl_unlock(); return err; } static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, .detach = bpf_xdp_link_detach, .show_fdinfo = bpf_xdp_link_show_fdinfo, .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, }; int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; struct netlink_ext_ack extack = {}; struct bpf_xdp_link *link; struct net_device *dev; int err, fd; rtnl_lock(); dev = dev_get_by_index(net, attr->link_create.target_ifindex); if (!dev) { rtnl_unlock(); return -EINVAL; } link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); link->dev = dev; link->flags = attr->link_create.flags; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto unlock; } err = dev_xdp_attach_link(dev, &extack, link); rtnl_unlock(); if (err) { link->dev = NULL; bpf_link_cleanup(&link_primer); trace_bpf_xdp_link_attach_failed(extack._msg); goto out_put_dev; } fd = bpf_link_settle(&link_primer); /* link itself doesn't hold dev's refcnt to not complicate shutdown */ dev_put(dev); return fd; unlock: rtnl_unlock(); out_put_dev: dev_put(dev); return err; } /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device * @extack: netlink extended ack * @fd: new program fd or negative value to clear * @expected_fd: old program fd that userspace expects to replace or clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags) { enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); struct bpf_prog *new_prog = NULL, *old_prog = NULL; int err; ASSERT_RTNL(); if (fd >= 0) { new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, mode != XDP_MODE_SKB); if (IS_ERR(new_prog)) return PTR_ERR(new_prog); } if (expected_fd >= 0) { old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, mode != XDP_MODE_SKB); if (IS_ERR(old_prog)) { err = PTR_ERR(old_prog); old_prog = NULL; goto err_out; } } err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); err_out: if (err && new_prog) bpf_prog_put(new_prog); if (old_prog) bpf_prog_put(old_prog); return err; } /** * dev_index_reserve() - allocate an ifindex in a namespace * @net: the applicable net namespace * @ifindex: requested ifindex, pass %0 to get one allocated * * Allocate a ifindex for a new device. Caller must either use the ifindex * to store the device (via list_netdevice()) or call dev_index_release() * to give the index up. 
* * Return: a suitable unique value for a new device interface number or -errno. */ static int dev_index_reserve(struct net *net, u32 ifindex) { int err; if (ifindex > INT_MAX) { DEBUG_NET_WARN_ON_ONCE(1); return -EINVAL; } if (!ifindex) err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, xa_limit_31b, &net->ifindex, GFP_KERNEL); else err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); if (err < 0) return err; return ifindex; } static void dev_index_release(struct net *net, int ifindex) { /* Expect only unused indexes, unlist_netdevice() removes the used */ WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); atomic_inc(&dev_net(dev)->dev_unreg_count); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, struct net_device *upper, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(upper->wanted_features & feature) && (features & feature)) { netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", &feature, upper->name); features &= ~feature; } } return features; } static void netdev_sync_lower_features(struct net_device *upper, struct net_device *lower, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(features & feature) && (lower->features & feature)) { netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", &feature, lower->name); lower->wanted_features &= ~feature; __netdev_update_features(lower); if (unlikely(lower->features & feature)) netdev_WARN(upper, "failed to disable %pNF on %s!\n", &feature, lower->name); else netdev_features_change(lower); } } } static netdev_features_t netdev_fix_features(struct net_device *dev, netdev_features_t features) { /* Fix illegal checksum combinations */ if ((features & NETIF_F_HW_CSUM) && (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { netdev_warn(dev, "mixed HW and IP checksum settings.\n"); features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } /* TSO requires that SG is present as well. */ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); features &= ~NETIF_F_ALL_TSO; } if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && !(features & NETIF_F_IP_CSUM)) { netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); features &= ~NETIF_F_TSO; features &= ~NETIF_F_TSO_ECN; } if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && !(features & NETIF_F_IPV6_CSUM)) { netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); features &= ~NETIF_F_TSO6; } /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) features &= ~NETIF_F_TSO_MANGLEID; /* TSO ECN requires that TSO is present as well. */ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; /* Software GSO depends on SG. 
*/ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); features &= ~NETIF_F_GSO; } /* GSO partial features require GSO partial be set */ if ((features & dev->gso_partial_features) && !(features & NETIF_F_GSO_PARTIAL)) { netdev_dbg(dev, "Dropping partially supported GSO features since no GSO partial.\n"); features &= ~dev->gso_partial_features; } if (!(features & NETIF_F_RXCSUM)) { /* NETIF_F_GRO_HW implies doing RXCSUM since every packet * successfully merged by hardware must also have the * checksum verified by hardware. If the user does not * want to enable RXCSUM, logically, we should disable GRO_HW. */ if (features & NETIF_F_GRO_HW) { netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); features &= ~NETIF_F_GRO_HW; } } /* LRO/HW-GRO features cannot be combined with RX-FCS */ if (features & NETIF_F_RXFCS) { if (features & NETIF_F_LRO) { netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); features &= ~NETIF_F_LRO; } if (features & NETIF_F_GRO_HW) { netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); features &= ~NETIF_F_GRO_HW; } } if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); features &= ~NETIF_F_LRO; } if (features & NETIF_F_HW_TLS_TX) { bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); bool hw_csum = features & NETIF_F_HW_CSUM; if (!ip_csum && !hw_csum) { netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); features &= ~NETIF_F_HW_TLS_TX; } } if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); features &= ~NETIF_F_HW_TLS_RX; } return features; } int __netdev_update_features(struct net_device *dev) { struct net_device *upper, *lower; netdev_features_t features; struct list_head *iter; int err = -1; ASSERT_RTNL(); features = netdev_get_wanted_features(dev); if (dev->netdev_ops->ndo_fix_features) features = dev->netdev_ops->ndo_fix_features(dev, features); /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); /* some features can't be enabled if they're off on an upper device */ netdev_for_each_upper_dev_rcu(dev, upper, iter) features = netdev_sync_upper_features(dev, upper, features); if (dev->features == features) goto sync_lower; netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", &dev->features, &features); if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); else err = 0; if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); /* return non-0 since some features might have changed and * it's better to fire a spurious notification than miss it */ return -1; } sync_lower: /* some features must be disabled on lower devices when disabled * on an upper device (think: bonding master or bridge) */ netdev_for_each_lower_dev(dev, lower, iter) netdev_sync_lower_features(dev, lower, features); if (!err) { netdev_features_t diff = features ^ dev->features; if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { /* udp_tunnel_{get,drop}_rx_info both need * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the * device, or they won't do anything. 
* Thus we need to update dev->features * *before* calling udp_tunnel_get_rx_info, * but *after* calling udp_tunnel_drop_rx_info. */ if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { dev->features = features; udp_tunnel_get_rx_info(dev); } else { udp_tunnel_drop_rx_info(dev); } } if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { dev->features = features; err |= vlan_get_rx_ctag_filter_info(dev); } else { vlan_drop_rx_ctag_filter_info(dev); } } if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { if (features & NETIF_F_HW_VLAN_STAG_FILTER) { dev->features = features; err |= vlan_get_rx_stag_filter_info(dev); } else { vlan_drop_rx_stag_filter_info(dev); } } dev->features = features; } return err < 0 ? 0 : 1; } /** * netdev_update_features - recalculate device features * @dev: the device to check * * Recalculate dev->features set and send notifications if it * has changed. Should be called after driver or hardware dependent * conditions might have changed that influence the features. */ void netdev_update_features(struct net_device *dev) { if (__netdev_update_features(dev)) netdev_features_change(dev); } EXPORT_SYMBOL(netdev_update_features); /** * netdev_change_features - recalculate device features * @dev: the device to check * * Recalculate dev->features set and send notifications even * if they have not changed. Should be called instead of * netdev_update_features() if also dev->vlan_features might * have changed to allow the changes to be propagated to stacked * VLAN devices. */ void netdev_change_features(struct net_device *dev) { __netdev_update_features(dev); netdev_features_change(dev); } EXPORT_SYMBOL(netdev_change_features); /** * netif_stacked_transfer_operstate - transfer operstate * @rootdev: the root or lower level device to transfer state from * @dev: the device to transfer operstate to * * Transfer operational state from root to device. This is normally * called when a stacking relationship exists between the root * device and the device(a leaf device). 
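 *
 * Illustrative sketch (added commentary, not part of the original kernel-doc):
 * a hypothetical stacking driver could propagate lower-device state from a
 * netdevice notifier; foo_get_upper() is a made-up lookup helper:
 *
 *	static int foo_netdev_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *lower = netdev_notifier_info_to_dev(ptr);
 *		struct net_device *upper = foo_get_upper(lower);
 *
 *		if (upper && event == NETDEV_CHANGE)
 *			netif_stacked_transfer_operstate(lower, upper);
 *		return NOTIFY_DONE;
 *	}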
*/ void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev) { if (rootdev->operstate == IF_OPER_DORMANT) netif_dormant_on(dev); else netif_dormant_off(dev); if (rootdev->operstate == IF_OPER_TESTING) netif_testing_on(dev); else netif_testing_off(dev); if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else netif_carrier_off(dev); } EXPORT_SYMBOL(netif_stacked_transfer_operstate); static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); int err = 0; BUG_ON(count < 1); rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; dev->_rx = rx; for (i = 0; i < count; i++) { rx[i].dev = dev; /* XDP RX-queue setup */ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); if (err < 0) goto err_rxq_info; } return 0; err_rxq_info: /* Rollback successful reg's and free other resources */ while (i--) xdp_rxq_info_unreg(&rx[i].xdp_rxq); kvfree(dev->_rx); dev->_rx = NULL; return err; } static void netif_free_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ if (!dev->_rx) return; for (i = 0; i < count; i++) xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); kvfree(dev->_rx); } static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; #ifdef CONFIG_BQL dql_init(&queue->dql, HZ); #endif } static void netif_free_tx_queues(struct net_device *dev) { kvfree(dev->_tx); } static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; struct netdev_queue *tx; size_t sz = count * sizeof(*tx); if (count < 1 || count > 0xffff) return -EINVAL; tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); return 0; } void netif_tx_stop_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_stop_queue(txq); } } EXPORT_SYMBOL(netif_tx_stop_all_queues); static int netdev_do_alloc_pcpu_stats(struct net_device *dev) { void __percpu *v; /* Drivers implementing ndo_get_peer_dev must support tstat * accounting, so that skb_do_redirect() can bump the dev's * RX stats upon network namespace switch. */ if (dev->netdev_ops->ndo_get_peer_dev && dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) return -EOPNOTSUPP; switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return 0; case NETDEV_PCPU_STAT_LSTATS: v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); break; case NETDEV_PCPU_STAT_TSTATS: v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); break; case NETDEV_PCPU_STAT_DSTATS: v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); break; default: return -EINVAL; } return v ? 
0 : -ENOMEM; } static void netdev_do_free_pcpu_stats(struct net_device *dev) { switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return; case NETDEV_PCPU_STAT_LSTATS: free_percpu(dev->lstats); break; case NETDEV_PCPU_STAT_TSTATS: free_percpu(dev->tstats); break; case NETDEV_PCPU_STAT_DSTATS: free_percpu(dev->dstats); break; } } /** * register_netdevice() - register a network device * @dev: device to register * * Take a prepared network device structure and make it externally accessible. * A %NETDEV_REGISTER message is sent to the netdev notifier chain. * Callers must hold the rtnl lock - you may want register_netdev() * instead of this. */ int register_netdevice(struct net_device *dev) { int ret; struct net *net = dev_net(dev); BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < NETDEV_FEATURE_COUNT); BUG_ON(dev_boot_phase); ASSERT_RTNL(); might_sleep(); /* When net_device's are persistent, this will be fatal. */ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); ret = ethtool_check_ops(dev->ethtool_ops); if (ret) return ret; spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; ret = -ENOMEM; dev->name_node = netdev_name_node_head_alloc(dev); if (!dev->name_node) goto out; /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); if (ret) { if (ret > 0) ret = -EIO; goto err_free_name; } } if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_CTAG_FILTER) && (!dev->netdev_ops->ndo_vlan_rx_add_vid || !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); ret = -EINVAL; goto err_uninit; } ret = netdev_do_alloc_pcpu_stats(dev); if (ret) goto err_uninit; ret = dev_index_reserve(net, dev->ifindex); if (ret < 0) goto err_free_pcpu; dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; if (dev->udp_tunnel_nic_info) { dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; } dev->wanted_features = dev->features & dev->hw_features; if (!(dev->flags & IFF_LOOPBACK)) dev->hw_features |= NETIF_F_NOCACHE_COPY; /* If IPv4 TCP segmentation offload is supported we should also * allow the device to enable segmenting the frame with the option * of ignoring a static IP ID value. This doesn't enable the * feature itself but allows the user to enable it later. */ if (dev->hw_features & NETIF_F_TSO) dev->hw_features |= NETIF_F_TSO_MANGLEID; if (dev->vlan_features & NETIF_F_TSO) dev->vlan_features |= NETIF_F_TSO_MANGLEID; if (dev->mpls_features & NETIF_F_TSO) dev->mpls_features |= NETIF_F_TSO_MANGLEID; if (dev->hw_enc_features & NETIF_F_TSO) dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. */ dev->vlan_features |= NETIF_F_HIGHDMA; /* Make NETIF_F_SG inheritable to tunnel devices. */ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; /* Make NETIF_F_SG inheritable to MPLS. */ dev->mpls_features |= NETIF_F_SG; ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) goto err_ifindex_release; ret = netdev_register_kobject(dev); write_lock(&dev_base_lock); dev->reg_state = ret ? 
NETREG_UNREGISTERED : NETREG_REGISTERED; write_unlock(&dev_base_lock); if (ret) goto err_uninit_notify; __netdev_update_features(dev); /* * Default initial state at registry is that the * device is present. */ set_bit(__LINK_STATE_PRESENT, &dev->state); linkwatch_init_dev(dev); dev_init_scheduler(dev); netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); list_netdevice(dev); add_device_randomness(dev->dev_addr, dev->addr_len); /* If the device has permanent device address, driver should * set dev_addr and also addr_assign_type should be set to * NET_ADDR_PERM (default value). */ if (dev->addr_assign_type == NET_ADDR_PERM) memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); if (ret) { /* Expect explicit free_netdev() on failure */ dev->needs_free_netdev = false; unregister_netdevice_queue(dev, NULL); goto out; } /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: return ret; err_uninit_notify: call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); err_ifindex_release: dev_index_release(net, dev->ifindex); err_free_pcpu: netdev_do_free_pcpu_stats(dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (dev->priv_destructor) dev->priv_destructor(dev); err_free_name: netdev_name_node_free(dev->name_node); goto out; } EXPORT_SYMBOL(register_netdevice); /** * init_dummy_netdev - init a dummy network device for NAPI * @dev: device to init * * This takes a network device structure and initialize the minimum * amount of fields so it can be used to schedule NAPI polls without * registering a full blown interface. This is to be used by drivers * that need to tie several hardware interfaces to a single NAPI * poll scheduler due to HW limitations. */ int init_dummy_netdev(struct net_device *dev) { /* Clear everything. Note we don't initialize spinlocks * are they aren't supposed to be taken by any of the * NAPI code and this dummy netdev is supposed to be * only ever used for NAPI polls */ memset(dev, 0, sizeof(struct net_device)); /* make sure we BUG if trying to hit standard * register/unregister code path */ dev->reg_state = NETREG_DUMMY; /* NAPI wants this */ INIT_LIST_HEAD(&dev->napi_list); /* a dummy interface is started by default */ set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); /* napi_busy_loop stats accounting wants this */ dev_net_set(dev, &init_net); /* Note : We dont allocate pcpu_refcnt for dummy devices, * because users of this 'device' dont need to change * its refcount. */ return 0; } EXPORT_SYMBOL_GPL(init_dummy_netdev); /** * register_netdev - register a network device * @dev: device to register * * Take a completed network device structure and add it to the kernel * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier * chain. 0 is returned on success. A negative errno code is returned * on a failure to set up the device, or if the name is a duplicate. * * This is a wrapper around register_netdevice that takes the rtnl semaphore * and expands the device name if you passed a format string to * alloc_netdev. 
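 *
 * Illustrative sketch (added commentary, not part of the original kernel-doc):
 * a minimal probe path in a hypothetical driver would pair this with
 * alloc_etherdev() and free_netdev(); struct foo_priv and foo_netdev_ops are
 * made-up names:
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &foo_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}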
*/ int register_netdev(struct net_device *dev) { int err; if (rtnl_lock_killable()) return -EINTR; err = register_netdevice(dev); rtnl_unlock(); return err; } EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { #ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; #else return refcount_read(&dev->dev_refcnt); #endif } EXPORT_SYMBOL(netdev_refcnt_read); int netdev_unregister_timeout_secs __read_mostly = 10; #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** * netdev_wait_allrefs_any - wait until all references are gone. * @list: list of net_devices to wait on * * This is called when unregistering network devices. * * Any protocol or device that holds a reference should register * for netdevice notification, and cleanup and put back the * reference if they receive an UNREGISTER event. * We can get stuck here if buggy protocols don't correctly * call dev_put. */ static struct net_device *netdev_wait_allrefs_any(struct list_head *list) { unsigned long rebroadcast_time, warning_time; struct net_device *dev; int wait = 0; rebroadcast_time = warning_time = jiffies; list_for_each_entry(dev, list, todo_list) if (netdev_refcnt_read(dev) == 1) return dev; while (true) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ list_for_each_entry(dev, list, todo_list) call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); list_for_each_entry(dev, list, todo_list) if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events * pending on unregister. If this * happens, we simply run the queue * unscheduled, resulting in a noop * for this device. */ linkwatch_run_queue(); break; } __rtnl_unlock(); rebroadcast_time = jiffies; } if (!wait) { rcu_barrier(); wait = WAIT_REFS_MIN_MSECS; } else { msleep(wait); wait = min(wait << 1, WAIT_REFS_MAX_MSECS); } list_for_each_entry(dev, list, todo_list) if (netdev_refcnt_read(dev) == 1) return dev; if (time_after(jiffies, warning_time + READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { list_for_each_entry(dev, list, todo_list) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, netdev_refcnt_read(dev)); ref_tracker_dir_print(&dev->refcnt_tracker, 10); } warning_time = jiffies; } } } /* The sequence is: * * rtnl_lock(); * ... * register_netdevice(x1); * register_netdevice(x2); * ... * unregister_netdevice(y1); * unregister_netdevice(y2); * ... * rtnl_unlock(); * free_netdev(y1); * free_netdev(y2); * * We are invoked by rtnl_unlock(). * This allows us to deal with problems: * 1) We can delete sysfs objects which invoke hotplug * without deadlocking with linkwatch via keventd. * 2) Since we run with the RTNL semaphore not held, we can sleep * safely in order to wait for the netdev refcnt to drop to zero. * * We must not return until all unregister events added during * the interval the lock was held have been completed. 
*/ void netdev_run_todo(void) { struct net_device *dev, *tmp; struct list_head list; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; list_replace_init(&net_unlink_list, &unlink_list); while (!list_empty(&unlink_list)) { struct net_device *dev = list_first_entry(&unlink_list, struct net_device, unlink_list); list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } #endif /* Snapshot list, allow later requests */ list_replace_init(&net_todo_list, &list); __rtnl_unlock(); /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); list_for_each_entry_safe(dev, tmp, &list, todo_list) { if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { netdev_WARN(dev, "run_todo but not unregistering\n"); list_del(&dev->todo_list); continue; } write_lock(&dev_base_lock); dev->reg_state = NETREG_UNREGISTERED; write_unlock(&dev_base_lock); linkwatch_sync_dev(dev); } while (!list_empty(&list)) { dev = netdev_wait_allrefs_any(&list); list_del(&dev->todo_list); /* paranoia */ BUG_ON(netdev_refcnt_read(dev) != 1); BUG_ON(!list_empty(&dev->ptype_all)); BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); netdev_do_free_pcpu_stats(dev); if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) free_netdev(dev); if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) wake_up(&netdev_unregistering_wq); /* Free network device */ kobject_put(&dev->dev.kobj); } } /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has * all the same fields in the same order as net_device_stats, with only * the type differing, but rtnl_link_stats64 may have additional fields * at the end for newer counters. */ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); const atomic_long_t *src = (atomic_long_t *)netdev_stats; u64 *dst = (u64 *)stats64; BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) dst[i] = (unsigned long)atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); } EXPORT_SYMBOL(netdev_stats_to_stats64); static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc( struct net_device *dev) { struct net_device_core_stats __percpu *p; p = alloc_percpu_gfp(struct net_device_core_stats, GFP_ATOMIC | __GFP_NOWARN); if (p && cmpxchg(&dev->core_stats, NULL, p)) free_percpu(p); /* This READ_ONCE() pairs with the cmpxchg() above */ return READ_ONCE(dev->core_stats); } noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset) { /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); unsigned long __percpu *field; if (unlikely(!p)) { p = netdev_core_stats_alloc(dev); if (!p) return; } field = (__force unsigned long __percpu *)((__force void *)p + offset); this_cpu_inc(*field); } EXPORT_SYMBOL_GPL(netdev_core_stats_inc); /** * dev_get_stats - get network device statistics * @dev: device to get statistics from * @storage: place to store stats * * Get network statistics from device. Return @storage. * The device driver may provide its own method by setting * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; * otherwise the internal statistics structure is used. 
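 *
 * Illustrative sketch (added commentary, not part of the original kernel-doc):
 * callers typically pass a stack buffer and read the totals back, e.g.:
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: rx %llu tx %llu packets\n",
 *		dev->name, stats.rx_packets, stats.tx_packets);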
*/ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage) { const struct net_device_ops *ops = dev->netdev_ops; const struct net_device_core_stats __percpu *p; if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); } else { netdev_stats_to_stats64(storage, &dev->stats); } /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ p = READ_ONCE(dev->core_stats); if (p) { const struct net_device_core_stats *core_stats; int i; for_each_possible_cpu(i) { core_stats = per_cpu_ptr(p, i); storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); } } return storage; } EXPORT_SYMBOL(dev_get_stats); /** * dev_fetch_sw_netstats - get per-cpu network device statistics * @s: place to store stats * @netstats: per-cpu network stats to read from * * Read per-cpu network statistics and populate the related fields in @s. */ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats) { int cpu; for_each_possible_cpu(cpu) { u64 rx_packets, rx_bytes, tx_packets, tx_bytes; const struct pcpu_sw_netstats *stats; unsigned int start; stats = per_cpu_ptr(netstats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); rx_packets = u64_stats_read(&stats->rx_packets); rx_bytes = u64_stats_read(&stats->rx_bytes); tx_packets = u64_stats_read(&stats->tx_packets); tx_bytes = u64_stats_read(&stats->tx_bytes); } while (u64_stats_fetch_retry(&stats->syncp, start)); s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; s->tx_packets += tx_packets; s->tx_bytes += tx_bytes; } } EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); /** * dev_get_tstats64 - ndo_get_stats64 implementation * @dev: device to get statistics from * @s: place to store stats * * Populate @s from dev->stats and dev->tstats. Can be used as * ndo_get_stats64() callback. */ void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) { netdev_stats_to_stats64(s, &dev->stats); dev_fetch_sw_netstats(s, dev->tstats); } EXPORT_SYMBOL_GPL(dev_get_tstats64); struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); #ifdef CONFIG_NET_CLS_ACT if (queue) return queue; queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc); rcu_assign_pointer(dev->ingress_queue, queue); #endif return queue; } static const struct ethtool_ops default_ethtool_ops; void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops) { if (dev->ethtool_ops == &default_ethtool_ops) dev->ethtool_ops = ops; } EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); /** * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default * @dev: netdev to enable the IRQ coalescing on * * Sets a conservative default for SW IRQ coalescing. Users can use * sysfs attributes to override the default values. 
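 *
 * Illustrative sketch (added commentary, not part of the original kernel-doc):
 * the WARN_ON() below implies this must run before registration, e.g. in a
 * hypothetical driver (struct foo_priv and num_queues are made-up names):
 *
 *	dev = alloc_etherdev_mq(sizeof(struct foo_priv), num_queues);
 *	if (!dev)
 *		return -ENOMEM;
 *	netdev_sw_irq_coalesce_default_on(dev);
 *	err = register_netdev(dev);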
*/ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) { WARN_ON(dev->reg_state == NETREG_REGISTERED); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { dev->gro_flush_timeout = 20000; dev->napi_defer_hard_irqs = 1; } } EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); void netdev_freemem(struct net_device *dev) { char *addr = (char *)dev - dev->padded; kvfree(addr); } /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @name_assign_type: origin of device name * @setup: callback to initialize device * @txqs: the number of TX subqueues to allocate * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { struct net_device *dev; unsigned int alloc_size; struct net_device *p; BUG_ON(strlen(name) >= sizeof(dev->name)); if (txqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); return NULL; } if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); alloc_size += sizeof_priv; } /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!p) return NULL; dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; ref_tracker_dir_init(&dev->refcnt_tracker, 128, name); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; __dev_hold(dev); #else refcount_set(&dev->dev_refcnt, 1); #endif if (dev_addr_init(dev)) goto free_pcpu; dev_mc_init(dev); dev_uc_init(dev); dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP dev->nested_level = 0; INIT_LIST_HEAD(&dev->unlink_list); #endif INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->adj_list.upper); INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); INIT_LIST_HEAD(&dev->net_notifier_list); #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); if (!dev->tx_queue_len) { dev->priv_flags |= IFF_NO_QUEUE; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; } dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) goto free_all; dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; strcpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; nf_hook_netdev_init(dev); return dev; 
free_all: free_netdev(dev); return NULL; free_pcpu: #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); free_dev: #endif netdev_freemem(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); /** * free_netdev - free network device * @dev: device * * This function does the last stage of destroying an allocated device * interface. The reference to the device object is released. If this * is the last reference then it will be freed.Must be called in process * context. */ void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; might_sleep(); /* When called immediately after register_netdevice() failed the unwind * handling may still be dismantling the device. Handle that case by * deferring the free. */ if (dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); dev->needs_free_netdev = true; return; } netif_free_tx_queues(dev); netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); /* Flush device addresses */ dev_addr_flush(dev); list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; #endif free_percpu(dev->core_stats); dev->core_stats = NULL; free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); return; } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); dev->reg_state = NETREG_RELEASED; /* will free via device release */ put_device(&dev->dev); } EXPORT_SYMBOL(free_netdev); /** * synchronize_net - Synchronize with packet receive processing * * Wait for packets currently being received to be done. * Does not block later packets from starting. */ void synchronize_net(void) { might_sleep(); if (rtnl_is_locked()) synchronize_rcu_expedited(); else synchronize_rcu(); } EXPORT_SYMBOL(synchronize_net); /** * unregister_netdevice_queue - remove device from the kernel * @dev: device * @head: list * * This function shuts down a device interface and removes it * from the kernel tables. * If head not NULL, device is queued to be unregistered later. * * Callers must hold the rtnl semaphore. You may want * unregister_netdev() instead of this. */ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) { ASSERT_RTNL(); if (head) { list_move_tail(&dev->unreg_list, head); } else { LIST_HEAD(single); list_add(&dev->unreg_list, &single); unregister_netdevice_many(&single); } } EXPORT_SYMBOL(unregister_netdevice_queue); void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { struct net_device *dev, *tmp; LIST_HEAD(close_head); BUG_ON(dev_boot_phase); ASSERT_RTNL(); if (list_empty(head)) return; list_for_each_entry_safe(dev, tmp, head, unreg_list) { /* Some devices call without registering * for initialization unwind. Remove those * devices and proceed with the remaining. */ if (dev->reg_state == NETREG_UNINITIALIZED) { pr_debug("unregister_netdevice: device %s/%p never was registered\n", dev->name, dev); WARN_ON(1); list_del(&dev->unreg_list); continue; } dev->dismantle = true; BUG_ON(dev->reg_state != NETREG_REGISTERED); } /* If device is running, close it first. */ list_for_each_entry(dev, head, unreg_list) list_add_tail(&dev->close_list, &close_head); dev_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. 
*/ write_lock(&dev_base_lock); unlist_netdevice(dev, false); dev->reg_state = NETREG_UNREGISTERING; write_unlock(&dev_base_lock); } flush_all_backlogs(); synchronize_net(); list_for_each_entry(dev, head, unreg_list) { struct sk_buff *skb = NULL; /* Shutdown queueing discipline. */ dev_shutdown(dev); dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); netdev_offload_xstats_disable_all(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, portid, nlh); /* * Flush the unicast and multicast chains */ dev_uc_flush(dev); dev_mc_flush(dev); netdev_name_node_alt_flush(dev); netdev_name_node_free(dev->name_node); call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (skb) rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); WARN_ON(netdev_has_any_lower_dev(dev)); /* Remove entries from kobject tree */ netdev_unregister_kobject(dev); #ifdef CONFIG_XPS /* Remove XPS queueing entries */ netif_reset_xps_queues_gt(dev, 0); #endif } synchronize_net(); list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); net_set_todo(dev); } list_del(head); } /** * unregister_netdevice_many - unregister many devices * @head: list of devices * * Note: As most callers use a stack allocated list_head, * we force a list_del() to make sure stack wont be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { unregister_netdevice_many_notify(head, 0, NULL); } EXPORT_SYMBOL(unregister_netdevice_many); /** * unregister_netdev - remove device from the kernel * @dev: device * * This function shuts down a device interface and removes it * from the kernel tables. * * This is just a wrapper for unregister_netdevice that takes * the rtnl semaphore. In general you want to use this and not * unregister_netdevice. */ void unregister_netdev(struct net_device *dev) { rtnl_lock(); unregister_netdevice(dev); rtnl_unlock(); } EXPORT_SYMBOL(unregister_netdev); /** * __dev_change_net_namespace - move device to different nethost namespace * @dev: device * @net: network namespace * @pat: If not NULL name pattern to try if the current device name * is already taken in the destination network namespace. * @new_ifindex: If not zero, specifies device index in the target * namespace. * * This function shuts down a device interface and moves it * to a new network namespace. On success 0 is returned, on * a failure a netagive errno code is returned. * * Callers must hold the rtnl semaphore. */ int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex) { struct netdev_name_node *name_node; struct net *net_old = dev_net(dev); char new_name[IFNAMSIZ] = {}; int err, new_nsid; ASSERT_RTNL(); /* Don't allow namespace local devices to be moved. */ err = -EINVAL; if (dev->features & NETIF_F_NETNS_LOCAL) goto out; /* Ensure the device has been registrered */ if (dev->reg_state != NETREG_REGISTERED) goto out; /* Get out if there is nothing todo */ err = 0; if (net_eq(net_old, net)) goto out; /* Pick the destination device name, and ensure * we can use it in the destination network namespace. 
*/ err = -EEXIST; if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) goto out; err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); if (err < 0) goto out; } /* Check that none of the altnames conflicts. */ err = -EEXIST; netdev_for_each_altname(dev, name_node) if (netdev_name_in_use(net, name_node->name)) goto out; /* Check that new_ifindex isn't used yet. */ if (new_ifindex) { err = dev_index_reserve(net, new_ifindex); if (err < 0) goto out; } else { /* If there is an ifindex conflict assign a new one */ err = dev_index_reserve(net, dev->ifindex); if (err == -EBUSY) err = dev_index_reserve(net, 0); if (err < 0) goto out; new_ifindex = err; } /* * And now a mini version of register_netdevice unregister_netdevice. */ /* If device is running close it first. */ dev_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev, true); synchronize_net(); /* Shutdown queueing discipline. */ dev_shutdown(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. * * Note that dev->reg_state stays at NETREG_REGISTERED. * This is wanted because this way 8021q and macvlan know * the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); /* * Flush the unicast and multicast chains */ dev_uc_flush(dev); dev_mc_flush(dev); /* Send a netdev-removed uevent to the old namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); netdev_adjacent_del_links(dev); /* Move per-net netdevice notifiers that are following the netdevice */ move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ dev_net_set(dev, net); dev->ifindex = new_ifindex; if (new_name[0]) /* Rename the netdev to prepared name */ strscpy(dev->name, new_name, IFNAMSIZ); /* Fixup kobjects */ dev_set_uevent_suppress(&dev->dev, 1); err = device_rename(&dev->dev, dev->name); dev_set_uevent_suppress(&dev->dev, 0); WARN_ON(err); /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); netdev_adjacent_add_links(dev); /* Adapt owner in case owning user namespace of target network * namespace is different from the original one. */ err = netdev_change_owner(dev, net_old, net); WARN_ON(err); /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); synchronize_net(); err = 0; out: return err; } EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { struct sk_buff **list_skb; struct sk_buff *skb; unsigned int cpu; struct softnet_data *sd, *oldsd, *remsd = NULL; local_irq_disable(); cpu = smp_processor_id(); sd = &per_cpu(softnet_data, cpu); oldsd = &per_cpu(softnet_data, oldcpu); /* Find end of our completion_queue. */ list_skb = &sd->completion_queue; while (*list_skb) list_skb = &(*list_skb)->next; /* Append completion queue from offline CPU. */ *list_skb = oldsd->completion_queue; oldsd->completion_queue = NULL; /* Append output queue from offline CPU. 
*/ if (oldsd->output_queue) { *sd->output_queue_tailp = oldsd->output_queue; sd->output_queue_tailp = oldsd->output_queue_tailp; oldsd->output_queue = NULL; oldsd->output_queue_tailp = &oldsd->output_queue; } /* Append NAPI poll list from offline CPU, with one exception : * process_backlog() must be called by cpu owning percpu backlog. * We properly handle process_queue & input_pkt_queue later. */ while (!list_empty(&oldsd->poll_list)) { struct napi_struct *napi = list_first_entry(&oldsd->poll_list, struct napi_struct, poll_list); list_del_init(&napi->poll_list); if (napi->poll == process_backlog) napi->state = 0; else ____napi_schedule(sd, napi); } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); #ifdef CONFIG_RPS remsd = oldsd->rps_ipi_list; oldsd->rps_ipi_list = NULL; #endif /* send out pending IPI's on offline CPU */ net_rps_send_ipi(remsd); /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); input_queue_head_incr(oldsd); } return 0; } /** * netdev_increment_features - increment feature set by one * @all: current feature set * @one: new feature set * @mask: mask feature set * * Computes a new feature set after adding a device with feature set * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set. */ netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { if (mask & NETIF_F_HW_CSUM) mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ if (all & NETIF_F_HW_CSUM) all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); return all; } EXPORT_SYMBOL(netdev_increment_features); static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); return hash; } /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > 8 * sizeof_field(struct napi_struct, gro_bitmask)); INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) goto err_name; net->dev_index_head = netdev_create_hash(); if (net->dev_index_head == NULL) goto err_idx; xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); return 0; err_idx: kfree(net->dev_name_head); err_name: return -ENOMEM; } /** * netdev_drivername - network driver for the device * @dev: network device * * Determine network driver for device. 
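 *
 * Illustrative sketch (added commentary, not part of the original kernel-doc):
 * typically used to identify the offending driver in diagnostics, e.g.:
 *
 *	netdev_warn(dev, "transmit timed out (driver: %s)\n",
 *		    netdev_drivername(dev));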
*/ const char *netdev_drivername(const struct net_device *dev) { const struct device_driver *driver; const struct device *parent; const char *empty = ""; parent = dev->dev.parent; if (!parent) return empty; driver = parent->driver; if (driver && driver->name) return driver->name; return empty; } static void __netdev_printk(const char *level, const struct net_device *dev, struct va_format *vaf) { if (dev && dev->dev.parent) { dev_printk_emit(level[1] - '0', dev->dev.parent, "%s %s %s%s: %pV", dev_driver_string(dev->dev.parent), dev_name(dev->dev.parent), netdev_name(dev), netdev_reg_state(dev), vaf); } else if (dev) { printk("%s%s%s: %pV", level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { printk("%s(NULL net_device): %pV", level, vaf); } } void netdev_printk(const char *level, const struct net_device *dev, const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; __netdev_printk(level, dev, &vaf); va_end(args); } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ void func(const struct net_device *dev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_netdev_printk_level(netdev_emerg, KERN_EMERG); define_netdev_printk_level(netdev_alert, KERN_ALERT); define_netdev_printk_level(netdev_crit, KERN_CRIT); define_netdev_printk_level(netdev_err, KERN_ERR); define_netdev_printk_level(netdev_warn, KERN_WARNING); define_netdev_printk_level(netdev_notice, KERN_NOTICE); define_netdev_printk_level(netdev_info, KERN_INFO); static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); xa_destroy(&net->dev_by_index); if (net != &init_net) WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } static struct pernet_operations __net_initdata netdev_net_ops = { .init = netdev_init, .exit = netdev_exit, }; static void __net_exit default_device_exit_net(struct net *net) { struct netdev_name_node *name_node, *tmp; struct net_device *dev, *aux; /* * Push all migratable network devices back to the * initial network namespace */ ASSERT_RTNL(); for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ if (dev->features & NETIF_F_NETNS_LOCAL) continue; /* Leave virtual devices for the generic cleanup */ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) continue; /* Push remaining network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (netdev_name_in_use(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); netdev_for_each_altname_safe(dev, name_node, tmp) if (netdev_name_in_use(&init_net, name_node->name)) { netdev_name_node_del(name_node); synchronize_rcu(); __netdev_name_node_alt_destroy(name_node); } err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); BUG(); } } } static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network * namespace. Do this in the reverse order of registration. * Do this across as many network namespaces as possible to * improve batching efficiency. 
*/ struct net_device *dev; struct net *net; LIST_HEAD(dev_kill_list); rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { default_device_exit_net(net); cond_resched(); } list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); } } unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); } static struct pernet_operations __net_initdata default_device_ops = { .exit_batch = default_device_exit_batch, }; static void __init net_dev_struct_check(void) { /* TX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq); #ifdef CONFIG_XPS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps); #endif #ifdef CONFIG_NETFILTER_EGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress); #endif #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress); #endif CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160); /* TXRX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 30); /* RX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, 
net_device_read_rx, nd_net); #ifdef CONFIG_NETPOLL CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo); #endif #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); #endif CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104); } /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not * present) and leaves us with a valid list of present and active devices. * */ /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. */ static int __init net_dev_init(void) { int i, rc = -ENOMEM; BUG_ON(!dev_boot_phase); net_dev_struct_check(); if (dev_proc_init()) goto out; if (netdev_kobject_init()) goto out; INIT_LIST_HEAD(&ptype_all); for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); if (register_pernet_subsys(&netdev_net_ops)) goto out; /* * Initialise the packet receive queues. */ for_each_possible_cpu(i) { struct work_struct *flush = per_cpu_ptr(&flush_works, i); struct softnet_data *sd = &per_cpu(softnet_data, i); INIT_WORK(flush, flush_backlog); skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); #ifdef CONFIG_XFRM_OFFLOAD skb_queue_head_init(&sd->xfrm_backlog); #endif INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; } dev_boot_phase = 0; /* The loopback device is special if any other network devices * is present in a network namespace the loopback device must * be present. Since we now dynamically allocate and free the * loopback device ensure this invariant is maintained by * keeping the loopback device as the first device on the * list of network devices. Ensuring the loopback devices * is the first device that appears and the last network device * that disappears. */ if (register_pernet_device(&loopback_net_ops)) goto out; if (register_pernet_device(&default_device_ops)) goto out; open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", NULL, dev_cpu_dead); WARN_ON(rc < 0); rc = 0; out: return rc; } subsys_initcall(net_dev_init);
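/*
 * Illustrative userspace sketch, added for context; it is not part of
 * net/core/dev.c above or of the nft_nat.c source that follows. The XDP
 * attach paths in dev_change_xdp_fd() and bpf_xdp_link_attach() are usually
 * driven from userspace through libbpf. Assuming a compiled object file
 * "xdp_prog.o" containing a program named "xdp_main" (both hypothetical),
 * a minimal generic-mode attach (XDP_FLAGS_SKB_MODE selects XDP_MODE_SKB in
 * dev_xdp_mode()) could look roughly like this; error unwinding such as
 * bpf_object__close() is omitted for brevity.
 */
#include <bpf/libbpf.h>
#include <linux/if_link.h>
#include <net/if.h>
#include <stdio.h>

int attach_xdp_example(const char *ifname)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	int ifindex, prog_fd, err;

	ifindex = if_nametoindex(ifname);
	if (!ifindex)
		return -1;

	obj = bpf_object__open_file("xdp_prog.o", NULL);	/* hypothetical object */
	if (!obj)
		return -1;
	if (bpf_object__load(obj))
		return -1;

	prog = bpf_object__find_program_by_name(obj, "xdp_main");	/* hypothetical name */
	if (!prog)
		return -1;
	prog_fd = bpf_program__fd(prog);

	/* Ends up in dev_xdp_attach() via the netlink or bpf_link path. */
	err = bpf_xdp_attach(ifindex, prog_fd, XDP_FLAGS_SKB_MODE, NULL);
	if (err)
		fprintf(stderr, "XDP attach failed: %d\n", err);
	return err;
}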
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> * Copyright (c) 2012 Intel Corporation */ #include <linux/module.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/string.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_tables.h> #include <net/ip.h> struct nft_nat { u8 sreg_addr_min; u8 sreg_addr_max; u8 sreg_proto_min; u8 sreg_proto_max; enum nf_nat_manip_type type:8; u8 family; u16 flags; }; static void nft_nat_setup_addr(struct nf_nat_range2 *range, const struct nft_regs *regs, const struct nft_nat *priv) { switch (priv->family) { case AF_INET: range->min_addr.ip = (__force __be32) regs->data[priv->sreg_addr_min]; range->max_addr.ip = (__force __be32) regs->data[priv->sreg_addr_max]; break; case AF_INET6: memcpy(range->min_addr.ip6, &regs->data[priv->sreg_addr_min], sizeof(range->min_addr.ip6)); memcpy(range->max_addr.ip6, &regs->data[priv->sreg_addr_max], sizeof(range->max_addr.ip6)); break; } } static void nft_nat_setup_proto(struct nf_nat_range2 *range, const struct nft_regs *regs, const struct nft_nat *priv) { range->min_proto.all = (__force __be16) nft_reg_load16(&regs->data[priv->sreg_proto_min]); range->max_proto.all = (__force __be16) nft_reg_load16(&regs->data[priv->sreg_proto_max]); } static void nft_nat_setup_netmap(struct nf_nat_range2 *range, const struct nft_pktinfo *pkt, const struct nft_nat *priv) { struct sk_buff *skb = pkt->skb; union nf_inet_addr new_addr; __be32 netmask; int i, len = 0; switch (priv->type) { case NFT_NAT_SNAT: if (nft_pf(pkt) == NFPROTO_IPV4) { new_addr.ip = ip_hdr(skb)->saddr; len = sizeof(struct in_addr); } 
else { new_addr.in6 = ipv6_hdr(skb)->saddr; len = sizeof(struct in6_addr); } break; case NFT_NAT_DNAT: if (nft_pf(pkt) == NFPROTO_IPV4) { new_addr.ip = ip_hdr(skb)->daddr; len = sizeof(struct in_addr); } else { new_addr.in6 = ipv6_hdr(skb)->daddr; len = sizeof(struct in6_addr); } break; } for (i = 0; i < len / sizeof(__be32); i++) { netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); new_addr.ip6[i] &= ~netmask; new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask; } range->min_addr = new_addr; range->max_addr = new_addr; } static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_addr_min) { nft_nat_setup_addr(&range, regs, priv); if (priv->flags & NF_NAT_RANGE_NETMAP) nft_nat_setup_netmap(&range, pkt, priv); } if (priv->sreg_proto_min) nft_nat_setup_proto(&range, regs, priv); range.flags = priv->flags; regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type); } static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_TYPE] = { .type = NLA_U32 }, [NFTA_NAT_FAMILY] = { .type = NLA_U32 }, [NFTA_NAT_REG_ADDR_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, [NFTA_NAT_FLAGS] = NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_nat_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { struct nft_nat *priv = nft_expr_priv(expr); int err; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); if (err < 0) return err; switch (priv->type) { case NFT_NAT_SNAT: err = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN)); break; case NFT_NAT_DNAT: err = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)); break; } return err; } static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_nat *priv = nft_expr_priv(expr); unsigned int alen, plen; u32 family; int err; if (tb[NFTA_NAT_TYPE] == NULL || (tb[NFTA_NAT_REG_ADDR_MIN] == NULL && tb[NFTA_NAT_REG_PROTO_MIN] == NULL)) return -EINVAL; switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) { case NFT_NAT_SNAT: priv->type = NF_NAT_MANIP_SRC; break; case NFT_NAT_DNAT: priv->type = NF_NAT_MANIP_DST; break; default: return -EOPNOTSUPP; } if (tb[NFTA_NAT_FAMILY] == NULL) return -EINVAL; family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); if (ctx->family != NFPROTO_INET && ctx->family != family) return -EOPNOTSUPP; switch (family) { case NFPROTO_IPV4: alen = sizeof_field(struct nf_nat_range, min_addr.ip); break; case NFPROTO_IPV6: alen = sizeof_field(struct nf_nat_range, min_addr.ip6); break; default: if (tb[NFTA_NAT_REG_ADDR_MIN]) return -EAFNOSUPPORT; break; } priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MIN], &priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MAX], &priv->sreg_addr_max, alen); if (err < 0) return err; } else { priv->sreg_addr_max = priv->sreg_addr_min; } priv->flags 
|= NF_NAT_RANGE_MAP_IPS; } plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN], &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) return err; } else { priv->sreg_proto_max = priv->sreg_proto_min; } priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[NFTA_NAT_FLAGS]) priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); return nf_ct_netns_get(ctx->net, family); } static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_nat *priv = nft_expr_priv(expr); switch (priv->type) { case NF_NAT_MANIP_SRC: if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT))) goto nla_put_failure; break; case NF_NAT_MANIP_DST: if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT))) goto nla_put_failure; break; } if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family))) goto nla_put_failure; if (priv->sreg_addr_min) { if (nft_dump_register(skb, NFTA_NAT_REG_ADDR_MIN, priv->sreg_addr_min) || nft_dump_register(skb, NFTA_NAT_REG_ADDR_MAX, priv->sreg_addr_max)) goto nla_put_failure; } if (priv->sreg_proto_min) { if (nft_dump_register(skb, NFTA_NAT_REG_PROTO_MIN, priv->sreg_proto_min) || nft_dump_register(skb, NFTA_NAT_REG_PROTO_MAX, priv->sreg_proto_max)) goto nla_put_failure; } if (priv->flags != 0) { if (nla_put_be32(skb, NFTA_NAT_FLAGS, htonl(priv->flags))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } static void nft_nat_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_nat *priv = nft_expr_priv(expr); nf_ct_netns_put(ctx->net, priv->family); } static struct nft_expr_type nft_nat_type; static const struct nft_expr_ops nft_nat_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_eval, .init = nft_nat_init, .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_nat_type __read_mostly = { .name = "nat", .ops = &nft_nat_ops, .policy = nft_nat_policy, .maxattr = NFTA_NAT_MAX, .owner = THIS_MODULE, }; #ifdef CONFIG_NF_TABLES_INET static void nft_nat_inet_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); if (priv->family == nft_pf(pkt) || priv->family == NFPROTO_INET) nft_nat_eval(expr, regs, pkt); } static const struct nft_expr_ops nft_nat_inet_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_inet_eval, .init = nft_nat_init, .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_inet_nat_type __read_mostly = { .name = "nat", .family = NFPROTO_INET, .ops = &nft_nat_inet_ops, .policy = nft_nat_policy, .maxattr = NFTA_NAT_MAX, .owner = THIS_MODULE, }; static int nft_nat_inet_module_init(void) { return nft_register_expr(&nft_inet_nat_type); } static void nft_nat_inet_module_exit(void) { nft_unregister_expr(&nft_inet_nat_type); } #else static int nft_nat_inet_module_init(void) { return 0; } static void nft_nat_inet_module_exit(void) { } #endif static int __init nft_nat_module_init(void) { int ret = nft_nat_inet_module_init(); if (ret) return ret; ret = nft_register_expr(&nft_nat_type); if (ret) nft_nat_inet_module_exit(); return ret; } 
static void __exit nft_nat_module_exit(void) { nft_nat_inet_module_exit(); nft_unregister_expr(&nft_nat_type); } module_init(nft_nat_module_init); module_exit(nft_nat_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); MODULE_ALIAS_NFT_EXPR("nat"); MODULE_DESCRIPTION("Network Address Translation support");
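/*
 * Editorial note: a standalone userspace sketch (not kernel code) of the
 * per-word bit arithmetic nft_nat_setup_netmap() above applies: where the
 * configured min and max addresses agree, the prefix bits are taken from
 * the range; where they differ, the host bits of the original packet
 * address are preserved.  Addresses are plain host-order uint32_t here to
 * keep the demo byte-order free; the helper name is invented.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t netmap_one_word(uint32_t orig, uint32_t min, uint32_t max)
{
	uint32_t netmask = ~(min ^ max);	/* 1s where min and max agree */
	uint32_t mapped = orig;

	mapped &= ~netmask;			/* keep original host bits    */
	mapped |= min & netmask;		/* take prefix from the range  */
	return mapped;
}

int main(void)
{
	uint32_t orig = 0xc0a80142;	/* 192.168.1.66 */
	uint32_t min  = 0x0a000000;	/* 10.0.0.0     */
	uint32_t max  = 0x0a0000ff;	/* 10.0.0.255   */

	/* expected 0x0a000042 (10.0.0.66): prefix swapped, host byte kept */
	printf("mapped: 0x%08x\n", netmap_one_word(orig, min, max));
	return 0;
}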
/* SPDX-License-Identifier: GPL-2.0 */ /* * descriptor table internals; you almost certainly want file.h instead. */ #ifndef __LINUX_FDTABLE_H #define __LINUX_FDTABLE_H #include <linux/posix_types.h> #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/nospec.h> #include <linux/types.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/atomic.h> /* * The default fd array needs to be at least BITS_PER_LONG, * as this is the granularity returned by copy_fdset(). */ #define NR_OPEN_DEFAULT BITS_PER_LONG #define NR_OPEN_MAX ~0U struct fdtable { unsigned int max_fds; struct file __rcu **fd; /* current fd array */ unsigned long *close_on_exec; unsigned long *open_fds; unsigned long *full_fds_bits; struct rcu_head rcu; }; static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->close_on_exec); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->open_fds); } /* * Open file table structure */ struct files_struct { /* * read mostly part */ atomic_t count; bool resize_in_progress; wait_queue_head_t resize_wait; struct fdtable __rcu *fdt; struct fdtable fdtab; /* * written part on a separate cache line in SMP */ spinlock_t file_lock ____cacheline_aligned_in_smp; unsigned int next_fd; unsigned long close_on_exec_init[1]; unsigned long open_fds_init[1]; unsigned long full_fds_bits_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; struct file_operations; struct vfsmount; struct dentry; #define rcu_dereference_check_fdtable(files, fdtfd) \ rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock)) #define files_fdtable(files) \ rcu_dereference_check_fdtable((files), (files)->fdt) /* * The caller must ensure that fd table isn't shared or hold rcu or file lock */ static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds); struct file *needs_masking; /* * 'mask' is zero for an out-of-bounds fd, all ones for ok. * 'fd&mask' is 'fd' for ok, or 0 for out of bounds. * * Accessing fdt->fd[0] is ok, but needs masking of the result.
*/ needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]); return (struct file *)(mask & (unsigned long)needs_masking); } static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd) { RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock), "suspicious rcu_dereference_check() usage"); return files_lookup_fd_raw(files, fd); } struct file *lookup_fdget_rcu(unsigned int fd); struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd); struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd); struct task_struct; void put_files_struct(struct files_struct *fs); int unshare_files(void); struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), const void *); extern int close_fd(unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); extern struct file *file_close_fd(unsigned int fd); extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, struct files_struct **new_fdp); extern struct kmem_cache *files_cachep; #endif /* __LINUX_FDTABLE_H */
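/*
 * Editorial note: a self-contained userspace sketch (not kernel code) of
 * the branch-free masking idea used by files_lookup_fd_raw() above.  An
 * out-of-range index yields an all-zero mask, so both the table index and
 * the looked-up pointer collapse to 0/NULL instead of ever dereferencing
 * out of bounds.  mask_nospec_sketch() is a simplified stand-in for the
 * kernel's array_index_mask_nospec(), not its real implementation.
 */
#include <stdio.h>
#include <stdint.h>

#define TABLE_SIZE 4

static const char *table[TABLE_SIZE] = { "fd0", "fd1", "fd2", "fd3" };

/* ~0UL when idx is in range, 0UL otherwise */
static unsigned long mask_nospec_sketch(unsigned long idx, unsigned long size)
{
	return 0UL - (unsigned long)(idx < size);
}

static const char *lookup(unsigned long fd)
{
	unsigned long mask = mask_nospec_sketch(fd, TABLE_SIZE);
	const char *entry = table[fd & mask];	/* index is forced in range */

	/* out-of-range lookups return NULL rather than table[0] */
	return (const char *)(mask & (uintptr_t)entry);
}

int main(void)
{
	printf("lookup(2)  -> %s\n", lookup(2));
	printf("lookup(99) -> %s\n", lookup(99) ? lookup(99) : "(null)");
	return 0;
}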
/* SPDX-License-Identifier: GPL-2.0 * * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. * Do not use in new code. */ #ifndef _BLK_CGROUP_RWSTAT_H #define _BLK_CGROUP_RWSTAT_H #include "blk-cgroup.h" enum blkg_rwstat_type { BLKG_RWSTAT_READ, BLKG_RWSTAT_WRITE, BLKG_RWSTAT_SYNC, BLKG_RWSTAT_ASYNC, BLKG_RWSTAT_DISCARD, BLKG_RWSTAT_NR, BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, }; /* * blkg_[rw]stat->aux_cnt is excluded for local stats but included for * recursive. Used to carry stats of dead children. */ struct blkg_rwstat { struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; struct blkg_rwstat_sample { u64 cnt[BLKG_RWSTAT_NR]; }; static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, unsigned int idx) { return atomic64_read(&rwstat->aux_cnt[idx]) + percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); } int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp); void blkg_rwstat_exit(struct blkg_rwstat *rwstat); u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, const struct blkg_rwstat_sample *rwstat); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off, struct blkg_rwstat_sample *sum); /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat * @op: REQ_OP and flags * @val: value to add * * Add @val to @rwstat. The counters are chosen according to @rw. The * caller is responsible for synchronizing calls to this function. */ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, blk_opf_t opf, uint64_t val) { struct percpu_counter *cnt; if (op_is_discard(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; else if (op_is_write(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); if (op_is_sync(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_rwstat_read - read the current values of a blkg_rwstat * @rwstat: blkg_rwstat to read * * Read the current snapshot of @rwstat and return it in the aux counts. */ static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, struct blkg_rwstat_sample *result) { int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) result->cnt[i] = percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); } /** * blkg_rwstat_total - read the total count of a blkg_rwstat * @rwstat: blkg_rwstat to read * * Return the total count of @rwstat regardless of the IO direction. This * function can be called without synchronization and takes care of u64 * atomicity.
*/ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { struct blkg_rwstat_sample tmp = { }; blkg_rwstat_read(rwstat, &tmp); return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; } /** * blkg_rwstat_reset - reset a blkg_rwstat * @rwstat: blkg_rwstat to reset */ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) { percpu_counter_set(&rwstat->cpu_cnt[i], 0); atomic64_set(&rwstat->aux_cnt[i], 0); } } /** * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count * @to: the destination blkg_rwstat * @from: the source * * Add @from's count including the aux one to @to's aux count. */ static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, struct blkg_rwstat *from) { u64 sum[BLKG_RWSTAT_NR]; int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); for (i = 0; i < BLKG_RWSTAT_NR; i++) atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), &to->aux_cnt[i]); } #endif /* _BLK_CGROUP_RWSTAT_H */
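/*
 * Editorial note: a userspace sketch (not kernel code) of the accounting
 * scheme in blkg_rwstat_add() above: every IO is counted twice, once by
 * direction (read/write/discard) and once by sync/async, and the "total"
 * only sums the read and write buckets.  The flag bits and helper names
 * below are invented stand-ins for the kernel's blk_opf_t helpers.
 */
#include <stdio.h>
#include <stdint.h>

enum { RW_READ, RW_WRITE, RW_SYNC, RW_ASYNC, RW_DISCARD, RW_NR };

#define FAKE_OP_WRITE   (1u << 0)
#define FAKE_OP_DISCARD (1u << 1)
#define FAKE_OP_SYNC    (1u << 2)

static uint64_t cnt[RW_NR];

static void rwstat_add(unsigned int opf, uint64_t val)
{
	/* pick the direction bucket first ... */
	if (opf & FAKE_OP_DISCARD)
		cnt[RW_DISCARD] += val;
	else if (opf & FAKE_OP_WRITE)
		cnt[RW_WRITE] += val;
	else
		cnt[RW_READ] += val;

	/* ... then the sync/async bucket, so each IO lands in two counters */
	if (opf & FAKE_OP_SYNC)
		cnt[RW_SYNC] += val;
	else
		cnt[RW_ASYNC] += val;
}

int main(void)
{
	rwstat_add(0, 4096);				/* async read    */
	rwstat_add(FAKE_OP_WRITE | FAKE_OP_SYNC, 8192);	/* sync write    */
	rwstat_add(FAKE_OP_DISCARD, 512);		/* async discard */

	printf("read=%llu write=%llu discard=%llu sync=%llu async=%llu total=%llu\n",
	       (unsigned long long)cnt[RW_READ],
	       (unsigned long long)cnt[RW_WRITE],
	       (unsigned long long)cnt[RW_DISCARD],
	       (unsigned long long)cnt[RW_SYNC],
	       (unsigned long long)cnt[RW_ASYNC],
	       (unsigned long long)(cnt[RW_READ] + cnt[RW_WRITE]));
	return 0;
}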
// SPDX-License-Identifier: GPL-2.0-only /* Common code for 32 and 64-bit NUMA */ #include <linux/acpi.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/of.h> #include <linux/string.h> #include <linux/init.h> #include <linux/memblock.h> #include <linux/mmzone.h> #include <linux/ctype.h> #include <linux/nodemask.h> #include <linux/sched.h> #include <linux/topology.h> #include <linux/sort.h> #include <asm/e820/api.h> #include <asm/proto.h> #include <asm/dma.h> #include <asm/amd_nb.h> #include "numa_internal.h" int numa_off; nodemask_t numa_nodes_parsed __initdata; struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); static struct numa_meminfo numa_meminfo __initdata_or_meminfo; static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; static int numa_distance_cnt; static u8 *numa_distance; static __init int numa_setup(char *opt) { if (!opt) return -EINVAL; if (!strncmp(opt, "off", 3)) numa_off = 1; if (!strncmp(opt, "fake=", 5)) return numa_emu_cmdline(opt + 5); if (!strncmp(opt, "noacpi", 6)) disable_srat(); if (!strncmp(opt, "nohmat", 6)) disable_hmat(); return 0; } early_param("numa", numa_setup); /* * apicid, cpu, node mappings */ s16 __apicid_to_node[MAX_LOCAL_APIC] = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; int numa_cpu_node(int cpu) { u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu); if (apicid != BAD_APICID) return __apicid_to_node[apicid]; return NUMA_NO_NODE; } cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); /* * Map cpu index to node index */ DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); void numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); /* early setting, no percpu area yet */ if (cpu_to_node_map) { cpu_to_node_map[cpu] = node; return; } #ifdef CONFIG_DEBUG_PER_CPU_MAPS if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); dump_stack(); return; } #endif per_cpu(x86_cpu_to_node_map, cpu) = node; set_cpu_numa_node(cpu, node); } void numa_clear_node(int cpu) { numa_set_node(cpu, NUMA_NO_NODE); } /* * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * * Note: cpumask_of_node() is not valid until after this is done. * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
*/ void __init setup_node_to_cpumask_map(void) { unsigned int node; /* setup nr_node_ids if not done yet */ if (nr_node_ids == MAX_NUMNODES) setup_nr_node_ids(); /* allocate the map */ for (node = 0; node < nr_node_ids; node++) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } static int __init numa_add_memblk_to(int nid, u64 start, u64 end, struct numa_meminfo *mi) { /* ignore zero length blks */ if (start == end) return 0; /* whine about and ignore invalid blks */ if (start > end || nid < 0 || nid >= MAX_NUMNODES) { pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", nid, start, end - 1); return 0; } if (mi->nr_blks >= NR_NODE_MEMBLKS) { pr_err("too many memblk ranges\n"); return -EINVAL; } mi->blk[mi->nr_blks].start = start; mi->blk[mi->nr_blks].end = end; mi->blk[mi->nr_blks].nid = nid; mi->nr_blks++; return 0; } /** * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo * @idx: Index of memblk to remove * @mi: numa_meminfo to remove memblk from * * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and * decrementing @mi->nr_blks. */ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) { mi->nr_blks--; memmove(&mi->blk[idx], &mi->blk[idx + 1], (mi->nr_blks - idx) * sizeof(mi->blk[0])); } /** * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another * @dst: numa_meminfo to append block to * @idx: Index of memblk to remove * @src: numa_meminfo to remove memblk from */ static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, struct numa_meminfo *src) { dst->blk[dst->nr_blks++] = src->blk[idx]; numa_remove_memblk_from(idx, src); } /** * numa_add_memblk - Add one numa_memblk to numa_meminfo * @nid: NUMA node ID of the new memblk * @start: Start address of the new memblk * @end: End address of the new memblk * * Add a new memblk to the default numa_meminfo. * * RETURNS: * 0 on success, -errno on failure. */ int __init numa_add_memblk(int nid, u64 start, u64 end) { return numa_add_memblk_to(nid, start, end, &numa_meminfo); } /* Allocate NODE_DATA for a node on the local memory */ static void __init alloc_node_data(int nid) { const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); u64 nd_pa; void *nd; int tnid; /* * Allocate node data. Try node-local memory and then any node. * Never allocate in DMA zone. */ nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); if (!nd_pa) { pr_err("Cannot find %zu bytes in any node (initial node: %d)\n", nd_size, nid); return; } nd = __va(nd_pa); /* report and initialize */ printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, nd_pa, nd_pa + nd_size - 1); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); if (tnid != nid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); node_data[nid] = nd; memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); node_set_online(nid); } /** * numa_cleanup_meminfo - Cleanup a numa_meminfo * @mi: numa_meminfo to clean up * * Sanitize @mi by merging and removing unnecessary memblks. Also check for * conflicts and clear unused memblks. * * RETURNS: * 0 on success, -errno on failure. 
*/ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) { const u64 low = 0; const u64 high = PFN_PHYS(max_pfn); int i, j, k; /* first, trim all entries */ for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; /* move / save reserved memory ranges */ if (!memblock_overlaps_region(&memblock.memory, bi->start, bi->end - bi->start)) { numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi); continue; } /* make sure all non-reserved blocks are inside the limits */ bi->start = max(bi->start, low); /* preserve info for non-RAM areas above 'max_pfn': */ if (bi->end > high) { numa_add_memblk_to(bi->nid, high, bi->end, &numa_reserved_meminfo); bi->end = high; } /* and there's no empty block */ if (bi->start >= bi->end) numa_remove_memblk_from(i--, mi); } /* merge neighboring / overlapping entries */ for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; for (j = i + 1; j < mi->nr_blks; j++) { struct numa_memblk *bj = &mi->blk[j]; u64 start, end; /* * See whether there are overlapping blocks. Whine * about but allow overlaps of the same nid. They * will be merged below. */ if (bi->end > bj->start && bi->start < bj->end) { if (bi->nid != bj->nid) { pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", bi->nid, bi->start, bi->end - 1, bj->nid, bj->start, bj->end - 1); return -EINVAL; } pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", bi->nid, bi->start, bi->end - 1, bj->start, bj->end - 1); } /* * Join together blocks on the same node, holes * between which don't overlap with memory on other * nodes. */ if (bi->nid != bj->nid) continue; start = min(bi->start, bj->start); end = max(bi->end, bj->end); for (k = 0; k < mi->nr_blks; k++) { struct numa_memblk *bk = &mi->blk[k]; if (bi->nid == bk->nid) continue; if (start < bk->end && end > bk->start) break; } if (k < mi->nr_blks) continue; printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n", bi->nid, bi->start, bi->end - 1, bj->start, bj->end - 1, start, end - 1); bi->start = start; bi->end = end; numa_remove_memblk_from(j--, mi); } } /* clear unused ones */ for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { mi->blk[i].start = mi->blk[i].end = 0; mi->blk[i].nid = NUMA_NO_NODE; } return 0; } /* * Set nodes, which have memory in @mi, in *@nodemask. */ static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, const struct numa_meminfo *mi) { int i; for (i = 0; i < ARRAY_SIZE(mi->blk); i++) if (mi->blk[i].start != mi->blk[i].end && mi->blk[i].nid != NUMA_NO_NODE) node_set(mi->blk[i].nid, *nodemask); } /** * numa_reset_distance - Reset NUMA distance table * * The current table is freed. The next numa_set_distance() call will * create a new one. 
*/ void __init numa_reset_distance(void) { size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) memblock_free(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; /* enable table creation */ } static int __init numa_alloc_distance(void) { nodemask_t nodes_parsed; size_t size; int i, j, cnt = 0; u64 phys; /* size the new table and allocate it */ nodes_parsed = numa_nodes_parsed; numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); for_each_node_mask(i, nodes_parsed) cnt = i; cnt++; size = cnt * cnt * sizeof(numa_distance[0]); phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn_mapped)); if (!phys) { pr_warn("Warning: can't allocate distance table!\n"); /* don't retry until explicitly reset */ numa_distance = (void *)1LU; return -ENOMEM; } numa_distance = __va(phys); numa_distance_cnt = cnt; /* fill with the default distances */ for (i = 0; i < cnt; i++) for (j = 0; j < cnt; j++) numa_distance[i * cnt + j] = i == j ? LOCAL_DISTANCE : REMOTE_DISTANCE; printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); return 0; } /** * numa_set_distance - Set NUMA distance from one NUMA to another * @from: the 'from' node to set distance * @to: the 'to' node to set distance * @distance: NUMA distance * * Set the distance from node @from to @to to @distance. If distance table * doesn't exist, one which is large enough to accommodate all the currently * known nodes will be created. * * If such table cannot be allocated, a warning is printed and further * calls are ignored until the distance table is reset with * numa_reset_distance(). * * If @from or @to is higher than the highest known node or lower than zero * at the time of table creation or @distance doesn't make sense, the call * is ignored. * This is to allow simplification of specific NUMA config implementations. */ void __init numa_set_distance(int from, int to, int distance) { if (!numa_distance && numa_alloc_distance() < 0) return; if (from >= numa_distance_cnt || to >= numa_distance_cnt || from < 0 || to < 0) { pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n", from, to, distance); return; } if ((u8)distance != distance || (from == to && distance != LOCAL_DISTANCE)) { pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n", from, to, distance); return; } numa_distance[from * numa_distance_cnt + to] = distance; } int __node_distance(int from, int to) { if (from >= numa_distance_cnt || to >= numa_distance_cnt) return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; return numa_distance[from * numa_distance_cnt + to]; } EXPORT_SYMBOL(__node_distance); /* * Mark all currently memblock-reserved physical memory (which covers the * kernel's own memory ranges) as hot-unswappable. */ static void __init numa_clear_kernel_node_hotplug(void) { nodemask_t reserved_nodemask = NODE_MASK_NONE; struct memblock_region *mb_region; int i; /* * We have to do some preprocessing of memblock regions, to * make them suitable for reservation. * * At this time, all memory regions reserved by memblock are * used by the kernel, but those regions are not split up * along node boundaries yet, and don't necessarily have their * node ID set yet either. * * So iterate over all memory known to the x86 architecture, * and use those ranges to set the nid in memblock.reserved. 
* This will split up the memblock regions along node * boundaries and will set the node IDs as well. */ for (i = 0; i < numa_meminfo.nr_blks; i++) { struct numa_memblk *mb = numa_meminfo.blk + i; int ret; ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); WARN_ON_ONCE(ret); } /* * Now go over all reserved memblock regions, to construct a * node mask of all kernel reserved memory areas. * * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, * numa_meminfo might not include all memblock.reserved * memory ranges, because quirks such as trim_snb_memory() * reserve specific pages for Sandy Bridge graphics. ] */ for_each_reserved_mem_region(mb_region) { int nid = memblock_get_region_node(mb_region); if (nid != MAX_NUMNODES) node_set(nid, reserved_nodemask); } /* * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory * belonging to the reserved node mask. * * Note that this will include memory regions that reside * on nodes that contain kernel memory - entire nodes * become hot-unpluggable: */ for (i = 0; i < numa_meminfo.nr_blks; i++) { struct numa_memblk *mb = numa_meminfo.blk + i; if (!node_isset(mb->nid, reserved_nodemask)) continue; memblock_clear_hotplug(mb->start, mb->end - mb->start); } } static int __init numa_register_memblks(struct numa_meminfo *mi) { int i, nid; /* Account for nodes with cpus and no memory */ node_possible_map = numa_nodes_parsed; numa_nodemask_from_meminfo(&node_possible_map, mi); if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *mb = &mi->blk[i]; memblock_set_node(mb->start, mb->end - mb->start, &memblock.memory, mb->nid); } /* * At very early time, the kernel have to use some memory such as * loading the kernel image. We cannot prevent this anyway. So any * node the kernel resides in should be un-hotpluggable. * * And when we come here, alloc node data won't fail. */ numa_clear_kernel_node_hotplug(); /* * If sections array is gonna be used for pfn -> nid mapping, check * whether its granularity is fine enough. */ if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) { unsigned long pfn_align = node_map_pfn_alignment(); if (pfn_align && pfn_align < PAGES_PER_SECTION) { pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", PFN_PHYS(pfn_align) >> 20, PFN_PHYS(PAGES_PER_SECTION) >> 20); return -EINVAL; } } if (!memblock_validate_numa_coverage(SZ_1M)) return -EINVAL; /* Finally register nodes. */ for_each_node_mask(nid, node_possible_map) { u64 start = PFN_PHYS(max_pfn); u64 end = 0; for (i = 0; i < mi->nr_blks; i++) { if (nid != mi->blk[i].nid) continue; start = min(mi->blk[i].start, start); end = max(mi->blk[i].end, end); } if (start >= end) continue; alloc_node_data(nid); } /* Dump memblock with node info and return. */ memblock_dump_all(); return 0; } /* * There are unfortunately some poorly designed mainboards around that * only connect memory to a single CPU. This breaks the 1:1 cpu->node * mapping. To avoid this fill in the mapping for all possible CPUs, * as the number of CPUs is not known yet. We round robin the existing * nodes. 
*/ static void __init numa_init_array(void) { int rr, i; rr = first_node(node_online_map); for (i = 0; i < nr_cpu_ids; i++) { if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); rr = next_node_in(rr, node_online_map); } } static int __init numa_init(int (*init_func)(void)) { int i; int ret; for (i = 0; i < MAX_LOCAL_APIC; i++) set_apicid_to_node(i, NUMA_NO_NODE); nodes_clear(numa_nodes_parsed); nodes_clear(node_possible_map); nodes_clear(node_online_map); memset(&numa_meminfo, 0, sizeof(numa_meminfo)); WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, MAX_NUMNODES)); WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, MAX_NUMNODES)); /* In case that parsing SRAT failed. */ WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); numa_reset_distance(); ret = init_func(); if (ret < 0) return ret; /* * We reset memblock back to the top-down direction * here because if we configured ACPI_NUMA, we have * parsed SRAT in init_func(). It is ok to have the * reset here even if we did't configure ACPI_NUMA * or acpi numa init fails and fallbacks to dummy * numa init. */ memblock_set_bottom_up(false); ret = numa_cleanup_meminfo(&numa_meminfo); if (ret < 0) return ret; numa_emulation(&numa_meminfo, numa_distance_cnt); ret = numa_register_memblks(&numa_meminfo); if (ret < 0) return ret; for (i = 0; i < nr_cpu_ids; i++) { int nid = early_cpu_to_node(i); if (nid == NUMA_NO_NODE) continue; if (!node_online(nid)) numa_clear_node(i); } numa_init_array(); return 0; } /** * dummy_numa_init - Fallback dummy NUMA init * * Used if there's no underlying NUMA architecture, NUMA initialization * fails, or NUMA is disabled on the command line. * * Must online at least one node and add memory blocks that cover all * allowed memory. This function must not fail. */ static int __init dummy_numa_init(void) { printk(KERN_INFO "%s\n", numa_off ? "NUMA turned off" : "No NUMA configuration found"); printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n", 0LLU, PFN_PHYS(max_pfn) - 1); node_set(0, numa_nodes_parsed); numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); return 0; } /** * x86_numa_init - Initialize NUMA * * Try each configured NUMA initialization method until one succeeds. The * last fallback is dummy single node config encompassing whole memory and * never fails. */ void __init x86_numa_init(void) { if (!numa_off) { #ifdef CONFIG_ACPI_NUMA if (!numa_init(x86_acpi_numa_init)) return; #endif #ifdef CONFIG_AMD_NUMA if (!numa_init(amd_numa_init)) return; #endif if (acpi_disabled && !numa_init(of_numa_init)) return; } numa_init(dummy_numa_init); } /* * A node may exist which has one or more Generic Initiators but no CPUs and no * memory. * * This function must be called after init_cpu_to_node(), to ensure that any * memoryless CPU nodes have already been brought online, and before the * node_data[nid] is needed for zone list setup in build_all_zonelists(). * * When this function is called, any nodes containing either memory and/or CPUs * will already be online and there is no need to do anything extra, even if * they also contain one or more Generic Initiators. */ void __init init_gi_nodes(void) { int nid; /* * Exclude this node from * bringup_nonboot_cpus * cpu_up * __try_online_node * register_one_node * because node_subsys is not initialized yet. * TODO remove dependency on node_online */ for_each_node_state(nid, N_GENERIC_INITIATOR) if (!node_online(nid)) node_set_online(nid); } /* * Setup early cpu_to_node. 
* * Populate cpu_to_node[] only if x86_cpu_to_apicid[], * and apicid_to_node[] tables have valid entries for a CPU. * This means we skip cpu_to_node[] initialisation for NUMA * emulation and faking node case (when running a kernel compiled * for NUMA on a non NUMA box), which is OK as cpu_to_node[] * is already initialized in a round robin manner at numa_init_array, * prior to this call, and this initialization is good enough * for the fake NUMA cases. * * Called before the per_cpu areas are setup. */ void __init init_cpu_to_node(void) { int cpu; u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); BUG_ON(cpu_to_apicid == NULL); for_each_possible_cpu(cpu) { int node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE) continue; /* * Exclude this node from * bringup_nonboot_cpus * cpu_up * __try_online_node * register_one_node * because node_subsys is not initialized yet. * TODO remove dependency on node_online */ if (!node_online(node)) node_set_online(node); numa_set_node(cpu, node); } } #ifndef CONFIG_DEBUG_PER_CPU_MAPS # ifndef CONFIG_NUMA_EMU void numa_add_cpu(int cpu) { cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } void numa_remove_cpu(int cpu) { cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } # endif /* !CONFIG_NUMA_EMU */ #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ int __cpu_to_node(int cpu) { if (early_per_cpu_ptr(x86_cpu_to_node_map)) { printk(KERN_WARNING "cpu_to_node(%d): usage too early!\n", cpu); dump_stack(); return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; } return per_cpu(x86_cpu_to_node_map, cpu); } EXPORT_SYMBOL(__cpu_to_node); /* * Same function as cpu_to_node() but used if called before the * per_cpu areas are setup. */ int early_cpu_to_node(int cpu) { if (early_per_cpu_ptr(x86_cpu_to_node_map)) return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; if (!cpu_possible(cpu)) { printk(KERN_WARNING "early_cpu_to_node(%d): no per_cpu area!\n", cpu); dump_stack(); return NUMA_NO_NODE; } return per_cpu(x86_cpu_to_node_map, cpu); } void debug_cpumask_set_cpu(int cpu, int node, bool enable) { struct cpumask *mask; if (node == NUMA_NO_NODE) { /* early_cpu_to_node() already emits a warning and trace */ return; } mask = node_to_cpumask_map[node]; if (!cpumask_available(mask)) { pr_err("node_to_cpumask_map[%i] NULL\n", node); dump_stack(); return; } if (enable) cpumask_set_cpu(cpu, mask); else cpumask_clear_cpu(cpu, mask); printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n", enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, cpumask_pr_args(mask)); return; } # ifndef CONFIG_NUMA_EMU static void numa_set_cpumask(int cpu, bool enable) { debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); } void numa_add_cpu(int cpu) { numa_set_cpumask(cpu, true); } void numa_remove_cpu(int cpu) { numa_set_cpumask(cpu, false); } # endif /* !CONFIG_NUMA_EMU */ /* * Returns a pointer to the bitmask of CPUs on Node 'node'. 
*/ const struct cpumask *cpumask_of_node(int node) { if ((unsigned)node >= nr_node_ids) { printk(KERN_WARNING "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n", node, nr_node_ids); dump_stack(); return cpu_none_mask; } if (!cpumask_available(node_to_cpumask_map[node])) { printk(KERN_WARNING "cpumask_of_node(%d): no node_to_cpumask_map!\n", node); dump_stack(); return cpu_online_mask; } return node_to_cpumask_map[node]; } EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ #ifdef CONFIG_NUMA_KEEP_MEMINFO static int meminfo_to_nid(struct numa_meminfo *mi, u64 start) { int i; for (i = 0; i < mi->nr_blks; i++) if (mi->blk[i].start <= start && mi->blk[i].end > start) return mi->blk[i].nid; return NUMA_NO_NODE; } int phys_to_target_node(phys_addr_t start) { int nid = meminfo_to_nid(&numa_meminfo, start); /* * Prefer online nodes, but if reserved memory might be * hot-added continue the search with reserved ranges. */ if (nid != NUMA_NO_NODE) return nid; return meminfo_to_nid(&numa_reserved_meminfo, start); } EXPORT_SYMBOL_GPL(phys_to_target_node); int memory_add_physaddr_to_nid(u64 start) { int nid = meminfo_to_nid(&numa_meminfo, start); if (nid == NUMA_NO_NODE) nid = numa_meminfo.blk[0].nid; return nid; } EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); static int __init cmp_memblk(const void *a, const void *b) { const struct numa_memblk *ma = *(const struct numa_memblk **)a; const struct numa_memblk *mb = *(const struct numa_memblk **)b; return ma->start - mb->start; } static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; /** * numa_fill_memblks - Fill gaps in numa_meminfo memblks * @start: address to begin fill * @end: address to end fill * * Find and extend numa_meminfo memblks to cover the @start-@end * physical address range, such that the first memblk includes * @start, the last memblk includes @end, and any gaps in between * are filled. * * RETURNS: * 0 : Success * NUMA_NO_MEMBLK : No memblk exists in @start-@end range */ int __init numa_fill_memblks(u64 start, u64 end) { struct numa_memblk **blk = &numa_memblk_list[0]; struct numa_meminfo *mi = &numa_meminfo; int count = 0; u64 prev_end; /* * Create a list of pointers to numa_meminfo memblks that * overlap start, end. Exclude (start == bi->end) since * end addresses in both a CFMWS range and a memblk range * are exclusive. * * This list of pointers is used to make in-place changes * that fill out the numa_meminfo memblks. */ for (int i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; if (start < bi->end && end >= bi->start) { blk[count] = &mi->blk[i]; count++; } } if (!count) return NUMA_NO_MEMBLK; /* Sort the list of pointers in memblk->start order */ sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL); /* Make sure the first/last memblks include start/end */ blk[0]->start = min(blk[0]->start, start); blk[count - 1]->end = max(blk[count - 1]->end, end); /* * Fill any gaps by tracking the previous memblks * end address and backfilling to it if needed. */ prev_end = blk[0]->end; for (int i = 1; i < count; i++) { struct numa_memblk *curr = blk[i]; if (prev_end >= curr->start) { if (prev_end < curr->end) prev_end = curr->end; } else { curr->start = prev_end; prev_end = curr->end; } } return 0; } #endif
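/*
 * Editorial note: a standalone userspace sketch (not kernel code) of the
 * gap-filling walk used by numa_fill_memblks() above: after sorting the
 * overlapping blocks by start address, the first/last blocks are stretched
 * to the requested range and every interior gap is closed by pulling the
 * next block's start back to the previous block's end.  Names are invented.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct blk { uint64_t start, end; };

static int cmp_start(const void *a, const void *b)
{
	const struct blk *ba = a, *bb = b;

	return (ba->start > bb->start) - (ba->start < bb->start);
}

static void fill_gaps(struct blk *blk, int count, uint64_t start, uint64_t end)
{
	qsort(blk, count, sizeof(*blk), cmp_start);

	/* make sure the first/last blocks include start/end */
	if (blk[0].start > start)
		blk[0].start = start;
	if (blk[count - 1].end < end)
		blk[count - 1].end = end;

	/* backfill: each block must start no later than the previous end */
	uint64_t prev_end = blk[0].end;
	for (int i = 1; i < count; i++) {
		if (prev_end >= blk[i].start) {
			if (prev_end < blk[i].end)
				prev_end = blk[i].end;
		} else {
			blk[i].start = prev_end;
			prev_end = blk[i].end;
		}
	}
}

int main(void)
{
	struct blk blk[] = {
		{ 0x4000, 0x6000 },
		{ 0x1000, 0x2000 },	/* gap between 0x2000 and 0x4000 */
	};

	fill_gaps(blk, 2, 0x0000, 0x8000);
	for (int i = 0; i < 2; i++)
		printf("blk%d: [%#llx-%#llx)\n", i,
		       (unsigned long long)blk[i].start,
		       (unsigned long long)blk[i].end);
	return 0;
}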
// SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/core/file.c * * (C) Copyright Linus Torvalds 1999 * (C) Copyright Johannes Erdfelt 1999-2001 * (C) Copyright Andreas Gal 1999 * (C) Copyright Gregory P. Smith 1999 * (C) Copyright Deti Fliegl 1999 (new USB architecture) * (C) Copyright Randy Dunlap 2000 * (C) Copyright David Brownell 2000-2001 (kernel hotplug, usb_device_id, * more docs, etc) * (C) Copyright Yggdrasil Computing, Inc. 2000 * (usb_device_id matching changes by Adam J. Richter) * (C) Copyright Greg Kroah-Hartman 2002-2003 * * Released under the GPLv2 only. */ #include <linux/module.h> #include <linux/errno.h> #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/usb.h> #include "usb.h" #define MAX_USB_MINORS 256 static const struct file_operations *usb_minors[MAX_USB_MINORS]; static DECLARE_RWSEM(minor_rwsem); static int usb_open(struct inode *inode, struct file *file) { int err = -ENODEV; const struct file_operations *new_fops; down_read(&minor_rwsem); new_fops = fops_get(usb_minors[iminor(inode)]); if (!new_fops) goto done; replace_fops(file, new_fops); /* Curiouser and curiouser... NULL ->open() as "no device" ? */ if (file->f_op->open) err = file->f_op->open(inode, file); done: up_read(&minor_rwsem); return err; } static const struct file_operations usb_fops = { .owner = THIS_MODULE, .open = usb_open, .llseek = noop_llseek, }; static char *usb_devnode(const struct device *dev, umode_t *mode) { struct usb_class_driver *drv; drv = dev_get_drvdata(dev); if (!drv || !drv->devnode) return NULL; return drv->devnode(dev, mode); } const struct class usbmisc_class = { .name = "usbmisc", .devnode = usb_devnode, }; int usb_major_init(void) { int error; error = register_chrdev(USB_MAJOR, "usb", &usb_fops); if (error) printk(KERN_ERR "Unable to get major %d for usb devices\n", USB_MAJOR); return error; } void usb_major_cleanup(void) { unregister_chrdev(USB_MAJOR, "usb"); } /** * usb_register_dev - register a USB device, and ask for a minor number * @intf: pointer to the usb_interface that is being registered * @class_driver: pointer to the usb_class_driver for this device * * This should be called by all USB drivers that use the USB major number. * If CONFIG_USB_DYNAMIC_MINORS is enabled, the minor number will be * dynamically allocated out of the list of available ones. If it is not * enabled, the minor number will be based on the next available free minor, * starting at the class_driver->minor_base. * * This function also creates a usb class device in the sysfs tree. * * usb_deregister_dev() must be called when the driver is done with * the minor numbers given out by this function. * * Return: -EINVAL if something bad happens with trying to register a * device, and 0 on success.
*/ int usb_register_dev(struct usb_interface *intf, struct usb_class_driver *class_driver) { int retval = 0; int minor_base = class_driver->minor_base; int minor; char name[20]; #ifdef CONFIG_USB_DYNAMIC_MINORS /* * We don't care what the device tries to start at, we want to start * at zero to pack the devices into the smallest available space with * no holes in the minor range. */ minor_base = 0; #endif if (class_driver->fops == NULL) return -EINVAL; if (intf->minor >= 0) return -EADDRINUSE; dev_dbg(&intf->dev, "looking for a minor, starting at %d\n", minor_base); down_write(&minor_rwsem); for (minor = minor_base; minor < MAX_USB_MINORS; ++minor) { if (usb_minors[minor]) continue; usb_minors[minor] = class_driver->fops; intf->minor = minor; break; } if (intf->minor < 0) { up_write(&minor_rwsem); return -EXFULL; } /* create a usb class device for this usb interface */ snprintf(name, sizeof(name), class_driver->name, minor - minor_base); intf->usb_dev = device_create(&usbmisc_class, &intf->dev, MKDEV(USB_MAJOR, minor), class_driver, "%s", kbasename(name)); if (IS_ERR(intf->usb_dev)) { usb_minors[minor] = NULL; intf->minor = -1; retval = PTR_ERR(intf->usb_dev); } up_write(&minor_rwsem); return retval; } EXPORT_SYMBOL_GPL(usb_register_dev); /** * usb_deregister_dev - deregister a USB device's dynamic minor. * @intf: pointer to the usb_interface that is being deregistered * @class_driver: pointer to the usb_class_driver for this device * * Used in conjunction with usb_register_dev(). This function is called * when the USB driver is finished with the minor numbers gotten from a * call to usb_register_dev() (usually when the device is disconnected * from the system.) * * This function also removes the usb class device from the sysfs tree. * * This should be called by all drivers that use the USB major number. */ void usb_deregister_dev(struct usb_interface *intf, struct usb_class_driver *class_driver) { if (intf->minor == -1) return; dev_dbg(&intf->dev, "removing %d minor\n", intf->minor); device_destroy(&usbmisc_class, MKDEV(USB_MAJOR, intf->minor)); down_write(&minor_rwsem); usb_minors[intf->minor] = NULL; up_write(&minor_rwsem); intf->usb_dev = NULL; intf->minor = -1; } EXPORT_SYMBOL_GPL(usb_deregister_dev);
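/*
 * Editorial note: a userspace sketch (not kernel code) of the minor-number
 * allocation strategy in usb_register_dev() above: scan a fixed-size table
 * from the requested base for the first free slot, claim it, and remember
 * the slot index as the minor.  Locking (the kernel's minor_rwsem) is left
 * out; the names below are invented for the illustration.
 */
#include <stdio.h>

#define FAKE_MAX_MINORS 16

static const void *fake_minors[FAKE_MAX_MINORS];

/* returns the allocated minor, or -1 when the table is full */
static int fake_register_dev(const void *fops, int minor_base)
{
	for (int minor = minor_base; minor < FAKE_MAX_MINORS; minor++) {
		if (fake_minors[minor])
			continue;	/* slot already claimed */
		fake_minors[minor] = fops;
		return minor;
	}
	return -1;
}

static void fake_deregister_dev(int minor)
{
	if (minor >= 0 && minor < FAKE_MAX_MINORS)
		fake_minors[minor] = NULL;
}

int main(void)
{
	static int dummy_fops_a, dummy_fops_b;
	int a = fake_register_dev(&dummy_fops_a, 8);
	int b = fake_register_dev(&dummy_fops_b, 8);	/* gets the next slot */

	printf("a got minor %d, b got minor %d\n", a, b);
	fake_deregister_dev(a);
	fake_deregister_dev(b);
	return 0;
}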
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* af_can.c - Protocol family CAN core module * (used by different CAN protocol modules) * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE.
* */ #include <linux/module.h> #include <linux/stddef.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/uaccess.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_ether.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/can-ml.h> #include <linux/ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> #include "af_can.h" MODULE_DESCRIPTION("Controller Area Network PF_CAN core"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>, " "Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS_NETPROTO(PF_CAN); static int stats_timer __read_mostly = 1; module_param(stats_timer, int, 0444); MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)"); static struct kmem_cache *rcv_cache __read_mostly; /* table of registered CAN protocols */ static const struct can_proto __rcu *proto_tab[CAN_NPROTO] __read_mostly; static DEFINE_MUTEX(proto_tab_lock); static atomic_t skbcounter = ATOMIC_INIT(0); /* af_can socket functions */ void can_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_error_queue); } EXPORT_SYMBOL(can_sock_destruct); static const struct can_proto *can_get_proto(int protocol) { const struct can_proto *cp; rcu_read_lock(); cp = rcu_dereference(proto_tab[protocol]); if (cp && !try_module_get(cp->prot->owner)) cp = NULL; rcu_read_unlock(); return cp; } static inline void can_put_proto(const struct can_proto *cp) { module_put(cp->prot->owner); } static int can_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; const struct can_proto *cp; int err = 0; sock->state = SS_UNCONNECTED; if (protocol < 0 || protocol >= CAN_NPROTO) return -EINVAL; cp = can_get_proto(protocol); #ifdef CONFIG_MODULES if (!cp) { /* try to load protocol module if kernel is modular */ err = request_module("can-proto-%d", protocol); /* In case of error we only print a message but don't * return the error code immediately. Below we will * return -EPROTONOSUPPORT */ if (err) pr_err_ratelimited("can: request_module (can-proto-%d) failed.\n", protocol); cp = can_get_proto(protocol); } #endif /* check for available protocol and correct usage */ if (!cp) return -EPROTONOSUPPORT; if (cp->type != sock->type) { err = -EPROTOTYPE; goto errout; } sock->ops = cp->ops; sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern); if (!sk) { err = -ENOMEM; goto errout; } sock_init_data(sock, sk); sk->sk_destruct = can_sock_destruct; if (sk->sk_prot->init) err = sk->sk_prot->init(sk); if (err) { /* release sk on errors */ sock_orphan(sk); sock_put(sk); } errout: can_put_proto(cp); return err; } /* af_can tx path */ /** * can_send - transmit a CAN frame (optional with local loopback) * @skb: pointer to socket buffer with CAN frame in data section * @loop: loopback for listeners on local CAN sockets (recommended default!) * * Due to the loopback this routine must not be called from hardirq context. 
* * Return: * 0 on success * -ENETDOWN when the selected interface is down * -ENOBUFS on full driver queue (see net_xmit_errno()) * -ENOMEM when local loopback failed at calling skb_clone() * -EPERM when trying to send on a non-CAN interface * -EMSGSIZE CAN frame size is bigger than CAN interface MTU * -EINVAL when the skb->data does not contain a valid CAN frame */ int can_send(struct sk_buff *skb, int loop) { struct sk_buff *newskb = NULL; struct can_pkg_stats *pkg_stats = dev_net(skb->dev)->can.pkg_stats; int err = -EINVAL; if (can_is_canxl_skb(skb)) { skb->protocol = htons(ETH_P_CANXL); } else if (can_is_can_skb(skb)) { skb->protocol = htons(ETH_P_CAN); } else if (can_is_canfd_skb(skb)) { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; skb->protocol = htons(ETH_P_CANFD); /* set CAN FD flag for CAN FD frames by default */ cfd->flags |= CANFD_FDF; } else { goto inval_skb; } /* Make sure the CAN frame can pass the selected CAN netdevice. */ if (unlikely(skb->len > skb->dev->mtu)) { err = -EMSGSIZE; goto inval_skb; } if (unlikely(skb->dev->type != ARPHRD_CAN)) { err = -EPERM; goto inval_skb; } if (unlikely(!(skb->dev->flags & IFF_UP))) { err = -ENETDOWN; goto inval_skb; } skb->ip_summed = CHECKSUM_UNNECESSARY; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); if (loop) { /* local loopback of sent CAN frames */ /* indication for the CAN driver: do loopback */ skb->pkt_type = PACKET_LOOPBACK; /* The reference to the originating sock may be required * by the receiving socket to check whether the frame is * its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS * Therefore we have to ensure that skb->sk remains the * reference to the originating sock by restoring skb->sk * after each skb_clone() or skb_orphan() usage. */ if (!(skb->dev->flags & IFF_ECHO)) { /* If the interface is not capable to do loopback * itself, we do it here. */ newskb = skb_clone(skb, GFP_ATOMIC); if (!newskb) { kfree_skb(skb); return -ENOMEM; } can_skb_set_owner(newskb, skb->sk); newskb->ip_summed = CHECKSUM_UNNECESSARY; newskb->pkt_type = PACKET_BROADCAST; } } else { /* indication for the CAN driver: no loopback required */ skb->pkt_type = PACKET_HOST; } /* send to netdevice */ err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); if (err) { kfree_skb(newskb); return err; } if (newskb) netif_rx(newskb); /* update statistics */ pkg_stats->tx_frames++; pkg_stats->tx_frames_delta++; return 0; inval_skb: kfree_skb(skb); return err; } EXPORT_SYMBOL(can_send); /* af_can rx path */ static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net, struct net_device *dev) { if (dev) { struct can_ml_priv *can_ml = can_get_ml_priv(dev); return &can_ml->dev_rcv_lists; } else { return net->can.rx_alldev_list; } } /** * effhash - hash function for 29 bit CAN identifier reduction * @can_id: 29 bit CAN identifier * * Description: * To reduce the linear traversal in one linked list of _single_ EFF CAN * frame subscriptions the 29 bit identifier is mapped to 10 bits. 
* (see CAN_EFF_RCV_HASH_BITS definition) * * Return: * Hash value from 0x000 - 0x3FF ( enforced by CAN_EFF_RCV_HASH_BITS mask ) */ static unsigned int effhash(canid_t can_id) { unsigned int hash; hash = can_id; hash ^= can_id >> CAN_EFF_RCV_HASH_BITS; hash ^= can_id >> (2 * CAN_EFF_RCV_HASH_BITS); return hash & ((1 << CAN_EFF_RCV_HASH_BITS) - 1); } /** * can_rcv_list_find - determine optimal filterlist inside device filter struct * @can_id: pointer to CAN identifier of a given can_filter * @mask: pointer to CAN mask of a given can_filter * @dev_rcv_lists: pointer to the device filter struct * * Description: * Returns the optimal filterlist to reduce the filter handling in the * receive path. This function is called by service functions that need * to register or unregister a can_filter in the filter lists. * * A filter matches in general, when * * <received_can_id> & mask == can_id & mask * * so every bit set in the mask (even CAN_EFF_FLAG, CAN_RTR_FLAG) describe * relevant bits for the filter. * * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can * filter for error messages (CAN_ERR_FLAG bit set in mask). For error msg * frames there is a special filterlist and a special rx path filter handling. * * Return: * Pointer to optimal filterlist for the given can_id/mask pair. * Consistency checked mask. * Reduced can_id to have a preprocessed filter compare value. */ static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask, struct can_dev_rcv_lists *dev_rcv_lists) { canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */ /* filter for error message frames in extra filterlist */ if (*mask & CAN_ERR_FLAG) { /* clear CAN_ERR_FLAG in filter entry */ *mask &= CAN_ERR_MASK; return &dev_rcv_lists->rx[RX_ERR]; } /* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */ #define CAN_EFF_RTR_FLAGS (CAN_EFF_FLAG | CAN_RTR_FLAG) /* ensure valid values in can_mask for 'SFF only' frame filtering */ if ((*mask & CAN_EFF_FLAG) && !(*can_id & CAN_EFF_FLAG)) *mask &= (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS); /* reduce condition testing at receive time */ *can_id &= *mask; /* inverse can_id/can_mask filter */ if (inv) return &dev_rcv_lists->rx[RX_INV]; /* mask == 0 => no condition testing at receive time */ if (!(*mask)) return &dev_rcv_lists->rx[RX_ALL]; /* extra filterlists for the subscription of a single non-RTR can_id */ if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) && !(*can_id & CAN_RTR_FLAG)) { if (*can_id & CAN_EFF_FLAG) { if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS)) return &dev_rcv_lists->rx_eff[effhash(*can_id)]; } else { if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS)) return &dev_rcv_lists->rx_sff[*can_id]; } } /* default: filter via can_id/can_mask */ return &dev_rcv_lists->rx[RX_FIL]; } /** * can_rx_register - subscribe CAN frames from a specific interface * @net: the applicable net namespace * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list) * @can_id: CAN identifier (see description) * @mask: CAN mask (see description) * @func: callback function on filter match * @data: returned parameter for callback function * @ident: string for calling module identification * @sk: socket pointer (might be NULL) * * Description: * Invokes the callback function with the received sk_buff and the given * parameter 'data' on a matching receive filter. 
A filter matches, when * * <received_can_id> & mask == can_id & mask * * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can * filter for error message frames (CAN_ERR_FLAG bit set in mask). * * The provided pointer to the sk_buff is guaranteed to be valid as long as * the callback function is running. The callback function must *not* free * the given sk_buff while processing it's task. When the given sk_buff is * needed after the end of the callback function it must be cloned inside * the callback function with skb_clone(). * * Return: * 0 on success * -ENOMEM on missing cache mem to create subscription entry * -ENODEV unknown device */ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data, char *ident, struct sock *sk) { struct receiver *rcv; struct hlist_head *rcv_list; struct can_dev_rcv_lists *dev_rcv_lists; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; /* insert new receiver (dev,canid,mask) -> (func,data) */ if (dev && (dev->type != ARPHRD_CAN || !can_get_ml_priv(dev))) return -ENODEV; if (dev && !net_eq(net, dev_net(dev))) return -ENODEV; rcv = kmem_cache_alloc(rcv_cache, GFP_KERNEL); if (!rcv) return -ENOMEM; spin_lock_bh(&net->can.rcvlists_lock); dev_rcv_lists = can_dev_rcv_lists_find(net, dev); rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists); rcv->can_id = can_id; rcv->mask = mask; rcv->matches = 0; rcv->func = func; rcv->data = data; rcv->ident = ident; rcv->sk = sk; hlist_add_head_rcu(&rcv->list, rcv_list); dev_rcv_lists->entries++; rcv_lists_stats->rcv_entries++; rcv_lists_stats->rcv_entries_max = max(rcv_lists_stats->rcv_entries_max, rcv_lists_stats->rcv_entries); spin_unlock_bh(&net->can.rcvlists_lock); return 0; } EXPORT_SYMBOL(can_rx_register); /* can_rx_delete_receiver - rcu callback for single receiver entry removal */ static void can_rx_delete_receiver(struct rcu_head *rp) { struct receiver *rcv = container_of(rp, struct receiver, rcu); struct sock *sk = rcv->sk; kmem_cache_free(rcv_cache, rcv); if (sk) sock_put(sk); } /** * can_rx_unregister - unsubscribe CAN frames from a specific interface * @net: the applicable net namespace * @dev: pointer to netdevice (NULL => unsubscribe from 'all' CAN devices list) * @can_id: CAN identifier * @mask: CAN mask * @func: callback function on filter match * @data: returned parameter for callback function * * Description: * Removes subscription entry depending on given (subscription) values. */ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data) { struct receiver *rcv = NULL; struct hlist_head *rcv_list; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; struct can_dev_rcv_lists *dev_rcv_lists; if (dev && dev->type != ARPHRD_CAN) return; if (dev && !net_eq(net, dev_net(dev))) return; spin_lock_bh(&net->can.rcvlists_lock); dev_rcv_lists = can_dev_rcv_lists_find(net, dev); rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists); /* Search the receiver list for the item to delete. This should * exist, since no receiver may be unregistered that hasn't * been registered before. */ hlist_for_each_entry_rcu(rcv, rcv_list, list) { if (rcv->can_id == can_id && rcv->mask == mask && rcv->func == func && rcv->data == data) break; } /* Check for bugs in CAN protocol implementations using af_can.c: * 'rcv' will be NULL if no matching list item was found for removal. 
* As this case may potentially happen when closing a socket while * the notifier for removing the CAN netdev is running we just print * a warning here. */ if (!rcv) { pr_warn("can: receive list entry not found for dev %s, id %03X, mask %03X\n", DNAME(dev), can_id, mask); goto out; } hlist_del_rcu(&rcv->list); dev_rcv_lists->entries--; if (rcv_lists_stats->rcv_entries > 0) rcv_lists_stats->rcv_entries--; out: spin_unlock_bh(&net->can.rcvlists_lock); /* schedule the receiver item for deletion */ if (rcv) { if (rcv->sk) sock_hold(rcv->sk); call_rcu(&rcv->rcu, can_rx_delete_receiver); } } EXPORT_SYMBOL(can_rx_unregister); static inline void deliver(struct sk_buff *skb, struct receiver *rcv) { rcv->func(skb, rcv->data); rcv->matches++; } static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buff *skb) { struct receiver *rcv; int matches = 0; struct can_frame *cf = (struct can_frame *)skb->data; canid_t can_id = cf->can_id; if (dev_rcv_lists->entries == 0) return 0; if (can_id & CAN_ERR_FLAG) { /* check for error message frame entries only */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ERR], list) { if (can_id & rcv->mask) { deliver(skb, rcv); matches++; } } return matches; } /* check for unfiltered entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ALL], list) { deliver(skb, rcv); matches++; } /* check for can_id/mask entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_FIL], list) { if ((can_id & rcv->mask) == rcv->can_id) { deliver(skb, rcv); matches++; } } /* check for inverted can_id/mask entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_INV], list) { if ((can_id & rcv->mask) != rcv->can_id) { deliver(skb, rcv); matches++; } } /* check filterlists for single non-RTR can_ids */ if (can_id & CAN_RTR_FLAG) return matches; if (can_id & CAN_EFF_FLAG) { hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_eff[effhash(can_id)], list) { if (rcv->can_id == can_id) { deliver(skb, rcv); matches++; } } } else { can_id &= CAN_SFF_MASK; hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_sff[can_id], list) { deliver(skb, rcv); matches++; } } return matches; } static void can_receive(struct sk_buff *skb, struct net_device *dev) { struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = dev_net(dev); struct can_pkg_stats *pkg_stats = net->can.pkg_stats; int matches; /* update statistics */ pkg_stats->rx_frames++; pkg_stats->rx_frames_delta++; /* create non-zero unique skb identifier together with *skb */ while (!(can_skb_prv(skb)->skbcnt)) can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter); rcu_read_lock(); /* deliver the packet to sockets listening on all devices */ matches = can_rcv_filter(net->can.rx_alldev_list, skb); /* find receive list for this device */ dev_rcv_lists = can_dev_rcv_lists_find(net, dev); matches += can_rcv_filter(dev_rcv_lists, skb); rcu_read_unlock(); /* consume the skbuff allocated by the netdevice driver */ consume_skb(skb); if (matches > 0) { pkg_stats->matches++; pkg_stats->matches_delta++; } } static int can_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_can_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", dev->type, skb->len); kfree_skb(skb); return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; } static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if 
(unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canfd_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", dev->type, skb->len); kfree_skb(skb); return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; } static int canxl_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canxl_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n", dev->type, skb->len); kfree_skb(skb); return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; } /* af_can protocol functions */ /** * can_proto_register - register CAN transport protocol * @cp: pointer to CAN protocol structure * * Return: * 0 on success * -EINVAL invalid (out of range) protocol number * -EBUSY protocol already in use * -ENOBUF if proto_register() fails */ int can_proto_register(const struct can_proto *cp) { int proto = cp->protocol; int err = 0; if (proto < 0 || proto >= CAN_NPROTO) { pr_err("can: protocol number %d out of range\n", proto); return -EINVAL; } err = proto_register(cp->prot, 0); if (err < 0) return err; mutex_lock(&proto_tab_lock); if (rcu_access_pointer(proto_tab[proto])) { pr_err("can: protocol %d already registered\n", proto); err = -EBUSY; } else { RCU_INIT_POINTER(proto_tab[proto], cp); } mutex_unlock(&proto_tab_lock); if (err < 0) proto_unregister(cp->prot); return err; } EXPORT_SYMBOL(can_proto_register); /** * can_proto_unregister - unregister CAN transport protocol * @cp: pointer to CAN protocol structure */ void can_proto_unregister(const struct can_proto *cp) { int proto = cp->protocol; mutex_lock(&proto_tab_lock); BUG_ON(rcu_access_pointer(proto_tab[proto]) != cp); RCU_INIT_POINTER(proto_tab[proto], NULL); mutex_unlock(&proto_tab_lock); synchronize_rcu(); proto_unregister(cp->prot); } EXPORT_SYMBOL(can_proto_unregister); static int can_pernet_init(struct net *net) { spin_lock_init(&net->can.rcvlists_lock); net->can.rx_alldev_list = kzalloc(sizeof(*net->can.rx_alldev_list), GFP_KERNEL); if (!net->can.rx_alldev_list) goto out; net->can.pkg_stats = kzalloc(sizeof(*net->can.pkg_stats), GFP_KERNEL); if (!net->can.pkg_stats) goto out_free_rx_alldev_list; net->can.rcv_lists_stats = kzalloc(sizeof(*net->can.rcv_lists_stats), GFP_KERNEL); if (!net->can.rcv_lists_stats) goto out_free_pkg_stats; if (IS_ENABLED(CONFIG_PROC_FS)) { /* the statistics are updated every second (timer triggered) */ if (stats_timer) { timer_setup(&net->can.stattimer, can_stat_update, 0); mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); } net->can.pkg_stats->jiffies_init = jiffies; can_init_proc(net); } return 0; out_free_pkg_stats: kfree(net->can.pkg_stats); out_free_rx_alldev_list: kfree(net->can.rx_alldev_list); out: return -ENOMEM; } static void can_pernet_exit(struct net *net) { if (IS_ENABLED(CONFIG_PROC_FS)) { can_remove_proc(net); if (stats_timer) del_timer_sync(&net->can.stattimer); } kfree(net->can.rx_alldev_list); kfree(net->can.pkg_stats); kfree(net->can.rcv_lists_stats); } /* af_can module init/exit functions */ static struct packet_type can_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CAN), .func = can_rcv, }; static struct packet_type canfd_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CANFD), .func = canfd_rcv, }; static struct packet_type canxl_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CANXL), .func = canxl_rcv, }; static const struct net_proto_family 
can_family_ops = { .family = PF_CAN, .create = can_create, .owner = THIS_MODULE, }; static struct pernet_operations can_pernet_ops __read_mostly = { .init = can_pernet_init, .exit = can_pernet_exit, }; static __init int can_init(void) { int err; /* check for correct padding to be able to use the structs similarly */ BUILD_BUG_ON(offsetof(struct can_frame, len) != offsetof(struct canfd_frame, len) || offsetof(struct can_frame, data) != offsetof(struct canfd_frame, data)); pr_info("can: controller area network core\n"); rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver), 0, 0, NULL); if (!rcv_cache) return -ENOMEM; err = register_pernet_subsys(&can_pernet_ops); if (err) goto out_pernet; /* protocol register */ err = sock_register(&can_family_ops); if (err) goto out_sock; dev_add_pack(&can_packet); dev_add_pack(&canfd_packet); dev_add_pack(&canxl_packet); return 0; out_sock: unregister_pernet_subsys(&can_pernet_ops); out_pernet: kmem_cache_destroy(rcv_cache); return err; } static __exit void can_exit(void) { /* protocol unregister */ dev_remove_pack(&canxl_packet); dev_remove_pack(&canfd_packet); dev_remove_pack(&can_packet); sock_unregister(PF_CAN); unregister_pernet_subsys(&can_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(rcv_cache); } module_init(can_init); module_exit(can_exit);
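/*
 * A minimal usage sketch for the interfaces exported above.  The module
 * name "can_rx_demo", the callback, the CAN ID 0x123 and the interface
 * name "can0" are assumptions for illustration only; can_rx_register(),
 * can_rx_unregister() and the callback contract (skb valid only while the
 * callback runs) are the ones documented in this file.  Guarded by #if 0
 * because it is an illustration, not part of af_can itself.
 */
#if 0
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/can.h>
#include <linux/can/core.h>
#include <net/net_namespace.h>

static struct net_device *demo_dev;

static void can_rx_demo_cb(struct sk_buff *skb, void *data)
{
        struct can_frame *cf = (struct can_frame *)skb->data;

        /* The skb is only valid while this callback runs; use skb_clone()
         * if the frame must be kept (see can_rx_register() above).
         */
        pr_info("can_rx_demo: frame id=%03X len=%d\n", cf->can_id, cf->len);
}

static int __init can_rx_demo_init(void)
{
        int err;

        demo_dev = dev_get_by_name(&init_net, "can0");  /* assumed ifname */
        if (!demo_dev)
                return -ENODEV;

        /* Subscribe to the single SFF identifier 0x123 on can0 only. */
        err = can_rx_register(&init_net, demo_dev, 0x123, CAN_SFF_MASK,
                              can_rx_demo_cb, NULL, "can_rx_demo", NULL);
        if (err)
                dev_put(demo_dev);
        return err;
}

static void __exit can_rx_demo_exit(void)
{
        can_rx_unregister(&init_net, demo_dev, 0x123, CAN_SFF_MASK,
                          can_rx_demo_cb, NULL);
        dev_put(demo_dev);
}

module_init(can_rx_demo_init);
module_exit(can_rx_demo_exit);
MODULE_LICENSE("GPL");
#endif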
// SPDX-License-Identifier: GPL-2.0-or-later /* * Media Controller ancillary functions * * Copyright (c) 2016 Mauro Carvalho Chehab <mchehab@kernel.org> * Copyright (C) 2016 Shuah Khan <shuahkh@osg.samsung.com> * Copyright (C) 2006-2010 Nokia Corporation * Copyright (c) 2016 Intel Corporation.
*/ #include <linux/module.h> #include <linux/pci.h> #include <linux/usb.h> #include <media/media-device.h> #include <media/media-entity.h> #include <media/v4l2-fh.h> #include <media/v4l2-mc.h> #include <media/v4l2-subdev.h> #include <media/videobuf2-core.h> int v4l2_mc_create_media_graph(struct media_device *mdev) { struct media_entity *entity; struct media_entity *if_vid = NULL, *if_aud = NULL; struct media_entity *tuner = NULL, *decoder = NULL; struct media_entity *io_v4l = NULL, *io_vbi = NULL, *io_swradio = NULL; bool is_webcam = false; u32 flags; int ret, pad_sink, pad_source; if (!mdev) return 0; media_device_for_each_entity(entity, mdev) { switch (entity->function) { case MEDIA_ENT_F_IF_VID_DECODER: if_vid = entity; break; case MEDIA_ENT_F_IF_AUD_DECODER: if_aud = entity; break; case MEDIA_ENT_F_TUNER: tuner = entity; break; case MEDIA_ENT_F_ATV_DECODER: decoder = entity; break; case MEDIA_ENT_F_IO_V4L: io_v4l = entity; break; case MEDIA_ENT_F_IO_VBI: io_vbi = entity; break; case MEDIA_ENT_F_IO_SWRADIO: io_swradio = entity; break; case MEDIA_ENT_F_CAM_SENSOR: is_webcam = true; break; } } /* It should have at least one I/O entity */ if (!io_v4l && !io_vbi && !io_swradio) { dev_warn(mdev->dev, "Didn't find any I/O entity\n"); return -EINVAL; } /* * Here, webcams are modelled on a very simple way: the sensor is * connected directly to the I/O entity. All dirty details, like * scaler and crop HW are hidden. While such mapping is not enough * for mc-centric hardware, it is enough for v4l2 interface centric * PC-consumer's hardware. */ if (is_webcam) { if (!io_v4l) { dev_warn(mdev->dev, "Didn't find a MEDIA_ENT_F_IO_V4L\n"); return -EINVAL; } media_device_for_each_entity(entity, mdev) { if (entity->function != MEDIA_ENT_F_CAM_SENSOR) continue; ret = media_create_pad_link(entity, 0, io_v4l, 0, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "Failed to create a sensor link\n"); return ret; } } if (!decoder) return 0; } /* The device isn't a webcam. 
So, it should have a decoder */ if (!decoder) { dev_warn(mdev->dev, "Decoder not found\n"); return -EINVAL; } /* Link the tuner and IF video output pads */ if (tuner) { if (if_vid) { pad_source = media_get_pad_index(tuner, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_ANALOG); pad_sink = media_get_pad_index(if_vid, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_source < 0 || pad_sink < 0) { dev_warn(mdev->dev, "Couldn't get tuner and/or PLL pad(s): (%d, %d)\n", pad_source, pad_sink); return -EINVAL; } ret = media_create_pad_link(tuner, pad_source, if_vid, pad_sink, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "Couldn't create tuner->PLL link)\n"); return ret; } pad_source = media_get_pad_index(if_vid, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_ANALOG); pad_sink = media_get_pad_index(decoder, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_source < 0 || pad_sink < 0) { dev_warn(mdev->dev, "get decoder and/or PLL pad(s): (%d, %d)\n", pad_source, pad_sink); return -EINVAL; } ret = media_create_pad_link(if_vid, pad_source, decoder, pad_sink, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link PLL to decoder\n"); return ret; } } else { pad_source = media_get_pad_index(tuner, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_ANALOG); pad_sink = media_get_pad_index(decoder, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_source < 0 || pad_sink < 0) { dev_warn(mdev->dev, "couldn't get tuner and/or decoder pad(s): (%d, %d)\n", pad_source, pad_sink); return -EINVAL; } ret = media_create_pad_link(tuner, pad_source, decoder, pad_sink, MEDIA_LNK_FL_ENABLED); if (ret) return ret; } if (if_aud) { pad_source = media_get_pad_index(tuner, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_AUDIO); pad_sink = media_get_pad_index(if_aud, MEDIA_PAD_FL_SINK, PAD_SIGNAL_AUDIO); if (pad_source < 0 || pad_sink < 0) { dev_warn(mdev->dev, "couldn't get tuner and/or decoder pad(s) for audio: (%d, %d)\n", pad_source, pad_sink); return -EINVAL; } ret = media_create_pad_link(tuner, pad_source, if_aud, pad_sink, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link tuner->audio PLL\n"); return ret; } } else { if_aud = tuner; } } /* Create demod to V4L, VBI and SDR radio links */ if (io_v4l) { pad_source = media_get_pad_index(decoder, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_DV); if (pad_source < 0) { dev_warn(mdev->dev, "couldn't get decoder output pad for V4L I/O\n"); return -EINVAL; } ret = media_create_pad_link(decoder, pad_source, io_v4l, 0, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link decoder output to V4L I/O\n"); return ret; } } if (io_swradio) { pad_source = media_get_pad_index(decoder, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_DV); if (pad_source < 0) { dev_warn(mdev->dev, "couldn't get decoder output pad for SDR\n"); return -EINVAL; } ret = media_create_pad_link(decoder, pad_source, io_swradio, 0, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link decoder output to SDR\n"); return ret; } } if (io_vbi) { pad_source = media_get_pad_index(decoder, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_DV); if (pad_source < 0) { dev_warn(mdev->dev, "couldn't get decoder output pad for VBI\n"); return -EINVAL; } ret = media_create_pad_link(decoder, pad_source, io_vbi, 0, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link decoder output to VBI\n"); return ret; } } /* Create links for the media connectors */ flags = MEDIA_LNK_FL_ENABLED; media_device_for_each_entity(entity, mdev) { switch (entity->function) { case MEDIA_ENT_F_CONN_RF: if (!tuner) continue; pad_sink = media_get_pad_index(tuner, MEDIA_PAD_FL_SINK, 
PAD_SIGNAL_ANALOG); if (pad_sink < 0) { dev_warn(mdev->dev, "couldn't get tuner analog pad sink\n"); return -EINVAL; } ret = media_create_pad_link(entity, 0, tuner, pad_sink, flags); break; case MEDIA_ENT_F_CONN_SVIDEO: case MEDIA_ENT_F_CONN_COMPOSITE: pad_sink = media_get_pad_index(decoder, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_sink < 0) { dev_warn(mdev->dev, "couldn't get decoder analog pad sink\n"); return -EINVAL; } ret = media_create_pad_link(entity, 0, decoder, pad_sink, flags); break; default: continue; } if (ret) return ret; flags = 0; } return 0; } EXPORT_SYMBOL_GPL(v4l2_mc_create_media_graph); int v4l_enable_media_source(struct video_device *vdev) { struct media_device *mdev = vdev->entity.graph_obj.mdev; int ret = 0, err; if (!mdev) return 0; mutex_lock(&mdev->graph_mutex); if (!mdev->enable_source) goto end; err = mdev->enable_source(&vdev->entity, &vdev->pipe); if (err) ret = -EBUSY; end: mutex_unlock(&mdev->graph_mutex); return ret; } EXPORT_SYMBOL_GPL(v4l_enable_media_source); void v4l_disable_media_source(struct video_device *vdev) { struct media_device *mdev = vdev->entity.graph_obj.mdev; if (mdev) { mutex_lock(&mdev->graph_mutex); if (mdev->disable_source) mdev->disable_source(&vdev->entity); mutex_unlock(&mdev->graph_mutex); } } EXPORT_SYMBOL_GPL(v4l_disable_media_source); int v4l_vb2q_enable_media_source(struct vb2_queue *q) { struct v4l2_fh *fh = q->owner; if (fh && fh->vdev) return v4l_enable_media_source(fh->vdev); return 0; } EXPORT_SYMBOL_GPL(v4l_vb2q_enable_media_source); int v4l2_create_fwnode_links_to_pad(struct v4l2_subdev *src_sd, struct media_pad *sink, u32 flags) { struct fwnode_handle *endpoint; if (!(sink->flags & MEDIA_PAD_FL_SINK)) return -EINVAL; fwnode_graph_for_each_endpoint(dev_fwnode(src_sd->dev), endpoint) { struct fwnode_handle *remote_ep; int src_idx, sink_idx, ret; struct media_pad *src; src_idx = media_entity_get_fwnode_pad(&src_sd->entity, endpoint, MEDIA_PAD_FL_SOURCE); if (src_idx < 0) continue; remote_ep = fwnode_graph_get_remote_endpoint(endpoint); if (!remote_ep) continue; /* * ask the sink to verify it owns the remote endpoint, * and translate to a sink pad. */ sink_idx = media_entity_get_fwnode_pad(sink->entity, remote_ep, MEDIA_PAD_FL_SINK); fwnode_handle_put(remote_ep); if (sink_idx < 0 || sink_idx != sink->index) continue; /* * the source endpoint corresponds to one of its source pads, * the source endpoint connects to an endpoint at the sink * entity, and the sink endpoint corresponds to the sink * pad requested, so we have found an endpoint connection * that works, create the media link for it. 
*/ src = &src_sd->entity.pads[src_idx]; /* skip if link already exists */ if (media_entity_find_link(src, sink)) continue; dev_dbg(src_sd->dev, "creating link %s:%d -> %s:%d\n", src_sd->entity.name, src_idx, sink->entity->name, sink_idx); ret = media_create_pad_link(&src_sd->entity, src_idx, sink->entity, sink_idx, flags); if (ret) { dev_err(src_sd->dev, "link %s:%d -> %s:%d failed with %d\n", src_sd->entity.name, src_idx, sink->entity->name, sink_idx, ret); fwnode_handle_put(endpoint); return ret; } } return 0; } EXPORT_SYMBOL_GPL(v4l2_create_fwnode_links_to_pad); int v4l2_create_fwnode_links(struct v4l2_subdev *src_sd, struct v4l2_subdev *sink_sd) { unsigned int i; for (i = 0; i < sink_sd->entity.num_pads; i++) { struct media_pad *pad = &sink_sd->entity.pads[i]; int ret; if (!(pad->flags & MEDIA_PAD_FL_SINK)) continue; ret = v4l2_create_fwnode_links_to_pad(src_sd, pad, 0); if (ret) return ret; } return 0; } EXPORT_SYMBOL_GPL(v4l2_create_fwnode_links); /* ----------------------------------------------------------------------------- * Pipeline power management * * Entities must be powered up when part of a pipeline that contains at least * one open video device node. * * To achieve this use the entity use_count field to track the number of users. * For entities corresponding to video device nodes the use_count field stores * the users count of the node. For entities corresponding to subdevs the * use_count field stores the total number of users of all video device nodes * in the pipeline. * * The v4l2_pipeline_pm_{get, put}() functions must be called in the open() and * close() handlers of video device nodes. It increments or decrements the use * count of all subdev entities in the pipeline. * * To react to link management on powered pipelines, the link setup notification * callback updates the use count of all entities in the source and sink sides * of the link. */ /* * pipeline_pm_use_count - Count the number of users of a pipeline * @entity: The entity * * Return the total number of users of all video device nodes in the pipeline. */ static int pipeline_pm_use_count(struct media_entity *entity, struct media_graph *graph) { int use = 0; media_graph_walk_start(graph, entity); while ((entity = media_graph_walk_next(graph))) { if (is_media_entity_v4l2_video_device(entity)) use += entity->use_count; } return use; } /* * pipeline_pm_power_one - Apply power change to an entity * @entity: The entity * @change: Use count change * * Change the entity use count by @change. If the entity is a subdev update its * power state by calling the core::s_power operation when the use count goes * from 0 to != 0 or from != 0 to 0. * * Return 0 on success or a negative error code on failure. */ static int pipeline_pm_power_one(struct media_entity *entity, int change) { struct v4l2_subdev *subdev; int ret; subdev = is_media_entity_v4l2_subdev(entity) ? media_entity_to_v4l2_subdev(entity) : NULL; if (entity->use_count == 0 && change > 0 && subdev != NULL) { ret = v4l2_subdev_call(subdev, core, s_power, 1); if (ret < 0 && ret != -ENOIOCTLCMD) return ret; } entity->use_count += change; WARN_ON(entity->use_count < 0); if (entity->use_count == 0 && change < 0 && subdev != NULL) v4l2_subdev_call(subdev, core, s_power, 0); return 0; } /* * pipeline_pm_power - Apply power change to all entities in a pipeline * @entity: The entity * @change: Use count change * * Walk the pipeline to update the use count and the power state of all non-node * entities. * * Return 0 on success or a negative error code on failure. 
*/ static int pipeline_pm_power(struct media_entity *entity, int change, struct media_graph *graph) { struct media_entity *first = entity; int ret = 0; if (!change) return 0; media_graph_walk_start(graph, entity); while (!ret && (entity = media_graph_walk_next(graph))) if (is_media_entity_v4l2_subdev(entity)) ret = pipeline_pm_power_one(entity, change); if (!ret) return ret; media_graph_walk_start(graph, first); while ((first = media_graph_walk_next(graph)) && first != entity) if (is_media_entity_v4l2_subdev(first)) pipeline_pm_power_one(first, -change); return ret; } static int v4l2_pipeline_pm_use(struct media_entity *entity, unsigned int use) { struct media_device *mdev = entity->graph_obj.mdev; int change = use ? 1 : -1; int ret; mutex_lock(&mdev->graph_mutex); /* Apply use count to node. */ entity->use_count += change; WARN_ON(entity->use_count < 0); /* Apply power change to connected non-nodes. */ ret = pipeline_pm_power(entity, change, &mdev->pm_count_walk); if (ret < 0) entity->use_count -= change; mutex_unlock(&mdev->graph_mutex); return ret; } int v4l2_pipeline_pm_get(struct media_entity *entity) { return v4l2_pipeline_pm_use(entity, 1); } EXPORT_SYMBOL_GPL(v4l2_pipeline_pm_get); void v4l2_pipeline_pm_put(struct media_entity *entity) { /* Powering off entities shouldn't fail. */ WARN_ON(v4l2_pipeline_pm_use(entity, 0)); } EXPORT_SYMBOL_GPL(v4l2_pipeline_pm_put); int v4l2_pipeline_link_notify(struct media_link *link, u32 flags, unsigned int notification) { struct media_graph *graph = &link->graph_obj.mdev->pm_count_walk; struct media_entity *source = link->source->entity; struct media_entity *sink = link->sink->entity; int source_use; int sink_use; int ret = 0; source_use = pipeline_pm_use_count(source, graph); sink_use = pipeline_pm_use_count(sink, graph); if (notification == MEDIA_DEV_NOTIFY_POST_LINK_CH && !(flags & MEDIA_LNK_FL_ENABLED)) { /* Powering off entities is assumed to never fail. */ pipeline_pm_power(source, -sink_use, graph); pipeline_pm_power(sink, -source_use, graph); return 0; } if (notification == MEDIA_DEV_NOTIFY_PRE_LINK_CH && (flags & MEDIA_LNK_FL_ENABLED)) { ret = pipeline_pm_power(source, sink_use, graph); if (ret < 0) return ret; ret = pipeline_pm_power(sink, source_use, graph); if (ret < 0) pipeline_pm_power(source, -sink_use, graph); } return ret; } EXPORT_SYMBOL_GPL(v4l2_pipeline_link_notify);
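/*
 * A minimal sketch of where the pipeline PM helpers above are meant to be
 * used, per the "Pipeline power management" notes: in the open() and
 * release() handlers of a video device node.  The driver-side names
 * (my_video_open/my_video_release) are assumptions, and the sketch assumes
 * the usual driver includes (media/v4l2-dev.h, media/v4l2-fh.h).  Guarded
 * by #if 0 because it is an illustration only.
 */
#if 0
static int my_video_open(struct file *file)
{
        struct video_device *vdev = video_devdata(file);
        int ret;

        ret = v4l2_fh_open(file);
        if (ret)
                return ret;

        /* Power up every subdev in the pipeline feeding this node. */
        ret = v4l2_pipeline_pm_get(&vdev->entity);
        if (ret)
                v4l2_fh_release(file);
        return ret;
}

static int my_video_release(struct file *file)
{
        struct video_device *vdev = video_devdata(file);

        /* Drop the use count taken in open(); powering off cannot fail. */
        v4l2_pipeline_pm_put(&vdev->entity);
        return v4l2_fh_release(file);
}
#endif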
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * OSS compatible sequencer driver * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #ifndef __SEQ_OSS_DEVICE_H #define __SEQ_OSS_DEVICE_H #include <linux/time.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <sound/core.h> #include <sound/seq_oss.h> #include <sound/rawmidi.h> #include <sound/seq_kernel.h> #include <sound/info.h> #include "../seq_clientmgr.h" /* max. applications */ #define SNDRV_SEQ_OSS_MAX_CLIENTS 16 #define SNDRV_SEQ_OSS_MAX_SYNTH_DEVS 16 #define SNDRV_SEQ_OSS_MAX_MIDI_DEVS 32 /* version */ #define SNDRV_SEQ_OSS_MAJOR_VERSION 0 #define SNDRV_SEQ_OSS_MINOR_VERSION 1 #define SNDRV_SEQ_OSS_TINY_VERSION 8 #define SNDRV_SEQ_OSS_VERSION_STR "0.1.8" /* device and proc interface name */ #define SNDRV_SEQ_OSS_PROCNAME "oss" /* * type definitions */ typedef unsigned int reltime_t; typedef unsigned int abstime_t; /* * synthesizer channel information */ struct seq_oss_chinfo { int note, vel; }; /* * synthesizer information */ struct seq_oss_synthinfo { struct snd_seq_oss_arg arg; struct seq_oss_chinfo *ch; struct seq_oss_synth_sysex *sysex; int nr_voices; int opened; int is_midi; int midi_mapped; }; /* * sequencer client information */ struct seq_oss_devinfo { int index; /* application index */ int cseq; /* sequencer client number */ int port; /* sequencer port number */ int queue; /* sequencer queue number */ struct snd_seq_addr addr; /* address of this device */ int seq_mode; /* sequencer mode */ int file_mode; /* file access */ /* midi device table */ int max_mididev; /* synth device table */ int max_synthdev; struct seq_oss_synthinfo synths[SNDRV_SEQ_OSS_MAX_SYNTH_DEVS]; int synth_opened; /* output queue */ struct seq_oss_writeq *writeq; /* midi input queue */ struct seq_oss_readq *readq; /* timer */ struct seq_oss_timer *timer; }; /* * function prototypes */ /* create/delete OSS sequencer client */ int snd_seq_oss_create_client(void); int snd_seq_oss_delete_client(void); /* device file interface */ int snd_seq_oss_open(struct file *file, int level); void snd_seq_oss_release(struct seq_oss_devinfo *dp); int snd_seq_oss_ioctl(struct seq_oss_devinfo *dp, unsigned int cmd, unsigned long arg); int snd_seq_oss_read(struct seq_oss_devinfo *dev, char __user *buf, int count); int snd_seq_oss_write(struct seq_oss_devinfo *dp, const char __user *buf, int count, struct file *opt); __poll_t snd_seq_oss_poll(struct seq_oss_devinfo *dp, struct file *file, poll_table * wait); void snd_seq_oss_reset(struct seq_oss_devinfo *dp); /* */ void snd_seq_oss_process_queue(struct seq_oss_devinfo *dp, abstime_t time); /* proc interface */ void snd_seq_oss_system_info_read(struct snd_info_buffer *buf); void snd_seq_oss_midi_info_read(struct snd_info_buffer *buf); void snd_seq_oss_synth_info_read(struct snd_info_buffer *buf); void snd_seq_oss_readq_info_read(struct seq_oss_readq *q, struct snd_info_buffer *buf); /* file mode macros */ #define
is_read_mode(mode) ((mode) & SNDRV_SEQ_OSS_FILE_READ) #define is_write_mode(mode) ((mode) & SNDRV_SEQ_OSS_FILE_WRITE) #define is_nonblock_mode(mode) ((mode) & SNDRV_SEQ_OSS_FILE_NONBLOCK) /* dispatch event */ static inline int snd_seq_oss_dispatch(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int atomic, int hop) { return snd_seq_kernel_client_dispatch(dp->cseq, ev, atomic, hop); } /* ioctl for writeq */ static inline int snd_seq_oss_control(struct seq_oss_devinfo *dp, unsigned int type, void *arg) { int err; snd_seq_client_ioctl_lock(dp->cseq); err = snd_seq_kernel_client_ctl(dp->cseq, type, arg); snd_seq_client_ioctl_unlock(dp->cseq); return err; } /* fill the addresses in header */ static inline void snd_seq_oss_fill_addr(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int dest_client, int dest_port) { ev->queue = dp->queue; ev->source = dp->addr; ev->dest.client = dest_client; ev->dest.port = dest_port; } /* misc. functions for proc interface */ char *enabled_str(int bool); #endif /* __SEQ_OSS_DEVICE_H */
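/*
 * A minimal sketch, assuming a hypothetical helper send_echo_event() and an
 * already set-up struct seq_oss_devinfo, of how snd_seq_oss_fill_addr() and
 * snd_seq_oss_dispatch() above combine to route one event through the ALSA
 * sequencer core.  Guarded by #if 0 as illustration only.
 */
#if 0
static int send_echo_event(struct seq_oss_devinfo *dp,
                           int dest_client, int dest_port)
{
        struct snd_seq_event ev;

        memset(&ev, 0, sizeof(ev));
        ev.type = SNDRV_SEQ_EVENT_ECHO;

        /* source address and queue come from dp, destination from the args */
        snd_seq_oss_fill_addr(dp, &ev, dest_client, dest_port);

        /* hand the event to the sequencer core (non-atomic, hop count 0) */
        return snd_seq_oss_dispatch(dp, &ev, 0, 0);
}
#endif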
/* * Performance events x86 architecture code * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red
Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ #include <linux/perf_event.h> #include <linux/capability.h> #include <linux/notifier.h> #include <linux/hardirq.h> #include <linux/kprobes.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kdebug.h> #include <linux/sched/mm.h> #include <linux/sched/clock.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/bitops.h> #include <linux/device.h> #include <linux/nospec.h> #include <linux/static_call.h> #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/nmi.h> #include <asm/smp.h> #include <asm/alternative.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/desc.h> #include <asm/ldt.h> #include <asm/unwind.h> #include "perf_event.h" struct x86_pmu x86_pmu __read_mostly; static struct pmu pmu; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, .pmu = &pmu, }; DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); DEFINE_STATIC_KEY_FALSE(perf_is_hybrid); /* * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined * from just a typename, as opposed to an actual function. */ DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq); DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable); DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable); DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign); DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period); DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task); DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. */ DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. * Returns the delta events processed. 
*/ u64 x86_perf_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.cntval_bits; u64 prev_raw_count, new_raw_count; u64 delta; if (unlikely(!hwc->event_base)) return 0; /* * Careful: an NMI might modify the previous event value. * * Our tactic to handle this is to first atomically read and * exchange a new raw count - then add that new-prev delta * count to the generic event atomically: */ prev_raw_count = local64_read(&hwc->prev_count); do { rdpmcl(hwc->event_base_rdpmc, new_raw_count); } while (!local64_try_cmpxchg(&hwc->prev_count, &prev_raw_count, new_raw_count)); /* * Now we have the new raw value and have updated the prev * timestamp already. We can now calculate the elapsed delta * (event-)time and add that to the generic event. * * Careful, not all hw sign-extends above the physical width * of the count. */ delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; local64_add(delta, &event->count); local64_sub(delta, &hwc->period_left); return new_raw_count; } /* * Find and validate any extra registers to set up. */ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) { struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs); struct hw_perf_event_extra *reg; struct extra_reg *er; reg = &event->hw.extra_reg; if (!extra_regs) return 0; for (er = extra_regs; er->msr; er++) { if (er->event != (config & er->config_mask)) continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; /* Check if the extra msrs can be safely accessed*/ if (!er->extra_msr_access) return -ENXIO; reg->idx = er->idx; reg->config = event->attr.config1; reg->reg = er->msr; break; } return 0; } static atomic_t active_events; static atomic_t pmc_refcount; static DEFINE_MUTEX(pmc_reserve_mutex); #ifdef CONFIG_X86_LOCAL_APIC static inline int get_possible_num_counters(void) { int i, num_counters = x86_pmu.num_counters; if (!is_hybrid()) return num_counters; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters); return num_counters; } static bool reserve_pmc_hardware(void) { int i, num_counters = get_possible_num_counters(); for (i = 0; i < num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } for (i = 0; i < num_counters; i++) { if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } return true; eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu_config_addr(i)); i = num_counters; perfctr_fail: for (i--; i >= 0; i--) release_perfctr_nmi(x86_pmu_event_addr(i)); return false; } static void release_pmc_hardware(void) { int i, num_counters = get_possible_num_counters(); for (i = 0; i < num_counters; i++) { release_perfctr_nmi(x86_pmu_event_addr(i)); release_evntsel_nmi(x86_pmu_config_addr(i)); } } #else static bool reserve_pmc_hardware(void) { return true; } static void release_pmc_hardware(void) {} #endif bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed) { u64 val, val_fail = -1, val_new= ~0; int i, reg, reg_fail = -1, ret = 0; int bios_fail = 0; int reg_safe = -1; /* * Check to see if the BIOS enabled any of the counters, if so * complain and bail. 
*/ for (i = 0; i < num_counters; i++) { reg = x86_pmu_config_addr(i); ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { bios_fail = 1; val_fail = val; reg_fail = reg; } else { reg_safe = i; } } if (num_counters_fixed) { reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; for (i = 0; i < num_counters_fixed; i++) { if (fixed_counter_disabled(i, pmu)) continue; if (val & (0x03ULL << i*4)) { bios_fail = 1; val_fail = val; reg_fail = reg; } } } /* * If all the counters are enabled, the below test will always * fail. The tools will also become useless in this scenario. * Just fail and disable the hardware counters. */ if (reg_safe == -1) { reg = reg_safe; goto msr_fail; } /* * Read the current value, change it and read it back to see if it * matches, this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ reg = x86_pmu_event_addr(reg_safe); if (rdmsrl_safe(reg, &val)) goto msr_fail; val ^= 0xffffUL; ret = wrmsrl_safe(reg, val); ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; /* * We still allow the PMU driver to operate: */ if (bios_fail) { pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail); } return true; msr_fail: if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { pr_cont("PMU not available due to virtualization, using software events only.\n"); } else { pr_cont("Broken PMU hardware detected, using software events only.\n"); pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); } return false; } static void hw_perf_event_destroy(struct perf_event *event) { x86_release_hardware(); atomic_dec(&active_events); } void hw_perf_lbr_event_destroy(struct perf_event *event) { hw_perf_event_destroy(event); /* undo the lbr/bts event accounting */ x86_del_exclusive(x86_lbr_exclusive_lbr); } static inline int x86_pmu_initialized(void) { return x86_pmu.handle_irq != NULL; } static inline int set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) { struct perf_event_attr *attr = &event->attr; unsigned int cache_type, cache_op, cache_result; u64 config, val; config = attr->config; cache_type = (config >> 0) & 0xff; if (cache_type >= PERF_COUNT_HW_CACHE_MAX) return -EINVAL; cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX); cache_op = (config >> 8) & 0xff; if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) return -EINVAL; cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX); cache_result = (config >> 16) & 0xff; if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) return -EINVAL; cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX); val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result]; if (val == 0) return -ENOENT; if (val == -1) return -EINVAL; hwc->config |= val; attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result]; return x86_pmu_extra_regs(val, event); } int x86_reserve_hardware(void) { int err = 0; if (!atomic_inc_not_zero(&pmc_refcount)) { mutex_lock(&pmc_reserve_mutex); if (atomic_read(&pmc_refcount) == 0) { if (!reserve_pmc_hardware()) { err = -EBUSY; } else { reserve_ds_buffers(); reserve_lbr_buffers(); } } if (!err) atomic_inc(&pmc_refcount); mutex_unlock(&pmc_reserve_mutex); } return err; } void x86_release_hardware(void) { if 
(atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); release_lbr_buffers(); mutex_unlock(&pmc_reserve_mutex); } } /* * Check if we can create event of a certain type (that no conflicting events * are present). */ int x86_add_exclusive(unsigned int what) { int i; /* * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS. * LBR and BTS are still mutually exclusive. */ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) goto out; if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { mutex_lock(&pmc_reserve_mutex); for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) goto fail_unlock; } atomic_inc(&x86_pmu.lbr_exclusive[what]); mutex_unlock(&pmc_reserve_mutex); } out: atomic_inc(&active_events); return 0; fail_unlock: mutex_unlock(&pmc_reserve_mutex); return -EBUSY; } void x86_del_exclusive(unsigned int what) { atomic_dec(&active_events); /* * See the comment in x86_add_exclusive(). */ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) return; atomic_dec(&x86_pmu.lbr_exclusive[what]); } int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; u64 config; if (!is_sampling_event(event)) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); } if (attr->type == event->pmu->type) return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) return set_ext_hw_attr(hwc, event); if (attr->config >= x86_pmu.max_events) return -EINVAL; attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events); /* * The generic map: */ config = x86_pmu.event_map(attr->config); if (config == 0) return -ENOENT; if (config == -1LL) return -EINVAL; hwc->config |= config; return 0; } /* * check that branch_sample_type is compatible with * settings needed for precise_ip > 1 which implies * using the LBR to capture ALL taken branches at the * priv levels of the measurement */ static inline int precise_br_compat(struct perf_event *event) { u64 m = event->attr.branch_sample_type; u64 b = 0; /* must capture all branches */ if (!(m & PERF_SAMPLE_BRANCH_ANY)) return 0; m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_user) b |= PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_kernel) b |= PERF_SAMPLE_BRANCH_KERNEL; /* * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 */ return m == b; } int x86_pmu_max_precise(void) { int precise = 0; /* Support for constant skid */ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; /* Support for IP fixup */ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; if (x86_pmu.pebs_prec_dist) precise++; } return precise; } int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { int precise = x86_pmu_max_precise(); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; /* There's no sense in having PEBS for non sampling events: */ if (!is_sampling_event(event)) return -EINVAL; } /* * check that PEBS LBR correction does not conflict with * whatever the user is asking with attr->branch_sample_type */ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { u64 *br_type = &event->attr.branch_sample_type; if (has_branch_stack(event)) { if (!precise_br_compat(event)) return -EOPNOTSUPP; /* branch_sample_type is compatible */ } else { /* * user 
did not specify branch_sample_type * * For PEBS fixups, we capture all * the branches at the priv level of the * event. */ *br_type = PERF_SAMPLE_BRANCH_ANY; if (!event->attr.exclude_user) *br_type |= PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_kernel) *br_type |= PERF_SAMPLE_BRANCH_KERNEL; } } if (branch_sample_call_stack(event)) event->attach_state |= PERF_ATTACH_TASK_DATA; /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ event->hw.config = ARCH_PERFMON_EVENTSEL_INT; /* * Count user and OS events unless requested not to */ if (!event->attr.exclude_user) event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; if (!event->attr.exclude_kernel) event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; if (event->attr.type == event->pmu->type) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; if (event->attr.sample_period && x86_pmu.limit_period) { s64 left = event->attr.sample_period; x86_pmu.limit_period(event, &left); if (left > event->attr.sample_period) return -EINVAL; } /* sample_regs_user never support XMM registers */ if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK)) return -EINVAL; /* * Besides the general purpose registers, XMM registers may * be collected in PEBS on some platforms, e.g. Icelake */ if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) { if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS)) return -EINVAL; if (!event->attr.precise_ip) return -EINVAL; } return x86_setup_perfctr(event); } /* * Setup the hardware configuration for a given attr_type */ static int __x86_pmu_event_init(struct perf_event *event) { int err; if (!x86_pmu_initialized()) return -ENODEV; err = x86_reserve_hardware(); if (err) return err; atomic_inc(&active_events); event->destroy = hw_perf_event_destroy; event->hw.idx = -1; event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; event->hw.branch_reg.idx = EXTRA_REG_NONE; return x86_pmu.hw_config(event); } void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(x86_pmu_config_addr(idx), val); if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(x86_pmu_config_addr(idx), val); if (is_counter_pair(hwc)) wrmsrl(x86_pmu_config_addr(idx + 1), 0); } } struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data) { return static_call(x86_pmu_guest_get_msrs)(nr, data); } EXPORT_SYMBOL_GPL(perf_guest_get_msrs); /* * There may be PMI landing after enabled=0. The PMI hitting could be before or * after disable_all. * * If PMI hits before disable_all, the PMU will be disabled in the NMI handler. * It will not be re-enabled in the NMI handler again, because enabled=0. After * handling the NMI, disable_all will be called, which will not change the * state either. If PMI hits after disable_all, the PMU is already disabled * before entering NMI handler. The NMI handler will not change the state * either. * * So either situation is harmless. 
*/ static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu_initialized()) return; if (!cpuc->enabled) return; cpuc->n_added = 0; cpuc->enabled = 0; barrier(); static_call(x86_pmu_disable_all)(); } void x86_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask)) continue; __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } } static inline int is_x86_event(struct perf_event *event) { int i; if (!is_hybrid()) return event->pmu == &pmu; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu) return true; } return false; } struct pmu *x86_get_pmu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); /* * All CPUs of the hybrid type have been offline. * The x86_get_pmu() should not be invoked. */ if (WARN_ON_ONCE(!cpuc->pmu)) return &pmu; return cpuc->pmu; } /* * Event scheduler state: * * Assign events iterating over all events and counters, beginning * with events with least weights first. Keep the current iterator * state in struct sched_state. */ struct sched_state { int weight; int event; /* event index */ int counter; /* counter index */ int unassigned; /* number of events to be assigned left */ int nr_gp; /* number of GP counters used */ u64 used; }; /* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ #define SCHED_STATES_MAX 2 struct perf_sched { int max_weight; int max_events; int max_gp; int saved_states; struct event_constraint **constraints; struct sched_state state; struct sched_state saved[SCHED_STATES_MAX]; }; /* * Initialize iterator that runs through all events and counters. */ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, int num, int wmin, int wmax, int gpmax) { int idx; memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; sched->max_gp = gpmax; sched->constraints = constraints; for (idx = 0; idx < num; idx++) { if (constraints[idx]->weight == wmin) break; } sched->state.event = idx; /* start with min weight */ sched->state.weight = wmin; sched->state.unassigned = num; } static void perf_sched_save_state(struct perf_sched *sched) { if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) return; sched->saved[sched->saved_states] = sched->state; sched->saved_states++; } static bool perf_sched_restore_state(struct perf_sched *sched) { if (!sched->saved_states) return false; sched->saved_states--; sched->state = sched->saved[sched->saved_states]; /* this assignment didn't work out */ /* XXX broken vs EVENT_PAIR */ sched->state.used &= ~BIT_ULL(sched->state.counter); /* try the next one */ sched->state.counter++; return true; } /* * Select a counter for the current event to schedule. Return true on * success. 
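 *
 * Fixed-purpose counters allowed by the constraint are tried first; after
 * that the scan walks the permitted general-purpose counters starting at
 * state.counter. E.g. (hypothetically) if the constraint allows counters
 * {0, 1} and counter 0 is already marked in state.used, counter 1 is
 * picked, provided the max_gp budget is not exhausted.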
*/ static bool __perf_sched_find_counter(struct perf_sched *sched) { struct event_constraint *c; int idx; if (!sched->state.unassigned) return false; if (sched->state.event >= sched->max_events) return false; c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { idx = INTEL_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { u64 mask = BIT_ULL(idx); if (sched->state.used & mask) continue; sched->state.used |= mask; goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { u64 mask = BIT_ULL(idx); if (c->flags & PERF_X86_EVENT_PAIR) mask |= mask << 1; if (sched->state.used & mask) continue; if (sched->state.nr_gp++ >= sched->max_gp) return false; sched->state.used |= mask; goto done; } return false; done: sched->state.counter = idx; if (c->overlap) perf_sched_save_state(sched); return true; } static bool perf_sched_find_counter(struct perf_sched *sched) { while (!__perf_sched_find_counter(sched)) { if (!perf_sched_restore_state(sched)) return false; } return true; } /* * Go through all unassigned events and find the next one to schedule. * Take events with the least weight first. Return true on success. */ static bool perf_sched_next_event(struct perf_sched *sched) { struct event_constraint *c; if (!sched->state.unassigned || !--sched->state.unassigned) return false; do { /* next event */ sched->state.event++; if (sched->state.event >= sched->max_events) { /* next weight */ sched->state.event = 0; sched->state.weight++; if (sched->state.weight > sched->max_weight) return false; } c = sched->constraints[sched->state.event]; } while (c->weight != sched->state.weight); sched->state.counter = 0; /* start with first counter */ return true; } /* * Assign a counter for each event. */ int perf_assign_events(struct event_constraint **constraints, int n, int wmin, int wmax, int gpmax, int *assign) { struct perf_sched sched; perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); do { if (!perf_sched_find_counter(&sched)) break; /* failed */ if (assign) assign[sched.state.event] = sched.state.counter; } while (perf_sched_next_event(&sched)); return sched.state.unassigned; } EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { int num_counters = hybrid(cpuc->pmu, num_counters); struct event_constraint *c; struct perf_event *e; int n0, i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; u64 used_mask = 0; /* * Compute the number of events already present; see x86_pmu_add(), * validate_group() and x86_pmu_commit_txn(). For the former two * cpuc->n_events hasn't been updated yet, while for the latter * cpuc->n_txn contains the number of events added in the current * transaction. */ n0 = cpuc->n_events; if (cpuc->txn_flags & PERF_PMU_TXN_ADD) n0 -= cpuc->n_txn; static_call_cond(x86_pmu_start_scheduling)(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = cpuc->event_constraint[i]; /* * Previously scheduled events should have a cached constraint, * while new events should not have one. */ WARN_ON_ONCE((c && i >= n0) || (!c && i < n0)); /* * Request constraints for new events; or for those events that * have a dynamic constraint -- for those the constraint can * change due to external factors (sibling state, allow_tfa). 
*/ if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]); cpuc->event_constraint[i] = c; } wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } /* * fastpath, try to reuse previous register */ for (i = 0; i < n; i++) { u64 mask; hwc = &cpuc->event_list[i]->hw; c = cpuc->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) break; /* constraint still honored */ if (!test_bit(hwc->idx, c->idxmsk)) break; mask = BIT_ULL(hwc->idx); if (is_counter_pair(hwc)) mask |= mask << 1; /* not already used */ if (used_mask & mask) break; used_mask |= mask; if (assign) assign[i] = hwc->idx; } /* slow path */ if (i != n) { int gpmax = num_counters; /* * Do not allow scheduling of more than half the available * generic counters. * * This helps avoid counter starvation of sibling thread by * ensuring at most half the counters cannot be in exclusive * mode. There is no designated counters for the limits. Any * N/2 counters can be used. This helps with events with * specific counter constraints. */ if (is_ht_workaround_enabled() && !cpuc->is_fake && READ_ONCE(cpuc->excl_cntrs->exclusive_present)) gpmax /= 2; /* * Reduce the amount of available counters to allow fitting * the extra Merge events needed by large increment events. */ if (x86_pmu.flags & PMU_FL_PAIR) { gpmax = num_counters - cpuc->n_pair; WARN_ON(gpmax <= 0); } unsched = perf_assign_events(cpuc->event_constraint, n, wmin, wmax, gpmax, assign); } /* * In case of success (unsched = 0), mark events as committed, * so we do not put_constraint() in case new events are added * and fail to be scheduled * * We invoke the lower level commit callback to lock the resource * * We do not need to do all of this in case we are called to * validate an event group (assign == NULL) */ if (!unsched && assign) { for (i = 0; i < n; i++) static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]); } else { for (i = n0; i < n; i++) { e = cpuc->event_list[i]; /* * release events that failed scheduling */ static_call_cond(x86_pmu_put_event_constraints)(cpuc, e); cpuc->event_constraint[i] = NULL; } } static_call_cond(x86_pmu_stop_scheduling)(cpuc); return unsched ? 
-EINVAL : 0; } static int add_nr_metric_event(struct cpu_hw_events *cpuc, struct perf_event *event) { if (is_metric_event(event)) { if (cpuc->n_metric == INTEL_TD_METRIC_NUM) return -EINVAL; cpuc->n_metric++; cpuc->n_txn_metric++; } return 0; } static void del_nr_metric_event(struct cpu_hw_events *cpuc, struct perf_event *event) { if (is_metric_event(event)) cpuc->n_metric--; } static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, int max_count, int n) { union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) return -EINVAL; if (n >= max_count + cpuc->n_metric) return -EINVAL; cpuc->event_list[n] = event; if (is_counter_pair(&event->hw)) { cpuc->n_pair++; cpuc->n_txn_pair++; } return 0; } /* * dogrp: true if must collect siblings events (group) * returns total number of events and error code */ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { int num_counters = hybrid(cpuc->pmu, num_counters); int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct perf_event *event; int n, max_count; max_count = num_counters + num_counters_fixed; /* current number of events already accepted */ n = cpuc->n_events; if (!cpuc->n_events) cpuc->pebs_output = 0; if (!cpuc->is_fake && leader->attr.precise_ip) { /* * For PEBS->PT, if !aux_event, the group leader (PT) went * away, the group was broken down and this singleton event * can't schedule any more. */ if (is_pebs_pt(leader) && !leader->aux_event) return -EINVAL; /* * pebs_output: 0: no PEBS so far, 1: PT, 2: DS */ if (cpuc->pebs_output && cpuc->pebs_output != is_pebs_pt(leader) + 1) return -EINVAL; cpuc->pebs_output = is_pebs_pt(leader) + 1; } if (is_x86_event(leader)) { if (collect_event(cpuc, leader, max_count, n)) return -EINVAL; n++; } if (!dogrp) return n; for_each_sibling_event(event, leader) { if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) continue; if (collect_event(cpuc, event, max_count, n)) return -EINVAL; n++; } return n; } static inline void x86_assign_hw_event(struct perf_event *event, struct cpu_hw_events *cpuc, int i) { struct hw_perf_event *hwc = &event->hw; int idx; idx = hwc->idx = cpuc->assign[i]; hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; static_call_cond(x86_pmu_assign)(event, idx); switch (hwc->idx) { case INTEL_PMC_IDX_FIXED_BTS: case INTEL_PMC_IDX_FIXED_VLBR: hwc->config_base = 0; hwc->event_base = 0; break; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: /* All the metric events are mapped onto the fixed counter 3. */ idx = INTEL_PMC_IDX_FIXED_SLOTS; fallthrough; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (idx - INTEL_PMC_IDX_FIXED); hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | INTEL_PMC_FIXED_RDPMC_BASE; break; default: hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); break; } } /** * x86_perf_rdpmc_index - Return PMC counter used for event * @event: the perf_event to which the PMC counter was assigned * * The counter assigned to this performance event may change if interrupts * are enabled. This counter should thus never be used while interrupts are * enabled. 
Before this function is used to obtain the assigned counter the * event should be checked for validity using, for example, * perf_event_read_local(), within the same interrupt disabled section in * which this counter is planned to be used. * * Return: The index of the performance monitoring counter assigned to * @perf_event. */ int x86_perf_rdpmc_index(struct perf_event *event) { lockdep_assert_irqs_disabled(); return event->hw.event_base_rdpmc; } static inline int match_prev_assignment(struct hw_perf_event *hwc, struct cpu_hw_events *cpuc, int i) { return hwc->idx == cpuc->assign[i] && hwc->last_cpu == smp_processor_id() && hwc->last_tag == cpuc->tags[i]; } static void x86_pmu_start(struct perf_event *event, int flags); static void x86_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *event; struct hw_perf_event *hwc; int i, added = cpuc->n_added; if (!x86_pmu_initialized()) return; if (cpuc->enabled) return; if (cpuc->n_added) { int n_running = cpuc->n_events - cpuc->n_added; /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() * * step1: save events moving to new counters */ for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; hwc = &event->hw; /* * we can avoid reprogramming counter if: * - assigned same counter as last time * - running on same CPU as last time * - no other event has used the counter since */ if (hwc->idx == -1 || match_prev_assignment(hwc, cpuc, i)) continue; /* * Ensure we don't accidentally enable a stopped * counter simply because we rescheduled. */ if (hwc->state & PERF_HES_STOPPED) hwc->state |= PERF_HES_ARCH; x86_pmu_stop(event, PERF_EF_UPDATE); } /* * step2: reprogram moved events into new counters */ for (i = 0; i < cpuc->n_events; i++) { event = cpuc->event_list[i]; hwc = &event->hw; if (!match_prev_assignment(hwc, cpuc, i)) x86_assign_hw_event(event, cpuc, i); else if (i < n_running) continue; if (hwc->state & PERF_HES_ARCH) continue; /* * if cpuc->enabled = 0, then no wrmsr as * per x86_pmu_enable_event() */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; perf_events_lapic_init(); } cpuc->enabled = 1; barrier(); static_call(x86_pmu_enable_all)(added); } DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. 
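 * The counter is programmed with -left, masked to the counter width, so
 * that it overflows and raises the PMI after 'left' more increments; e.g.
 * (hypothetical values) left = 0x10000 on a 48-bit counter writes
 * 0xffffffff0000 into the event MSR.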
* To be called with the event disabled in hw: */ int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; if (unlikely(!hwc->event_base)) return 0; /* * If we are way outside a reasonable range then just skip forward: */ if (unlikely(left <= -period)) { left = period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } /* * Quirk: certain CPUs dont like it if just 1 hw_event is left: */ if (unlikely(left < 2)) left = 2; if (left > x86_pmu.max_period) left = x86_pmu.max_period; static_call_cond(x86_pmu_limit_period)(event, &left); this_cpu_write(pmc_prev_left[idx], left); /* * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ local64_set(&hwc->prev_count, (u64)-left); wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Sign extend the Merge event counter's upper 16 bits since * we currently declare a 48-bit counter width */ if (is_counter_pair(hwc)) wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); perf_event_update_userpage(event); return ret; } void x86_pmu_enable_event(struct perf_event *event) { if (__this_cpu_read(cpu_hw_events.enabled)) __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } /* * Add a single event to the PMU. * * The event is added to the group of enabled events * but only if it can be scheduled with existing events. */ static int x86_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc; int assign[X86_PMC_IDX_MAX]; int n, n0, ret; hwc = &event->hw; n0 = cpuc->n_events; ret = n = collect_events(cpuc, event, false); if (ret < 0) goto out; hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; if (!(flags & PERF_EF_START)) hwc->state |= PERF_HES_ARCH; /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole. * * If commit fails, we'll call ->del() on all events * for which ->add() was called. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) goto out; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); done_collect: /* * Commit the collect_events() state. See x86_pmu_del() and * x86_pmu_*_txn(). */ cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; /* * This is before x86_pmu_enable() will call x86_pmu_start(), * so we enable LBRs before an event needs them etc.. 
*/ static_call_cond(x86_pmu_add)(event); ret = 0; out: return ret; } static void x86_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx = event->hw.idx; if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) return; if (WARN_ON_ONCE(idx == -1)) return; if (flags & PERF_EF_RELOAD) { WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); static_call(x86_pmu_set_period)(event); } event->hw.state = 0; cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); } void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; u64 pebs, debugctl; int cpu = smp_processor_id(); struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int num_counters = hybrid(cpuc->pmu, num_counters); int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); unsigned long flags; int idx; if (!num_counters) return; local_irq_save(flags); if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); pr_info("\n"); pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); if (pebs_constraints) { rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); } if (x86_pmu.lbr_nr) { rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); } } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for (idx = 0; idx < num_counters; idx++) { rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); rdmsrl(x86_pmu_event_addr(idx), pmc_count); prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); pr_info("CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } for (idx = 0; idx < num_counters_fixed; idx++) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } local_irq_restore(flags); } void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; if (test_bit(hwc->idx, cpuc->active_mask)) { static_call(x86_pmu_disable)(event); __clear_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { /* * Drain the remaining delta count out of a event * that we are disabling: */ static_call(x86_pmu_update)(event); hwc->state |= PERF_HES_UPTODATE; } } static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); int i; /* * If we're called during a txn, we only need to undo x86_pmu.add. * The events never got scheduled and ->cancel_txn will truncate * the event_list. * * XXX assumes any ->del() called during a TXN will only be on * an event added during that same TXN. 
*/ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto do_del; __set_bit(event->hw.idx, cpuc->dirty); /* * Not a TXN, therefore cleanup properly. */ x86_pmu_stop(event, PERF_EF_UPDATE); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) break; } if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ return; /* If we have a newly added event; make sure to decrease n_added. */ if (i >= cpuc->n_events - cpuc->n_added) --cpuc->n_added; static_call_cond(x86_pmu_put_event_constraints)(cpuc, event); /* Delete the array entry. */ while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; if (intel_cap.perf_metrics) del_nr_metric_event(cpuc, event); perf_event_update_userpage(event); do_del: /* * This is after x86_pmu_stop(); so we disable LBRs after any * event can need them etc.. */ static_call_cond(x86_pmu_del)(event); } int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; struct perf_event *event; int idx, handled = 0; u64 val; cpuc = this_cpu_ptr(&cpu_hw_events); /* * Some chipsets need to unmask the LVTPC in a particular spot * inside the nmi handler. As a result, the unmasking was pushed * into all the nmi handlers. * * This generic handler doesn't seem to have any issues where the * unmasking occurs so it was left at the top. */ apic_write(APIC_LVTPC, APIC_DM_NMI); for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; event = cpuc->events[idx]; val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; /* * event overflow */ handled++; if (!static_call(x86_pmu_set_period)(event)) continue; perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } if (handled) inc_irq_stat(apic_perf_irqs); return handled; } void perf_events_lapic_init(void) { if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); } static int perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { u64 start_clock; u64 finish_clock; int ret; /* * All PMUs/events that share this PMI handler should make sure to * increment active_events for their events. 
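 *
 * The handler also times itself with sched_clock() and reports the
 * duration via perf_sample_event_took(), which the perf core uses to
 * throttle the maximum sampling rate if NMI handling becomes too
 * expensive.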
*/ if (!atomic_read(&active_events)) return NMI_DONE; start_clock = sched_clock(); ret = static_call(x86_pmu_handle_irq)(regs); finish_clock = sched_clock(); perf_sample_event_took(finish_clock - start_clock); return ret; } NOKPROBE_SYMBOL(perf_event_nmi_handler); struct event_constraint emptyconstraint; struct event_constraint unconstrained; static int x86_pmu_prepare_cpu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int i; for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) cpuc->kfree_on_online[i] = NULL; if (x86_pmu.cpu_prepare) return x86_pmu.cpu_prepare(cpu); return 0; } static int x86_pmu_dead_cpu(unsigned int cpu) { if (x86_pmu.cpu_dead) x86_pmu.cpu_dead(cpu); return 0; } static int x86_pmu_online_cpu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int i; for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { kfree(cpuc->kfree_on_online[i]); cpuc->kfree_on_online[i] = NULL; } return 0; } static int x86_pmu_starting_cpu(unsigned int cpu) { if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); return 0; } static int x86_pmu_dying_cpu(unsigned int cpu) { if (x86_pmu.cpu_dying) x86_pmu.cpu_dying(cpu); return 0; } static void __init pmu_check_apic(void) { if (boot_cpu_has(X86_FEATURE_APIC)) return; x86_pmu.apic = 0; pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); pr_info("no hardware sampling interrupt available.\n"); /* * If we have a PMU initialized but no APIC * interrupts, we cannot sample hardware * events (user-space has to fall back and * sample via a hrtimer based software event): */ pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; } static struct attribute_group x86_pmu_format_group __ro_after_init = { .name = "format", .attrs = NULL, }; ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); u64 config = 0; if (pmu_attr->id < x86_pmu.max_events) config = x86_pmu.event_map(pmu_attr->id); /* string trumps id */ if (pmu_attr->event_str) return sprintf(page, "%s\n", pmu_attr->event_str); return x86_pmu.events_sysfs_show(page, config); } EXPORT_SYMBOL_GPL(events_sysfs_show); ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_ht_attr *pmu_attr = container_of(attr, struct perf_pmu_events_ht_attr, attr); /* * Report conditional events depending on Hyper-Threading. * * This is overly conservative as usually the HT special * handling is not needed if the other CPU thread is idle. * * Note this does not (and cannot) handle the case when thread * siblings are invisible, for example with virtualization * if they are owned by some other guest. The user tool * has to re-read when a thread sibling gets onlined later. */ return sprintf(page, "%s", topology_max_smt_threads() > 1 ? pmu_attr->event_str_ht : pmu_attr->event_str_noht); } ssize_t events_hybrid_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_hybrid_attr *pmu_attr = container_of(attr, struct perf_pmu_events_hybrid_attr, attr); struct x86_hybrid_pmu *pmu; const char *str, *next_str; int i; if (hweight64(pmu_attr->pmu_type) == 1) return sprintf(page, "%s", pmu_attr->event_str); /* * Hybrid PMUs may support the same event name, but with different * event encoding, e.g., the mem-loads event on an Atom PMU has * different event encoding from a Core PMU. * * The event_str includes all event encodings. 
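 * (A hypothetical event_str covering two hybrid PMUs could look like
 * "event=0xd0,umask=0x82;event=0xd0,umask=0x81".)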
Each event encoding * is divided by ";". The order of the event encodings must follow * the order of the hybrid PMU index. */ pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); str = pmu_attr->event_str; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type)) continue; if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) { next_str = strchr(str, ';'); if (next_str) return snprintf(page, next_str - str + 1, "%s", str); else return sprintf(page, "%s", str); } str = strchr(str, ';'); str++; } return 0; } EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show); EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); EVENT_ATTR(cache-references, CACHE_REFERENCES ); EVENT_ATTR(cache-misses, CACHE_MISSES ); EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); EVENT_ATTR(branch-misses, BRANCH_MISSES ); EVENT_ATTR(bus-cycles, BUS_CYCLES ); EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); static struct attribute *empty_attrs; static struct attribute *events_attr[] = { EVENT_PTR(CPU_CYCLES), EVENT_PTR(INSTRUCTIONS), EVENT_PTR(CACHE_REFERENCES), EVENT_PTR(CACHE_MISSES), EVENT_PTR(BRANCH_INSTRUCTIONS), EVENT_PTR(BRANCH_MISSES), EVENT_PTR(BUS_CYCLES), EVENT_PTR(STALLED_CYCLES_FRONTEND), EVENT_PTR(STALLED_CYCLES_BACKEND), EVENT_PTR(REF_CPU_CYCLES), NULL, }; /* * Remove all undefined events (x86_pmu.event_map(id) == 0) * out of events_attr attributes. */ static umode_t is_visible(struct kobject *kobj, struct attribute *attr, int idx) { struct perf_pmu_events_attr *pmu_attr; if (idx >= x86_pmu.max_events) return 0; pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); /* str trumps id */ return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0; } static struct attribute_group x86_pmu_events_group __ro_after_init = { .name = "events", .attrs = events_attr, .is_visible = is_visible, }; ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) { u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); ssize_t ret; /* * We have whole page size to spend and just little data * to write, so we can safely use sprintf. 
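 *
 * For example (hypothetical values): event = 0x3c with a umask of 0x01 in
 * config is rendered as "event=0x3c,umask=0x01".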
*/ ret = sprintf(page, "event=0x%02llx", event); if (umask) ret += sprintf(page + ret, ",umask=0x%02llx", umask); if (edge) ret += sprintf(page + ret, ",edge"); if (pc) ret += sprintf(page + ret, ",pc"); if (any) ret += sprintf(page + ret, ",any"); if (inv) ret += sprintf(page + ret, ",inv"); if (cmask) ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); ret += sprintf(page + ret, "\n"); return ret; } static struct attribute_group x86_pmu_attr_group; static struct attribute_group x86_pmu_caps_group; static void x86_pmu_static_call_update(void) { static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq); static_call_update(x86_pmu_disable_all, x86_pmu.disable_all); static_call_update(x86_pmu_enable_all, x86_pmu.enable_all); static_call_update(x86_pmu_enable, x86_pmu.enable); static_call_update(x86_pmu_disable, x86_pmu.disable); static_call_update(x86_pmu_assign, x86_pmu.assign); static_call_update(x86_pmu_add, x86_pmu.add); static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); static_call_update(x86_pmu_set_period, x86_pmu.set_period); static_call_update(x86_pmu_update, x86_pmu.update); static_call_update(x86_pmu_limit_period, x86_pmu.limit_period); static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling); static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling); static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling); static_call_update(x86_pmu_sched_task, x86_pmu.sched_task); static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx); static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs); static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); static_call_update(x86_pmu_filter, x86_pmu.filter); } static void _x86_pmu_read(struct perf_event *event) { static_call(x86_pmu_update)(event); } void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, u64 intel_ctrl) { pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", num_counters); pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); pr_info("... fixed-purpose events: %lu\n", hweight64((((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED) & intel_ctrl)); pr_info("... 
event mask: %016Lx\n", intel_ctrl); } static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; int err; pr_info("Performance Events: "); switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: err = intel_pmu_init(); break; case X86_VENDOR_AMD: err = amd_pmu_init(); break; case X86_VENDOR_HYGON: err = amd_pmu_init(); x86_pmu.name = "HYGON"; break; case X86_VENDOR_ZHAOXIN: case X86_VENDOR_CENTAUR: err = zhaoxin_pmu_init(); break; default: err = -ENOTSUPP; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); err = 0; goto out_bad_pmu; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed)) goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); if (!x86_pmu.intel_ctrl) x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 0, x86_pmu.num_counters, 0, 0); x86_pmu_format_group.attrs = x86_pmu.format_attrs; if (!x86_pmu.events_sysfs_show) x86_pmu_events_group.attrs = &empty_attrs; pmu.attr_update = x86_pmu.attr_update; if (!is_hybrid()) { x86_pmu_show_pmu_cap(x86_pmu.num_counters, x86_pmu.num_counters_fixed, x86_pmu.intel_ctrl); } if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read; if (!x86_pmu.guest_get_msrs) x86_pmu.guest_get_msrs = (void *)&__static_call_return0; if (!x86_pmu.set_period) x86_pmu.set_period = x86_perf_event_set_period; if (!x86_pmu.update) x86_pmu.update = x86_perf_event_update; x86_pmu_static_call_update(); /* * Install callbacks. Core will call them for each online * cpu. */ err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare", x86_pmu_prepare_cpu, x86_pmu_dead_cpu); if (err) return err; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, "perf/x86:starting", x86_pmu_starting_cpu, x86_pmu_dying_cpu); if (err) goto out; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online", x86_pmu_online_cpu, NULL); if (err) goto out1; if (!is_hybrid()) { err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); if (err) goto out2; } else { struct x86_hybrid_pmu *hybrid_pmu; int i, j; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { hybrid_pmu = &x86_pmu.hybrid_pmu[i]; hybrid_pmu->pmu = pmu; hybrid_pmu->pmu.type = -1; hybrid_pmu->pmu.attr_update = x86_pmu.attr_update; hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, (hybrid_pmu->pmu_type == hybrid_big) ? 
PERF_TYPE_RAW : -1); if (err) break; } if (i < x86_pmu.num_hybrid_pmus) { for (j = 0; j < i; j++) perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu); pr_warn("Failed to register hybrid PMUs\n"); kfree(x86_pmu.hybrid_pmu); x86_pmu.hybrid_pmu = NULL; x86_pmu.num_hybrid_pmus = 0; goto out2; } } return 0; out2: cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE); out1: cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); out: cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); out_bad_pmu: memset(&x86_pmu, 0, sizeof(x86_pmu)); return err; } early_initcall(init_hw_perf_events); static void x86_pmu_read(struct perf_event *event) { static_call(x86_pmu_read)(event); } /* * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time * * We only support PERF_PMU_TXN_ADD transactions. Save the * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD * transactions. */ static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ cpuc->txn_flags = txn_flags; if (txn_flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_disable(pmu); __this_cpu_write(cpu_hw_events.n_txn, 0); __this_cpu_write(cpu_hw_events.n_txn_pair, 0); __this_cpu_write(cpu_hw_events.n_txn_metric, 0); } /* * Stop group events scheduling transaction * Clear the flag and pmu::enable() will perform the * schedulability test. */ static void x86_pmu_cancel_txn(struct pmu *pmu) { unsigned int txn_flags; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ txn_flags = cpuc->txn_flags; cpuc->txn_flags = 0; if (txn_flags & ~PERF_PMU_TXN_ADD) return; /* * Truncate collected array by the number of events added in this * transaction. See x86_pmu_add() and x86_pmu_*_txn(). */ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair)); __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric)); perf_pmu_enable(pmu); } /* * Commit group events scheduling transaction * Perform the group schedulability test as a whole * Return 0 if success * * Does not cancel the transaction on failure; expects the caller to do this. */ static int x86_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int assign[X86_PMC_IDX_MAX]; int n, ret; WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { cpuc->txn_flags = 0; return 0; } n = cpuc->n_events; if (!x86_pmu_initialized()) return -EAGAIN; ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) return ret; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; } /* * a fake_cpuc is used to validate event groups. Due to * the extra reg logic, we need to also allocate a fake * per_core and per_cpu structure. Otherwise, group events * using extra reg may conflict without the kernel being * able to catch this when the last event gets added to * the group. 
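 *
 * Group validation collects the existing siblings plus the new event into
 * this fake cpuc and runs the real scheduler with assign == NULL, so
 * nothing is ever programmed into hardware; see validate_group() below.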
*/ static void free_fake_cpuc(struct cpu_hw_events *cpuc) { intel_cpuc_finish(cpuc); kfree(cpuc); } static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu) { struct cpu_hw_events *cpuc; int cpu; cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); if (!cpuc) return ERR_PTR(-ENOMEM); cpuc->is_fake = 1; if (is_hybrid()) { struct x86_hybrid_pmu *h_pmu; h_pmu = hybrid_pmu(event_pmu); if (cpumask_empty(&h_pmu->supported_cpus)) goto error; cpu = cpumask_first(&h_pmu->supported_cpus); } else cpu = raw_smp_processor_id(); cpuc->pmu = event_pmu; if (intel_cpuc_prepare(cpuc, cpu)) goto error; return cpuc; error: free_fake_cpuc(cpuc); return ERR_PTR(-ENOMEM); } /* * validate that we can schedule this event */ static int validate_event(struct perf_event *event) { struct cpu_hw_events *fake_cpuc; struct event_constraint *c; int ret = 0; fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); c = x86_pmu.get_event_constraints(fake_cpuc, 0, event); if (!c || !c->weight) ret = -EINVAL; if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(fake_cpuc, event); free_fake_cpuc(fake_cpuc); return ret; } /* * validate a single event group * * validation include: * - check events are compatible which each other * - events do not compete for the same counter * - number of events <= number of counters * * validation ensures the group can be loaded onto the * PMU if it was the only group available. */ static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct cpu_hw_events *fake_cpuc; int ret = -EINVAL, n; /* * Reject events from different hybrid PMUs. */ if (is_hybrid()) { struct perf_event *sibling; struct pmu *pmu = NULL; if (is_x86_event(leader)) pmu = leader->pmu; for_each_sibling_event(sibling, leader) { if (!is_x86_event(sibling)) continue; if (!pmu) pmu = sibling->pmu; else if (pmu != sibling->pmu) return ret; } } fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); /* * the event is not yet connected with its * siblings therefore we must first collect * existing siblings, then add the new event * before we can simulate the scheduling */ n = collect_events(fake_cpuc, leader, true); if (n < 0) goto out; fake_cpuc->n_events = n; n = collect_events(fake_cpuc, event, false); if (n < 0) goto out; fake_cpuc->n_events = 0; ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); out: free_fake_cpuc(fake_cpuc); return ret; } static int x86_pmu_event_init(struct perf_event *event) { struct x86_hybrid_pmu *pmu = NULL; int err; if ((event->attr.type != event->pmu->type) && (event->attr.type != PERF_TYPE_HARDWARE) && (event->attr.type != PERF_TYPE_HW_CACHE)) return -ENOENT; if (is_hybrid() && (event->cpu != -1)) { pmu = hybrid_pmu(event->pmu); if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus)) return -ENOENT; } err = __x86_pmu_event_init(event); if (!err) { if (event->group_leader != event) err = validate_group(event); else err = validate_event(event); } if (err) { if (event->destroy) event->destroy(event); event->destroy = NULL; } if (READ_ONCE(x86_pmu.attr_rdpmc) && !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; return err; } void perf_clear_dirty_counters(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; /* Don't need to clear the assigned counter. 
*/ for (i = 0; i < cpuc->n_events; i++) __clear_bit(cpuc->assign[i], cpuc->dirty); if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX)) return; for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) { if (i >= INTEL_PMC_IDX_FIXED) { /* Metrics and fake events don't have corresponding HW counters. */ if ((i - INTEL_PMC_IDX_FIXED) >= hybrid(cpuc->pmu, num_counters_fixed)) continue; wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0); } else { wrmsrl(x86_pmu_event_addr(i), 0); } } bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX); } static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; /* * This function relies on not being called concurrently in two * tasks in the same mm. Otherwise one task could observe * perf_rdpmc_allowed > 1 and return all the way back to * userspace with CR4.PCE clear while another task is still * doing on_each_cpu_mask() to propagate CR4.PCE. * * For now, this can't happen because all callers hold mmap_lock * for write. If this changes, we'll need a different solution. */ mmap_assert_write_locked(mm); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) return 0; if (is_metric_idx(hwc->idx)) return INTEL_PMC_FIXED_RDPMC_METRICS + 1; else return hwc->event_base_rdpmc + 1; } static ssize_t get_attr_rdpmc(struct device *cdev, struct device_attribute *attr, char *buf) { return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); } static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) { unsigned long val; ssize_t ret; ret = kstrtoul(buf, 0, &val); if (ret) return ret; if (val > 2) return -EINVAL; if (x86_pmu.attr_rdpmc_broken) return -ENOTSUPP; if (val != x86_pmu.attr_rdpmc) { /* * Changing into or out of never available or always available, * aka perf-event-bypassing mode. This path is extremely slow, * but only root can trigger it, so it's okay. 
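 *
 * Accepted values: 0 never allows RDPMC, 1 (the default) allows it only
 * while a perf event using the counter is mmap'ed, 2 allows it
 * unconditionally. E.g. 'echo 2 > /sys/bus/event_source/devices/cpu/rdpmc'
 * (the typical sysfs location of this attribute) selects the
 * always-available mode.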
*/ if (val == 0) static_branch_inc(&rdpmc_never_available_key); else if (x86_pmu.attr_rdpmc == 0) static_branch_dec(&rdpmc_never_available_key); if (val == 2) static_branch_inc(&rdpmc_always_available_key); else if (x86_pmu.attr_rdpmc == 2) static_branch_dec(&rdpmc_always_available_key); on_each_cpu(cr4_update_pce, NULL, 1); x86_pmu.attr_rdpmc = val; } return count; } static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); static struct attribute *x86_pmu_attrs[] = { &dev_attr_rdpmc.attr, NULL, }; static struct attribute_group x86_pmu_attr_group __ro_after_init = { .attrs = x86_pmu_attrs, }; static ssize_t max_precise_show(struct device *cdev, struct device_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); } static DEVICE_ATTR_RO(max_precise); static struct attribute *x86_pmu_caps_attrs[] = { &dev_attr_max_precise.attr, NULL }; static struct attribute_group x86_pmu_caps_group __ro_after_init = { .name = "caps", .attrs = x86_pmu_caps_attrs, }; static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, &x86_pmu_events_group, &x86_pmu_caps_group, NULL, }; static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in); } static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc, struct perf_event_pmu_context *next_epc) { static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc); } void perf_check_microcode(void) { if (x86_pmu.check_microcode) x86_pmu.check_microcode(); } static int x86_pmu_check_period(struct perf_event *event, u64 value) { if (x86_pmu.check_period && x86_pmu.check_period(event, value)) return -EINVAL; if (value && x86_pmu.limit_period) { s64 left = value; x86_pmu.limit_period(event, &left); if (left > value) return -EINVAL; } return 0; } static int x86_pmu_aux_output_match(struct perf_event *event) { if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT)) return 0; if (x86_pmu.aux_output_match) return x86_pmu.aux_output_match(event); return 0; } static bool x86_pmu_filter(struct pmu *pmu, int cpu) { bool ret = false; static_call_cond(x86_pmu_filter)(pmu, cpu, &ret); return ret; } static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, .attr_groups = x86_pmu_attr_groups, .event_init = x86_pmu_event_init, .event_mapped = x86_pmu_event_mapped, .event_unmapped = x86_pmu_event_unmapped, .add = x86_pmu_add, .del = x86_pmu_del, .start = x86_pmu_start, .stop = x86_pmu_stop, .read = x86_pmu_read, .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, .swap_task_ctx = x86_pmu_swap_task_ctx, .check_period = x86_pmu_check_period, .aux_output_match = x86_pmu_aux_output_match, .filter = x86_pmu_filter, }; void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { struct cyc2ns_data data; u64 offset; userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT); userpg->pmc_width = x86_pmu.cntval_bits; if (!using_native_sched_clock() || !sched_clock_stable()) return; cyc2ns_read_begin(&data); offset = data.cyc2ns_offset + __sched_clock_offset; /* * Internal timekeeping for enabled/running/stopped times * is always in the local_clock domain. 
*/ userpg->cap_user_time = 1; userpg->time_mult = data.cyc2ns_mul; userpg->time_shift = data.cyc2ns_shift; userpg->time_offset = offset - now; /* * cap_user_time_zero doesn't make sense when we're using a different * time base for the records. */ if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; userpg->time_zero = offset; } cyc2ns_read_end(); } /* * Determine whether the regs were taken from an irq/exception handler rather * than from perf_arch_fetch_caller_regs(). */ static bool perf_hw_regs(struct pt_regs *regs) { return regs->flags & X86_EFLAGS_FIXED; } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct unwind_state state; unsigned long addr; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } if (perf_callchain_store(entry, regs->ip)) return; if (perf_hw_regs(regs)) unwind_start(&state, current, regs, NULL); else unwind_start(&state, current, NULL, (void *)regs->sp); for (; !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr || perf_callchain_store(entry, addr)) return; } } static inline int valid_user_frame(const void __user *fp, unsigned long size) { return __access_ok(fp, size); } static unsigned long get_segment_base(unsigned int segment) { struct desc_struct *desc; unsigned int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* IRQs are off, so this synchronizes with smp_store_release */ ldt = READ_ONCE(current->active_mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; desc = &ldt->entries[idx]; #else return 0; #endif } else { if (idx >= GDT_ENTRIES) return 0; desc = raw_cpu_ptr(gdt_page.gdt) + idx; } return get_desc_base(desc); } #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { /* 32-bit process in 64-bit kernel. */ unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; const struct stack_frame_ia32 __user *fp; if (user_64bit_mode(regs)) return 0; cs_base = get_segment_base(regs->cs); ss_base = get_segment_base(regs->ss); fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; if (__get_user(frame.next_frame, &fp->next_frame)) break; if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, cs_base + frame.return_address); fp = compat_ptr(ss_base + frame.next_frame); } pagefault_enable(); return 1; } #else static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { return 0; } #endif void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; const struct stack_frame __user *fp; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } /* * We don't know what to do with VM86 stacks.. ignore them for now. 
*/ if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) return; fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); if (!nmi_uaccess_okay()) return; if (perf_callchain_user32(regs, entry)) return; pagefault_disable(); while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; if (__get_user(frame.next_frame, &fp->next_frame)) break; if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, frame.return_address); fp = (void __user *)frame.next_frame; } pagefault_enable(); } /* * Deal with code segment offsets for the various execution modes: * * VM86 - the good olde 16 bit days, where the linear address is * 20 bits and we use regs->ip + 0x10 * regs->cs. * * IA32 - Where we need to look at GDT/LDT segment descriptor tables * to figure out what the 32bit base address is. * * X32 - has TIF_X32 set, but is running in x86_64 * * X86_64 - CS,DS,SS,ES are all zero based. */ static unsigned long code_segment_base(struct pt_regs *regs) { /* * For IA32 we look at the GDT/LDT segment base to convert the * effective IP to a linear address. */ #ifdef CONFIG_X86_32 /* * If we are in VM86 mode, add the segment offset to convert to a * linear address. */ if (regs->flags & X86_VM_MASK) return 0x10 * regs->cs; if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else if (user_mode(regs) && !user_64bit_mode(regs) && regs->cs != __USER32_CS) return get_segment_base(regs->cs); #endif return 0; } unsigned long perf_instruction_pointer(struct pt_regs *regs) { if (perf_guest_state()) return perf_guest_get_ip(); return regs->ip + code_segment_base(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { unsigned int guest_state = perf_guest_state(); int misc = 0; if (guest_state) { if (guest_state & PERF_GUEST_USER) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; } else { if (user_mode(regs)) misc |= PERF_RECORD_MISC_USER; else misc |= PERF_RECORD_MISC_KERNEL; } if (regs->flags & PERF_EFLAGS_EXACT) misc |= PERF_RECORD_MISC_EXACT_IP; return misc; } void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { /* This API doesn't currently support enumerating hybrid PMUs. */ if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) || !x86_pmu_initialized()) { memset(cap, 0, sizeof(*cap)); return; } /* * Note, hybrid CPU models get tracked as having hybrid PMUs even when * all E-cores are disabled via BIOS. When E-cores are disabled, the * base PMU holds the correct number of counters for P-cores. */ cap->version = x86_pmu.version; cap->num_counters_gp = x86_pmu.num_counters; cap->num_counters_fixed = x86_pmu.num_counters_fixed; cap->bit_width_gp = x86_pmu.cntval_bits; cap->bit_width_fixed = x86_pmu.cntval_bits; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; } EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); u64 perf_get_hw_event_config(int hw_event) { int max = x86_pmu.max_events; if (hw_event < max) return x86_pmu.event_map(array_index_nospec(hw_event, max)); return 0; } EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
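/*
 * A minimal user-space sketch (not part of this file) of how the state
 * exported above is meant to be consumed: x86_pmu_event_idx() supplies the
 * RDPMC index + 1 that the core publishes in the mmap()ed
 * perf_event_mmap_page, arch_perf_update_userpage() fills in pmc_width and
 * the time conversion fields, and the rdpmc sysfs knob together with
 * x86_pmu_event_mapped() decides whether CR4.PCE permits the instruction at
 * all.  The loop below follows the lockless self-monitoring read sequence
 * documented for struct perf_event_mmap_page; it assumes the caller already
 * opened an event with perf_event_open() and mmap()ed its first page.  The
 * helper names are illustrative only, not kernel or libc API.
 */
#include <stdint.h>
#include <linux/perf_event.h>

static inline uint64_t sketch_rdpmc(uint32_t counter)
{
	uint32_t lo, hi;

	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
	return ((uint64_t)hi << 32) | lo;
}

/* Lockless counter read, retried against the kernel's userpage seqcount. */
static uint64_t sketch_read_counter(volatile struct perf_event_mmap_page *pc)
{
	uint64_t count;
	uint32_t seq, idx;

	do {
		seq = pc->lock;
		asm volatile("" ::: "memory");

		idx = pc->index;	/* hardware counter + 1; 0 means not readable */
		count = pc->offset;
		if (pc->cap_user_rdpmc && idx) {
			/* Sign-extend the raw pmc_width-bit counter value. */
			int64_t pmc = sketch_rdpmc(idx - 1);

			pmc <<= 64 - pc->pmc_width;
			pmc >>= 64 - pc->pmc_width;
			count += pmc;
		}

		asm volatile("" ::: "memory");
	} while (pc->lock != seq);

	return count;
}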
// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/profile.c * Simple profiling. Manages a direct-mapped profile hit count buffer, * with configurable resolution, support for restricting the cpus on * which profiling is done, and switching between cpu time and * schedule() calls via kernel command line parameters passed at boot.
* * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, * Red Hat, July 2004 * Consolidation of architecture support code for profiling, * Nadia Yvette Chambers, Oracle, July 2004 * Amortized hit count accounting via per-cpu open-addressed hashtables * to resolve timer interrupt livelocks, Nadia Yvette Chambers, * Oracle, 2004 */ #include <linux/export.h> #include <linux/profile.h> #include <linux/memblock.h> #include <linux/notifier.h> #include <linux/mm.h> #include <linux/cpumask.h> #include <linux/cpu.h> #include <linux/highmem.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/sched/stat.h> #include <asm/sections.h> #include <asm/irq_regs.h> #include <asm/ptrace.h> struct profile_hit { u32 pc, hits; }; #define PROFILE_GRPSHIFT 3 #define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT) #define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) static atomic_t *prof_buffer; static unsigned long prof_len; static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); static cpumask_var_t prof_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); #endif /* CONFIG_SMP */ int profile_setup(char *str) { static const char schedstr[] = "schedule"; static const char sleepstr[] = "sleep"; static const char kvmstr[] = "kvm"; const char *select = NULL; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS force_schedstat_enabled(); prof_on = SLEEP_PROFILING; select = sleepstr; #else pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); #endif /* CONFIG_SCHEDSTATS */ } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; select = schedstr; } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; select = kvmstr; } else if (get_option(&str, &par)) { prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } if (select) { if (str[strlen(select)] == ',') str += strlen(select) + 1; if (get_option(&str, &par)) prof_shift = clamp(par, 0, BITS_PER_LONG - 1); pr_info("kernel %s profiling enabled (shift: %u)\n", select, prof_shift); } return 1; } __setup("profile=", profile_setup); int __ref profile_init(void) { int buffer_bytes; if (!prof_on) return 0; /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; if (!prof_len) { pr_warn("profiling shift: %u too large\n", prof_shift); prof_on = 0; return -EINVAL; } buffer_bytes = prof_len*sizeof(atomic_t); if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) return -ENOMEM; cpumask_copy(prof_cpu_mask, cpu_possible_mask); prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); if (prof_buffer) return 0; prof_buffer = vzalloc(buffer_bytes); if (prof_buffer) return 0; free_cpumask_var(prof_cpu_mask); return -ENOMEM; } #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) /* * Each cpu has a pair of open-addressed hashtables for pending * profile hits. read_profile() IPI's all cpus to request them * to flip buffers and flushes their contents to prof_buffer itself. * Flip requests are serialized by the profile_flip_mutex. 
The sole * use of having a second hashtable is for avoiding cacheline * contention that would otherwise happen during flushes of pending * profile hits required for the accuracy of reported profile hits * and so resurrect the interrupt livelock issue. * * The open-addressed hashtables are indexed by profile buffer slot * and hold the number of pending hits to that profile buffer slot on * a cpu in an entry. When the hashtable overflows, all pending hits * are accounted to their corresponding profile buffer slots with * atomic_add() and the hashtable emptied. As numerous pending hits * may be accounted to a profile buffer slot in a hashtable entry, * this amortizes a number of atomic profile buffer increments likely * to be far larger than the number of entries in the hashtable, * particularly given that the number of distinct profile buffer * positions to which hits are accounted during short intervals (e.g. * several seconds) is usually very small. Exclusion from buffer * flipping is provided by interrupt disablement (note that for * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from * process context). * The hash function is meant to be lightweight as opposed to strong, * and was vaguely inspired by ppc64 firmware-supported inverted * pagetable hash functions, but uses a full hashtable full of finite * collision chains, not just pairs of them. * * -- nyc */ static void __profile_flip_buffers(void *unused) { int cpu = smp_processor_id(); per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); } static void profile_flip_buffers(void) { int i, j, cpu; mutex_lock(&profile_flip_mutex); j = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; for (i = 0; i < NR_PROFILE_HIT; ++i) { if (!hits[i].hits) { if (hits[i].pc) hits[i].pc = 0; continue; } atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); hits[i].hits = hits[i].pc = 0; } } mutex_unlock(&profile_flip_mutex); } static void profile_discard_flip_buffers(void) { int i, cpu; mutex_lock(&profile_flip_mutex); i = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); } mutex_unlock(&profile_flip_mutex); } static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long primary, secondary, flags, pc = (unsigned long)__pc; int i, j, cpu; struct profile_hit *hits; pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; cpu = get_cpu(); hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; if (!hits) { put_cpu(); return; } /* * We buffer the global profiler buffer into a per-CPU * queue and thus reduce the number of global (and possibly * NUMA-alien) accesses. 
The write-queue is self-coalescing: */ local_irq_save(flags); do { for (j = 0; j < PROFILE_GRPSZ; ++j) { if (hits[i + j].pc == pc) { hits[i + j].hits += nr_hits; goto out; } else if (!hits[i + j].hits) { hits[i + j].pc = pc; hits[i + j].hits = nr_hits; goto out; } } i = (i + secondary) & (NR_PROFILE_HIT - 1); } while (i != primary); /* * Add the current hit(s) and flush the write-queue out * to the global buffer: */ atomic_add(nr_hits, &prof_buffer[pc]); for (i = 0; i < NR_PROFILE_HIT; ++i) { atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); hits[i].pc = hits[i].hits = 0; } out: local_irq_restore(flags); put_cpu(); } static int profile_dead_cpu(unsigned int cpu) { struct page *page; int i; if (cpumask_available(prof_cpu_mask)) cpumask_clear_cpu(cpu, prof_cpu_mask); for (i = 0; i < 2; i++) { if (per_cpu(cpu_profile_hits, cpu)[i]) { page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); per_cpu(cpu_profile_hits, cpu)[i] = NULL; __free_page(page); } } return 0; } static int profile_prepare_cpu(unsigned int cpu) { int i, node = cpu_to_mem(cpu); struct page *page; per_cpu(cpu_profile_flip, cpu) = 0; for (i = 0; i < 2; i++) { if (per_cpu(cpu_profile_hits, cpu)[i]) continue; page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) { profile_dead_cpu(cpu); return -ENOMEM; } per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); } return 0; } static int profile_online_cpu(unsigned int cpu) { if (cpumask_available(prof_cpu_mask)) cpumask_set_cpu(cpu, prof_cpu_mask); return 0; } #else /* !CONFIG_SMP */ #define profile_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0) static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); } #endif /* !CONFIG_SMP */ void profile_hits(int type, void *__pc, unsigned int nr_hits) { if (prof_on != type || !prof_buffer) return; do_profile_hits(type, __pc, nr_hits); } EXPORT_SYMBOL_GPL(profile_hits); void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); if (!user_mode(regs) && cpumask_available(prof_cpu_mask) && cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); } #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/uaccess.h> static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) { seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask)); return 0; } static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file) { return single_open(file, prof_cpu_mask_proc_show, NULL); } static ssize_t prof_cpu_mask_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { cpumask_var_t new_value; int err; if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(buffer, count, new_value); if (!err) { cpumask_copy(prof_cpu_mask, new_value); err = count; } free_cpumask_var(new_value); return err; } static const struct proc_ops prof_cpu_mask_proc_ops = { .proc_open = prof_cpu_mask_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_write = prof_cpu_mask_proc_write, }; void create_prof_cpu_mask(void) { /* create /proc/irq/prof_cpu_mask */ proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops); } /* * This function accesses profiling information. The returned data is * binary: the sampling step and the actual contents of the profile * buffer. 
Use of the program readprofile is recommended in order to * get meaningful info out of these data. */ static ssize_t read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long p = *ppos; ssize_t read; char *pnt; unsigned long sample_step = 1UL << prof_shift; profile_flip_buffers(); if (p >= (prof_len+1)*sizeof(unsigned int)) return 0; if (count > (prof_len+1)*sizeof(unsigned int) - p) count = (prof_len+1)*sizeof(unsigned int) - p; read = 0; while (p < sizeof(unsigned int) && count > 0) { if (put_user(*((char *)(&sample_step)+p), buf)) return -EFAULT; buf++; p++; count--; read++; } pnt = (char *)prof_buffer + p - sizeof(atomic_t); if (copy_to_user(buf, (void *)pnt, count)) return -EFAULT; read += count; *ppos += read; return read; } /* default is to not implement this call */ int __weak setup_profiling_timer(unsigned mult) { return -EINVAL; } /* * Writing to /proc/profile resets the counters * * Writing a 'profiling multiplier' value into it also re-sets the profiling * interrupt frequency, on architectures that support this. */ static ssize_t write_profile(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { #ifdef CONFIG_SMP if (count == sizeof(int)) { unsigned int multiplier; if (copy_from_user(&multiplier, buf, sizeof(int))) return -EFAULT; if (setup_profiling_timer(multiplier)) return -EINVAL; } #endif profile_discard_flip_buffers(); memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); return count; } static const struct proc_ops profile_proc_ops = { .proc_read = read_profile, .proc_write = write_profile, .proc_lseek = default_llseek, }; int __ref create_proc_profile(void) { struct proc_dir_entry *entry; #ifdef CONFIG_SMP enum cpuhp_state online_state; #endif int err = 0; if (!prof_on) return 0; #ifdef CONFIG_SMP err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", profile_prepare_cpu, profile_dead_cpu); if (err) return err; err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", profile_online_cpu, NULL); if (err < 0) goto err_state_prep; online_state = err; err = 0; #endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &profile_proc_ops); if (!entry) goto err_state_onl; proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); return err; err_state_onl: #ifdef CONFIG_SMP cpuhp_remove_state(online_state); err_state_prep: cpuhp_remove_state(CPUHP_PROFILE_PREPARE); #endif return err; } subsys_initcall(create_proc_profile); #endif /* CONFIG_PROC_FS */
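/*
 * A minimal user-space sketch (not part of this file) of the binary layout
 * that read_profile() above produces for /proc/profile: one unsigned int
 * holding the sample step (1 << prof_shift), followed by one hit counter per
 * text slot, where slot i covers kernel text starting at _stext + i * step.
 * Mapping slots back to symbols needs System.map, which readprofile(8) does;
 * this sketch only dumps the step and the non-zero slots.  It assumes
 * atomic_t has the same size as unsigned int, as read_profile() itself does.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/profile", "rb");
	unsigned int step, count;
	unsigned long slot = 0;

	if (!f) {
		perror("/proc/profile");
		return 1;
	}
	if (fread(&step, sizeof(step), 1, f) != 1) {
		fclose(f);
		return 1;
	}
	printf("sample step: %u bytes of text per slot\n", step);

	while (fread(&count, sizeof(count), 1, f) == 1) {
		if (count)
			printf("slot %lu (_stext + %#lx): %u hits\n",
			       slot, slot * (unsigned long)step, count);
		slot++;
	}

	fclose(f);
	return 0;
}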
// SPDX-License-Identifier: GPL-2.0-only /* * AT and PS/2 keyboard driver * * Copyright (c) 1999-2002 Vojtech Pavlik */ /* * This driver can handle standard AT keyboards and PS/2 keyboards in * Translated and Raw Set 2 and Set 3, as well as AT keyboards on dumb * input-only controllers and AT keyboards connected over a one way RS232 * converter.
*/ #include <linux/delay.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/input.h> #include <linux/input/vivaldi-fmap.h> #include <linux/serio.h> #include <linux/workqueue.h> #include <linux/libps2.h> #include <linux/mutex.h> #include <linux/dmi.h> #include <linux/property.h> #define DRIVER_DESC "AT and PS/2 keyboard driver" MODULE_AUTHOR("Vojtech Pavlik <vojtech@suse.cz>"); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); static int atkbd_set = 2; module_param_named(set, atkbd_set, int, 0); MODULE_PARM_DESC(set, "Select keyboard code set (2 = default, 3 = PS/2 native)"); #if defined(__i386__) || defined(__x86_64__) || defined(__hppa__) static bool atkbd_reset; #else static bool atkbd_reset = true; #endif module_param_named(reset, atkbd_reset, bool, 0); MODULE_PARM_DESC(reset, "Reset keyboard during initialization"); static bool atkbd_softrepeat; module_param_named(softrepeat, atkbd_softrepeat, bool, 0); MODULE_PARM_DESC(softrepeat, "Use software keyboard repeat"); static bool atkbd_softraw = true; module_param_named(softraw, atkbd_softraw, bool, 0); MODULE_PARM_DESC(softraw, "Use software generated rawmode"); static bool atkbd_scroll; module_param_named(scroll, atkbd_scroll, bool, 0); MODULE_PARM_DESC(scroll, "Enable scroll-wheel on MS Office and similar keyboards"); static bool atkbd_extra; module_param_named(extra, atkbd_extra, bool, 0); MODULE_PARM_DESC(extra, "Enable extra LEDs and keys on IBM RapidAcces, EzKey and similar keyboards"); static bool atkbd_terminal; module_param_named(terminal, atkbd_terminal, bool, 0); MODULE_PARM_DESC(terminal, "Enable break codes on an IBM Terminal keyboard connected via AT/PS2"); #define SCANCODE(keymap) ((keymap >> 16) & 0xFFFF) #define KEYCODE(keymap) (keymap & 0xFFFF) /* * Scancode to keycode tables. These are just the default setting, and * are loadable via a userland utility. 
*/ #define ATKBD_KEYMAP_SIZE 512 static const unsigned short atkbd_set2_keycode[ATKBD_KEYMAP_SIZE] = { #ifdef CONFIG_KEYBOARD_ATKBD_HP_KEYCODES /* XXX: need a more general approach */ #include "hpps2atkbd.h" /* include the keyboard scancodes */ #else 0, 67, 65, 63, 61, 59, 60, 88, 0, 68, 66, 64, 62, 15, 41,117, 0, 56, 42, 93, 29, 16, 2, 0, 0, 0, 44, 31, 30, 17, 3, 0, 0, 46, 45, 32, 18, 5, 4, 95, 0, 57, 47, 33, 20, 19, 6,183, 0, 49, 48, 35, 34, 21, 7,184, 0, 0, 50, 36, 22, 8, 9,185, 0, 51, 37, 23, 24, 11, 10, 0, 0, 52, 53, 38, 39, 25, 12, 0, 0, 89, 40, 0, 26, 13, 0, 0, 58, 54, 28, 27, 0, 43, 0, 85, 0, 86, 91, 90, 92, 0, 14, 94, 0, 79,124, 75, 71,121, 0, 0, 82, 83, 80, 76, 77, 72, 1, 69, 87, 78, 81, 74, 55, 73, 70, 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 217,100,255, 0, 97,165, 0, 0,156, 0, 0, 0, 0, 0, 0,125, 173,114, 0,113, 0, 0, 0,126,128, 0, 0,140, 0, 0, 0,127, 159, 0,115, 0,164, 0, 0,116,158, 0,172,166, 0, 0, 0,142, 157, 0, 0, 0, 0, 0, 0, 0,155, 0, 98, 0, 0,163, 0, 0, 226, 0, 0, 0, 0, 0, 0, 0, 0,255, 96, 0, 0, 0,143, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,107, 0,105,102, 0, 0,112, 110,111,108,112,106,103, 0,119, 0,118,109, 0, 99,104,119, 0, 0, 0, 0, 65, 99, #endif }; static const unsigned short atkbd_set3_keycode[ATKBD_KEYMAP_SIZE] = { 0, 0, 0, 0, 0, 0, 0, 59, 1,138,128,129,130, 15, 41, 60, 131, 29, 42, 86, 58, 16, 2, 61,133, 56, 44, 31, 30, 17, 3, 62, 134, 46, 45, 32, 18, 5, 4, 63,135, 57, 47, 33, 20, 19, 6, 64, 136, 49, 48, 35, 34, 21, 7, 65,137,100, 50, 36, 22, 8, 9, 66, 125, 51, 37, 23, 24, 11, 10, 67,126, 52, 53, 38, 39, 25, 12, 68, 113,114, 40, 43, 26, 13, 87, 99, 97, 54, 28, 27, 43, 43, 88, 70, 108,105,119,103,111,107, 14,110, 0, 79,106, 75, 71,109,102,104, 82, 83, 80, 76, 77, 72, 69, 98, 0, 96, 81, 0, 78, 73, 55,183, 184,185,186,187, 74, 94, 92, 93, 0, 0, 0,125,126,127,112, 0, 0,139,172,163,165,115,152,172,166,140,160,154,113,114,167,168, 148,149,147,140 }; static const unsigned short atkbd_unxlate_table[128] = { 0,118, 22, 30, 38, 37, 46, 54, 61, 62, 70, 69, 78, 85,102, 13, 21, 29, 36, 45, 44, 53, 60, 67, 68, 77, 84, 91, 90, 20, 28, 27, 35, 43, 52, 51, 59, 66, 75, 76, 82, 14, 18, 93, 26, 34, 33, 42, 50, 49, 58, 65, 73, 74, 89,124, 17, 41, 88, 5, 6, 4, 12, 3, 11, 2, 10, 1, 9,119,126,108,117,125,123,107,115,116,121,105, 114,122,112,113,127, 96, 97,120, 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 86, 94, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 87,111, 19, 25, 57, 81, 83, 92, 95, 98, 99,100,101,103,104,106,109,110 }; #define ATKBD_CMD_SETLEDS 0x10ed #define ATKBD_CMD_GSCANSET 0x11f0 #define ATKBD_CMD_SSCANSET 0x10f0 #define ATKBD_CMD_GETID 0x02f2 #define ATKBD_CMD_SETREP 0x10f3 #define ATKBD_CMD_ENABLE 0x00f4 #define ATKBD_CMD_RESET_DIS 0x00f5 /* Reset to defaults and disable */ #define ATKBD_CMD_RESET_DEF 0x00f6 /* Reset to defaults */ #define ATKBD_CMD_SETALL_MB 0x00f8 /* Set all keys to give break codes */ #define ATKBD_CMD_SETALL_MBR 0x00fa /* ... 
and repeat */ #define ATKBD_CMD_RESET_BAT 0x02ff #define ATKBD_CMD_RESEND 0x00fe #define ATKBD_CMD_EX_ENABLE 0x10ea #define ATKBD_CMD_EX_SETLEDS 0x20eb #define ATKBD_CMD_OK_GETID 0x02e8 #define ATKBD_RET_ACK 0xfa #define ATKBD_RET_NAK 0xfe #define ATKBD_RET_BAT 0xaa #define ATKBD_RET_EMUL0 0xe0 #define ATKBD_RET_EMUL1 0xe1 #define ATKBD_RET_RELEASE 0xf0 #define ATKBD_RET_HANJA 0xf1 #define ATKBD_RET_HANGEUL 0xf2 #define ATKBD_RET_ERR 0xff #define ATKBD_KEY_UNKNOWN 0 #define ATKBD_KEY_NULL 255 #define ATKBD_SCR_1 0xfffe #define ATKBD_SCR_2 0xfffd #define ATKBD_SCR_4 0xfffc #define ATKBD_SCR_8 0xfffb #define ATKBD_SCR_CLICK 0xfffa #define ATKBD_SCR_LEFT 0xfff9 #define ATKBD_SCR_RIGHT 0xfff8 #define ATKBD_SPECIAL ATKBD_SCR_RIGHT #define ATKBD_LED_EVENT_BIT 0 #define ATKBD_REP_EVENT_BIT 1 #define ATKBD_XL_ERR 0x01 #define ATKBD_XL_BAT 0x02 #define ATKBD_XL_ACK 0x04 #define ATKBD_XL_NAK 0x08 #define ATKBD_XL_HANGEUL 0x10 #define ATKBD_XL_HANJA 0x20 static const struct { unsigned short keycode; unsigned char set2; } atkbd_scroll_keys[] = { { ATKBD_SCR_1, 0xc5 }, { ATKBD_SCR_2, 0x9d }, { ATKBD_SCR_4, 0xa4 }, { ATKBD_SCR_8, 0x9b }, { ATKBD_SCR_CLICK, 0xe0 }, { ATKBD_SCR_LEFT, 0xcb }, { ATKBD_SCR_RIGHT, 0xd2 }, }; /* * The atkbd control structure */ struct atkbd { struct ps2dev ps2dev; struct input_dev *dev; /* Written only during init */ char name[64]; char phys[32]; unsigned short id; unsigned short keycode[ATKBD_KEYMAP_SIZE]; DECLARE_BITMAP(force_release_mask, ATKBD_KEYMAP_SIZE); unsigned char set; bool translated; bool extra; bool write; bool softrepeat; bool softraw; bool scroll; bool enabled; /* Accessed only from interrupt */ unsigned char emul; bool resend; bool release; unsigned long xl_bit; unsigned int last; unsigned long time; unsigned long err_count; struct delayed_work event_work; unsigned long event_jiffies; unsigned long event_mask; /* Serializes reconnect(), attr->set() and event work */ struct mutex mutex; struct vivaldi_data vdata; }; /* * System-specific keymap fixup routine */ static void (*atkbd_platform_fixup)(struct atkbd *, const void *data); static void *atkbd_platform_fixup_data; static unsigned int (*atkbd_platform_scancode_fixup)(struct atkbd *, unsigned int); /* * Certain keyboards to not like ATKBD_CMD_RESET_DIS and stop responding * to many commands until full reset (ATKBD_CMD_RESET_BAT) is performed. 
*/ static bool atkbd_skip_deactivate; static ssize_t atkbd_attr_show_helper(struct device *dev, char *buf, ssize_t (*handler)(struct atkbd *, char *)); static ssize_t atkbd_attr_set_helper(struct device *dev, const char *buf, size_t count, ssize_t (*handler)(struct atkbd *, const char *, size_t)); #define ATKBD_DEFINE_ATTR(_name) \ static ssize_t atkbd_show_##_name(struct atkbd *, char *); \ static ssize_t atkbd_set_##_name(struct atkbd *, const char *, size_t); \ static ssize_t atkbd_do_show_##_name(struct device *d, \ struct device_attribute *attr, char *b) \ { \ return atkbd_attr_show_helper(d, b, atkbd_show_##_name); \ } \ static ssize_t atkbd_do_set_##_name(struct device *d, \ struct device_attribute *attr, const char *b, size_t s) \ { \ return atkbd_attr_set_helper(d, b, s, atkbd_set_##_name); \ } \ static struct device_attribute atkbd_attr_##_name = \ __ATTR(_name, S_IWUSR | S_IRUGO, atkbd_do_show_##_name, atkbd_do_set_##_name); ATKBD_DEFINE_ATTR(extra); ATKBD_DEFINE_ATTR(force_release); ATKBD_DEFINE_ATTR(scroll); ATKBD_DEFINE_ATTR(set); ATKBD_DEFINE_ATTR(softrepeat); ATKBD_DEFINE_ATTR(softraw); #define ATKBD_DEFINE_RO_ATTR(_name) \ static ssize_t atkbd_show_##_name(struct atkbd *, char *); \ static ssize_t atkbd_do_show_##_name(struct device *d, \ struct device_attribute *attr, char *b) \ { \ return atkbd_attr_show_helper(d, b, atkbd_show_##_name); \ } \ static struct device_attribute atkbd_attr_##_name = \ __ATTR(_name, S_IRUGO, atkbd_do_show_##_name, NULL); ATKBD_DEFINE_RO_ATTR(err_count); ATKBD_DEFINE_RO_ATTR(function_row_physmap); static struct attribute *atkbd_attributes[] = { &atkbd_attr_extra.attr, &atkbd_attr_force_release.attr, &atkbd_attr_scroll.attr, &atkbd_attr_set.attr, &atkbd_attr_softrepeat.attr, &atkbd_attr_softraw.attr, &atkbd_attr_err_count.attr, &atkbd_attr_function_row_physmap.attr, NULL }; static ssize_t atkbd_show_function_row_physmap(struct atkbd *atkbd, char *buf) { return vivaldi_function_row_physmap_show(&atkbd->vdata, buf); } static struct atkbd *atkbd_from_serio(struct serio *serio) { struct ps2dev *ps2dev = serio_get_drvdata(serio); return container_of(ps2dev, struct atkbd, ps2dev); } static umode_t atkbd_attr_is_visible(struct kobject *kobj, struct attribute *attr, int i) { struct device *dev = kobj_to_dev(kobj); struct serio *serio = to_serio_port(dev); struct atkbd *atkbd = atkbd_from_serio(serio); if (attr == &atkbd_attr_function_row_physmap.attr && !atkbd->vdata.num_function_row_keys) return 0; return attr->mode; } static const struct attribute_group atkbd_attribute_group = { .attrs = atkbd_attributes, .is_visible = atkbd_attr_is_visible, }; __ATTRIBUTE_GROUPS(atkbd_attribute); static const unsigned int xl_table[] = { ATKBD_RET_BAT, ATKBD_RET_ERR, ATKBD_RET_ACK, ATKBD_RET_NAK, ATKBD_RET_HANJA, ATKBD_RET_HANGEUL, }; /* * Checks if we should mangle the scancode to extract 'release' bit * in translated mode. */ static bool atkbd_need_xlate(unsigned long xl_bit, unsigned char code) { int i; if (code == ATKBD_RET_EMUL0 || code == ATKBD_RET_EMUL1) return false; for (i = 0; i < ARRAY_SIZE(xl_table); i++) if (code == xl_table[i]) return test_bit(i, &xl_bit); return true; } /* * Calculates new value of xl_bit so the driver can distinguish * between make/break pair of scancodes for select keys and PS/2 * protocol responses. 
*/ static void atkbd_calculate_xl_bit(struct atkbd *atkbd, unsigned char code) { int i; for (i = 0; i < ARRAY_SIZE(xl_table); i++) { if (!((code ^ xl_table[i]) & 0x7f)) { if (code & 0x80) __clear_bit(i, &atkbd->xl_bit); else __set_bit(i, &atkbd->xl_bit); break; } } } /* * Encode the scancode, 0xe0 prefix, and high bit into a single integer, * keeping kernel 2.4 compatibility for set 2 */ static unsigned int atkbd_compat_scancode(struct atkbd *atkbd, unsigned int code) { if (atkbd->set == 3) { if (atkbd->emul == 1) code |= 0x100; } else { code = (code & 0x7f) | ((code & 0x80) << 1); if (atkbd->emul == 1) code |= 0x80; } return code; } /* * Tries to handle frame or parity error by requesting the keyboard controller * to resend the last byte. This historically not done on x86 as controllers * there typically do not implement this command. */ static bool __maybe_unused atkbd_handle_frame_error(struct ps2dev *ps2dev, u8 data, unsigned int flags) { struct atkbd *atkbd = container_of(ps2dev, struct atkbd, ps2dev); struct serio *serio = ps2dev->serio; if ((flags & (SERIO_FRAME | SERIO_PARITY)) && (~flags & SERIO_TIMEOUT) && !atkbd->resend && atkbd->write) { dev_warn(&serio->dev, "Frame/parity error: %02x\n", flags); serio_write(serio, ATKBD_CMD_RESEND); atkbd->resend = true; return true; } if (!flags && data == ATKBD_RET_ACK) atkbd->resend = false; return false; } static enum ps2_disposition atkbd_pre_receive_byte(struct ps2dev *ps2dev, u8 data, unsigned int flags) { struct serio *serio = ps2dev->serio; dev_dbg(&serio->dev, "Received %02x flags %02x\n", data, flags); #if !defined(__i386__) && !defined (__x86_64__) if (atkbd_handle_frame_error(ps2dev, data, flags)) return PS2_IGNORE; #endif return PS2_PROCESS; } static void atkbd_receive_byte(struct ps2dev *ps2dev, u8 data) { struct serio *serio = ps2dev->serio; struct atkbd *atkbd = container_of(ps2dev, struct atkbd, ps2dev); struct input_dev *dev = atkbd->dev; unsigned int code = data; int scroll = 0, hscroll = 0, click = -1; int value; unsigned short keycode; pm_wakeup_event(&serio->dev, 0); if (!atkbd->enabled) return; input_event(dev, EV_MSC, MSC_RAW, code); if (atkbd_platform_scancode_fixup) code = atkbd_platform_scancode_fixup(atkbd, code); if (atkbd->translated) { if (atkbd->emul || atkbd_need_xlate(atkbd->xl_bit, code)) { atkbd->release = code >> 7; code &= 0x7f; } if (!atkbd->emul) atkbd_calculate_xl_bit(atkbd, data); } switch (code) { case ATKBD_RET_BAT: atkbd->enabled = false; serio_reconnect(atkbd->ps2dev.serio); return; case ATKBD_RET_EMUL0: atkbd->emul = 1; return; case ATKBD_RET_EMUL1: atkbd->emul = 2; return; case ATKBD_RET_RELEASE: atkbd->release = true; return; case ATKBD_RET_ACK: case ATKBD_RET_NAK: if (printk_ratelimit()) dev_warn(&serio->dev, "Spurious %s on %s. " "Some program might be trying to access hardware directly.\n", data == ATKBD_RET_ACK ? "ACK" : "NAK", serio->phys); return; case ATKBD_RET_ERR: atkbd->err_count++; dev_dbg(&serio->dev, "Keyboard on %s reports too many keys pressed.\n", serio->phys); return; } code = atkbd_compat_scancode(atkbd, code); if (atkbd->emul && --atkbd->emul) return; keycode = atkbd->keycode[code]; if (!(atkbd->release && test_bit(code, atkbd->force_release_mask))) if (keycode != ATKBD_KEY_NULL) input_event(dev, EV_MSC, MSC_SCAN, code); switch (keycode) { case ATKBD_KEY_NULL: break; case ATKBD_KEY_UNKNOWN: dev_warn(&serio->dev, "Unknown key %s (%s set %d, code %#x on %s).\n", atkbd->release ? "released" : "pressed", atkbd->translated ? 
"translated" : "raw", atkbd->set, code, serio->phys); dev_warn(&serio->dev, "Use 'setkeycodes %s%02x <keycode>' to make it known.\n", code & 0x80 ? "e0" : "", code & 0x7f); input_sync(dev); break; case ATKBD_SCR_1: scroll = 1; break; case ATKBD_SCR_2: scroll = 2; break; case ATKBD_SCR_4: scroll = 4; break; case ATKBD_SCR_8: scroll = 8; break; case ATKBD_SCR_CLICK: click = !atkbd->release; break; case ATKBD_SCR_LEFT: hscroll = -1; break; case ATKBD_SCR_RIGHT: hscroll = 1; break; default: if (atkbd->release) { value = 0; atkbd->last = 0; } else if (!atkbd->softrepeat && test_bit(keycode, dev->key)) { /* Workaround Toshiba laptop multiple keypress */ value = time_before(jiffies, atkbd->time) && atkbd->last == code ? 1 : 2; } else { value = 1; atkbd->last = code; atkbd->time = jiffies + msecs_to_jiffies(dev->rep[REP_DELAY]) / 2; } input_event(dev, EV_KEY, keycode, value); input_sync(dev); if (value && test_bit(code, atkbd->force_release_mask)) { input_event(dev, EV_MSC, MSC_SCAN, code); input_report_key(dev, keycode, 0); input_sync(dev); } } if (atkbd->scroll) { if (click != -1) input_report_key(dev, BTN_MIDDLE, click); input_report_rel(dev, REL_WHEEL, atkbd->release ? -scroll : scroll); input_report_rel(dev, REL_HWHEEL, hscroll); input_sync(dev); } atkbd->release = false; } static int atkbd_set_repeat_rate(struct atkbd *atkbd) { const short period[32] = { 33, 37, 42, 46, 50, 54, 58, 63, 67, 75, 83, 92, 100, 109, 116, 125, 133, 149, 167, 182, 200, 217, 232, 250, 270, 303, 333, 370, 400, 435, 470, 500 }; const short delay[4] = { 250, 500, 750, 1000 }; struct input_dev *dev = atkbd->dev; unsigned char param; int i = 0, j = 0; while (i < ARRAY_SIZE(period) - 1 && period[i] < dev->rep[REP_PERIOD]) i++; dev->rep[REP_PERIOD] = period[i]; while (j < ARRAY_SIZE(delay) - 1 && delay[j] < dev->rep[REP_DELAY]) j++; dev->rep[REP_DELAY] = delay[j]; param = i | (j << 5); return ps2_command(&atkbd->ps2dev, &param, ATKBD_CMD_SETREP); } static int atkbd_set_leds(struct atkbd *atkbd) { struct input_dev *dev = atkbd->dev; unsigned char param[2]; param[0] = (test_bit(LED_SCROLLL, dev->led) ? 1 : 0) | (test_bit(LED_NUML, dev->led) ? 2 : 0) | (test_bit(LED_CAPSL, dev->led) ? 4 : 0); if (ps2_command(&atkbd->ps2dev, param, ATKBD_CMD_SETLEDS)) return -1; if (atkbd->extra) { param[0] = 0; param[1] = (test_bit(LED_COMPOSE, dev->led) ? 0x01 : 0) | (test_bit(LED_SLEEP, dev->led) ? 0x02 : 0) | (test_bit(LED_SUSPEND, dev->led) ? 0x04 : 0) | (test_bit(LED_MISC, dev->led) ? 0x10 : 0) | (test_bit(LED_MUTE, dev->led) ? 0x20 : 0); if (ps2_command(&atkbd->ps2dev, param, ATKBD_CMD_EX_SETLEDS)) return -1; } return 0; } /* * atkbd_event_work() is used to complete processing of events that * can not be processed by input_event() which is often called from * interrupt context. */ static void atkbd_event_work(struct work_struct *work) { struct atkbd *atkbd = container_of(work, struct atkbd, event_work.work); mutex_lock(&atkbd->mutex); if (!atkbd->enabled) { /* * Serio ports are resumed asynchronously so while driver core * thinks that device is already fully operational in reality * it may not be ready yet. In this case we need to keep * rescheduling till reconnect completes. */ schedule_delayed_work(&atkbd->event_work, msecs_to_jiffies(100)); } else { if (test_and_clear_bit(ATKBD_LED_EVENT_BIT, &atkbd->event_mask)) atkbd_set_leds(atkbd); if (test_and_clear_bit(ATKBD_REP_EVENT_BIT, &atkbd->event_mask)) atkbd_set_repeat_rate(atkbd); } mutex_unlock(&atkbd->mutex); } /* * Schedule switch for execution. 
We need to throttle requests, * otherwise keyboard may become unresponsive. */ static void atkbd_schedule_event_work(struct atkbd *atkbd, int event_bit) { unsigned long delay = msecs_to_jiffies(50); if (time_after(jiffies, atkbd->event_jiffies + delay)) delay = 0; atkbd->event_jiffies = jiffies; set_bit(event_bit, &atkbd->event_mask); mb(); schedule_delayed_work(&atkbd->event_work, delay); } /* * Event callback from the input module. Events that change the state of * the hardware are processed here. If action can not be performed in * interrupt context it is offloaded to atkbd_event_work. */ static int atkbd_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { struct atkbd *atkbd = input_get_drvdata(dev); if (!atkbd->write) return -1; switch (type) { case EV_LED: atkbd_schedule_event_work(atkbd, ATKBD_LED_EVENT_BIT); return 0; case EV_REP: if (!atkbd->softrepeat) atkbd_schedule_event_work(atkbd, ATKBD_REP_EVENT_BIT); return 0; default: return -1; } } /* * atkbd_enable() signals that interrupt handler is allowed to * generate input events. */ static inline void atkbd_enable(struct atkbd *atkbd) { serio_pause_rx(atkbd->ps2dev.serio); atkbd->enabled = true; serio_continue_rx(atkbd->ps2dev.serio); } /* * atkbd_disable() tells input handler that all incoming data except * for ACKs and command response should be dropped. */ static inline void atkbd_disable(struct atkbd *atkbd) { serio_pause_rx(atkbd->ps2dev.serio); atkbd->enabled = false; serio_continue_rx(atkbd->ps2dev.serio); } static int atkbd_activate(struct atkbd *atkbd) { struct ps2dev *ps2dev = &atkbd->ps2dev; /* * Enable the keyboard to receive keystrokes. */ if (ps2_command(ps2dev, NULL, ATKBD_CMD_ENABLE)) { dev_err(&ps2dev->serio->dev, "Failed to enable keyboard on %s\n", ps2dev->serio->phys); return -1; } return 0; } /* * atkbd_deactivate() resets and disables the keyboard from sending * keystrokes. */ static void atkbd_deactivate(struct atkbd *atkbd) { struct ps2dev *ps2dev = &atkbd->ps2dev; if (ps2_command(ps2dev, NULL, ATKBD_CMD_RESET_DIS)) dev_err(&ps2dev->serio->dev, "Failed to deactivate keyboard on %s\n", ps2dev->serio->phys); } #ifdef CONFIG_X86 static bool atkbd_is_portable_device(void) { static const char * const chassis_types[] = { "8", /* Portable */ "9", /* Laptop */ "10", /* Notebook */ "14", /* Sub-Notebook */ "31", /* Convertible */ "32", /* Detachable */ }; int i; for (i = 0; i < ARRAY_SIZE(chassis_types); i++) if (dmi_match(DMI_CHASSIS_TYPE, chassis_types[i])) return true; return false; } /* * On many modern laptops ATKBD_CMD_GETID may cause problems, on these laptops * the controller is always in translated mode. In this mode mice/touchpads will * not work. So in this case simply assume a keyboard is connected to avoid * confusing some laptop keyboards. * * Skipping ATKBD_CMD_GETID ends up using a fake keyboard id. Using the standard * 0xab83 id is ok in translated mode, only atkbd_select_set() checks atkbd->id * and in translated mode that is a no-op. */ static bool atkbd_skip_getid(struct atkbd *atkbd) { return atkbd->translated && atkbd_is_portable_device(); } #else static inline bool atkbd_skip_getid(struct atkbd *atkbd) { return false; } #endif /* * atkbd_probe() probes for an AT keyboard on a serio port. */ static int atkbd_probe(struct atkbd *atkbd) { struct ps2dev *ps2dev = &atkbd->ps2dev; unsigned char param[2]; bool skip_getid; /* * Some systems, where the bit-twiddling when testing the io-lines of the * controller may confuse the keyboard need a full reset of the keyboard. 
On * these systems the BIOS also usually doesn't do it for us. */ if (atkbd_reset) if (ps2_command(ps2dev, NULL, ATKBD_CMD_RESET_BAT)) dev_warn(&ps2dev->serio->dev, "keyboard reset failed on %s\n", ps2dev->serio->phys); /* * Then we check the keyboard ID. We should get 0xab83 under normal conditions. * Some keyboards report different values, but the first byte is always 0xab or * 0xac. Some old AT keyboards don't report anything. If a mouse is connected, this * should make sure we don't try to set the LEDs on it. */ param[0] = param[1] = 0xa5; /* initialize with invalid values */ skip_getid = atkbd_skip_getid(atkbd); if (skip_getid || ps2_command(ps2dev, param, ATKBD_CMD_GETID)) { /* * If the get ID command was skipped or failed, we check if we can at least set * the LEDs on the keyboard. This should work on every keyboard out there. * It also turns the LEDs off, which we want anyway. */ param[0] = 0; if (ps2_command(ps2dev, param, ATKBD_CMD_SETLEDS)) return -1; atkbd->id = skip_getid ? 0xab83 : 0xabba; return 0; } if (!ps2_is_keyboard_id(param[0])) return -1; atkbd->id = (param[0] << 8) | param[1]; if (atkbd->id == 0xaca1 && atkbd->translated) { dev_err(&ps2dev->serio->dev, "NCD terminal keyboards are only supported on non-translating controllers. " "Use i8042.direct=1 to disable translation.\n"); return -1; } /* * Make sure nothing is coming from the keyboard and disturbs our * internal state. */ if (!atkbd_skip_deactivate) atkbd_deactivate(atkbd); return 0; } /* * atkbd_select_set checks if a keyboard has a working Set 3 support, and * sets it into that. Unfortunately there are keyboards that can be switched * to Set 3, but don't work well in that (BTC Multimedia ...) */ static int atkbd_select_set(struct atkbd *atkbd, int target_set, int allow_extra) { struct ps2dev *ps2dev = &atkbd->ps2dev; unsigned char param[2]; atkbd->extra = false; /* * For known special keyboards we can go ahead and set the correct set. * We check for NCD PS/2 Sun, NorthGate OmniKey 101 and * IBM RapidAccess / IBM EzButton / Chicony KBP-8993 keyboards. */ if (atkbd->translated) return 2; if (atkbd->id == 0xaca1) { param[0] = 3; ps2_command(ps2dev, param, ATKBD_CMD_SSCANSET); return 3; } if (allow_extra) { param[0] = 0x71; if (!ps2_command(ps2dev, param, ATKBD_CMD_EX_ENABLE)) { atkbd->extra = true; return 2; } } if (atkbd_terminal) { ps2_command(ps2dev, param, ATKBD_CMD_SETALL_MB); return 3; } if (target_set != 3) return 2; if (!ps2_command(ps2dev, param, ATKBD_CMD_OK_GETID)) { atkbd->id = param[0] << 8 | param[1]; return 2; } param[0] = 3; if (ps2_command(ps2dev, param, ATKBD_CMD_SSCANSET)) return 2; param[0] = 0; if (ps2_command(ps2dev, param, ATKBD_CMD_GSCANSET)) return 2; if (param[0] != 3) { param[0] = 2; if (ps2_command(ps2dev, param, ATKBD_CMD_SSCANSET)) return 2; } ps2_command(ps2dev, param, ATKBD_CMD_SETALL_MBR); return 3; } static int atkbd_reset_state(struct atkbd *atkbd) { struct ps2dev *ps2dev = &atkbd->ps2dev; unsigned char param[1]; /* * Set the LEDs to a predefined state (all off). */ param[0] = 0; if (ps2_command(ps2dev, param, ATKBD_CMD_SETLEDS)) return -1; /* * Set autorepeat to fastest possible. */ param[0] = 0; if (ps2_command(ps2dev, param, ATKBD_CMD_SETREP)) return -1; return 0; } /* * atkbd_cleanup() restores the keyboard state so that BIOS is happy after a * reboot. */ static void atkbd_cleanup(struct serio *serio) { struct atkbd *atkbd = atkbd_from_serio(serio); atkbd_disable(atkbd); ps2_command(&atkbd->ps2dev, NULL, ATKBD_CMD_RESET_DEF); } /* * atkbd_disconnect() closes and frees. 
*/ static void atkbd_disconnect(struct serio *serio) { struct atkbd *atkbd = atkbd_from_serio(serio); atkbd_disable(atkbd); input_unregister_device(atkbd->dev); /* * Make sure we don't have a command in flight. * Note that since atkbd->enabled is false event work will keep * rescheduling itself until it gets canceled and will not try * accessing freed input device or serio port. */ cancel_delayed_work_sync(&atkbd->event_work); serio_close(serio); serio_set_drvdata(serio, NULL); kfree(atkbd); } /* * generate release events for the keycodes given in data */ static void atkbd_apply_forced_release_keylist(struct atkbd* atkbd, const void *data) { const unsigned int *keys = data; unsigned int i; if (atkbd->set == 2) for (i = 0; keys[i] != -1U; i++) __set_bit(keys[i], atkbd->force_release_mask); } /* * Most special keys (Fn+F?) on Dell laptops do not generate release * events so we have to do it ourselves. */ static unsigned int atkbd_dell_laptop_forced_release_keys[] = { 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8f, 0x93, -1U }; /* * Perform fixup for HP system that doesn't generate release * for its video switch */ static unsigned int atkbd_hp_forced_release_keys[] = { 0x94, -1U }; /* * Samsung NC10,NC20 with Fn+F? key release not working */ static unsigned int atkbd_samsung_forced_release_keys[] = { 0x82, 0x83, 0x84, 0x86, 0x88, 0x89, 0xb3, 0xf7, 0xf9, -1U }; /* * Amilo Pi 3525 key release for Fn+Volume keys not working */ static unsigned int atkbd_amilo_pi3525_forced_release_keys[] = { 0x20, 0xa0, 0x2e, 0xae, 0x30, 0xb0, -1U }; /* * Amilo Xi 3650 key release for light touch bar not working */ static unsigned int atkbd_amilo_xi3650_forced_release_keys[] = { 0x67, 0xed, 0x90, 0xa2, 0x99, 0xa4, 0xae, 0xb0, -1U }; /* * Soltech TA12 system with broken key release on volume keys and mute key */ static unsigned int atkdb_soltech_ta12_forced_release_keys[] = { 0xa0, 0xae, 0xb0, -1U }; /* * Many notebooks don't send key release event for volume up/down * keys, with key list below common among them */ static unsigned int atkbd_volume_forced_release_keys[] = { 0xae, 0xb0, -1U }; /* * OQO 01+ multimedia keys (64--66) generate e0 6x upon release whereas * they should be generating e4-e6 (0x80 | code). 
*/ static unsigned int atkbd_oqo_01plus_scancode_fixup(struct atkbd *atkbd, unsigned int code) { if (atkbd->translated && atkbd->emul == 1 && (code == 0x64 || code == 0x65 || code == 0x66)) { atkbd->emul = 0; code |= 0x80; } return code; } static int atkbd_get_keymap_from_fwnode(struct atkbd *atkbd) { struct device *dev = &atkbd->ps2dev.serio->dev; int i, n; u32 *ptr; u16 scancode, keycode; /* Parse "linux,keymap" property */ n = device_property_count_u32(dev, "linux,keymap"); if (n <= 0 || n > ATKBD_KEYMAP_SIZE) return -ENXIO; ptr = kcalloc(n, sizeof(u32), GFP_KERNEL); if (!ptr) return -ENOMEM; if (device_property_read_u32_array(dev, "linux,keymap", ptr, n)) { dev_err(dev, "problem parsing FW keymap property\n"); kfree(ptr); return -EINVAL; } memset(atkbd->keycode, 0, sizeof(atkbd->keycode)); for (i = 0; i < n; i++) { scancode = SCANCODE(ptr[i]); keycode = KEYCODE(ptr[i]); atkbd->keycode[scancode] = keycode; } kfree(ptr); return 0; } /* * atkbd_set_keycode_table() initializes keyboard's keycode table * according to the selected scancode set */ static void atkbd_set_keycode_table(struct atkbd *atkbd) { struct device *dev = &atkbd->ps2dev.serio->dev; unsigned int scancode; int i, j; memset(atkbd->keycode, 0, sizeof(atkbd->keycode)); bitmap_zero(atkbd->force_release_mask, ATKBD_KEYMAP_SIZE); if (!atkbd_get_keymap_from_fwnode(atkbd)) { dev_dbg(dev, "Using FW keymap\n"); } else if (atkbd->translated) { for (i = 0; i < 128; i++) { scancode = atkbd_unxlate_table[i]; atkbd->keycode[i] = atkbd_set2_keycode[scancode]; atkbd->keycode[i | 0x80] = atkbd_set2_keycode[scancode | 0x80]; if (atkbd->scroll) for (j = 0; j < ARRAY_SIZE(atkbd_scroll_keys); j++) if ((scancode | 0x80) == atkbd_scroll_keys[j].set2) atkbd->keycode[i | 0x80] = atkbd_scroll_keys[j].keycode; } } else if (atkbd->set == 3) { memcpy(atkbd->keycode, atkbd_set3_keycode, sizeof(atkbd->keycode)); } else { memcpy(atkbd->keycode, atkbd_set2_keycode, sizeof(atkbd->keycode)); if (atkbd->scroll) for (i = 0; i < ARRAY_SIZE(atkbd_scroll_keys); i++) { scancode = atkbd_scroll_keys[i].set2; atkbd->keycode[scancode] = atkbd_scroll_keys[i].keycode; } } /* * HANGEUL and HANJA keys do not send release events so we need to * generate such events ourselves */ scancode = atkbd_compat_scancode(atkbd, ATKBD_RET_HANGEUL); atkbd->keycode[scancode] = KEY_HANGEUL; __set_bit(scancode, atkbd->force_release_mask); scancode = atkbd_compat_scancode(atkbd, ATKBD_RET_HANJA); atkbd->keycode[scancode] = KEY_HANJA; __set_bit(scancode, atkbd->force_release_mask); /* * Perform additional fixups */ if (atkbd_platform_fixup) atkbd_platform_fixup(atkbd, atkbd_platform_fixup_data); } /* * atkbd_set_device_attrs() sets up keyboard's input device structure */ static void atkbd_set_device_attrs(struct atkbd *atkbd) { struct input_dev *input_dev = atkbd->dev; int i; if (atkbd->extra) snprintf(atkbd->name, sizeof(atkbd->name), "AT Set 2 Extra keyboard"); else snprintf(atkbd->name, sizeof(atkbd->name), "AT %s Set %d keyboard", atkbd->translated ? "Translated" : "Raw", atkbd->set); snprintf(atkbd->phys, sizeof(atkbd->phys), "%s/input0", atkbd->ps2dev.serio->phys); input_dev->name = atkbd->name; input_dev->phys = atkbd->phys; input_dev->id.bustype = BUS_I8042; input_dev->id.vendor = 0x0001; input_dev->id.product = atkbd->translated ? 
1 : atkbd->set; input_dev->id.version = atkbd->id; input_dev->event = atkbd_event; input_dev->dev.parent = &atkbd->ps2dev.serio->dev; input_set_drvdata(input_dev, atkbd); input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP) | BIT_MASK(EV_MSC); if (atkbd->write) { input_dev->evbit[0] |= BIT_MASK(EV_LED); input_dev->ledbit[0] = BIT_MASK(LED_NUML) | BIT_MASK(LED_CAPSL) | BIT_MASK(LED_SCROLLL); } if (atkbd->extra) input_dev->ledbit[0] |= BIT_MASK(LED_COMPOSE) | BIT_MASK(LED_SUSPEND) | BIT_MASK(LED_SLEEP) | BIT_MASK(LED_MUTE) | BIT_MASK(LED_MISC); if (!atkbd->softrepeat) { input_dev->rep[REP_DELAY] = 250; input_dev->rep[REP_PERIOD] = 33; } input_dev->mscbit[0] = atkbd->softraw ? BIT_MASK(MSC_SCAN) : BIT_MASK(MSC_RAW) | BIT_MASK(MSC_SCAN); if (atkbd->scroll) { input_dev->evbit[0] |= BIT_MASK(EV_REL); input_dev->relbit[0] = BIT_MASK(REL_WHEEL) | BIT_MASK(REL_HWHEEL); __set_bit(BTN_MIDDLE, input_dev->keybit); } input_dev->keycode = atkbd->keycode; input_dev->keycodesize = sizeof(unsigned short); input_dev->keycodemax = ARRAY_SIZE(atkbd_set2_keycode); for (i = 0; i < ATKBD_KEYMAP_SIZE; i++) { if (atkbd->keycode[i] != KEY_RESERVED && atkbd->keycode[i] != ATKBD_KEY_NULL && atkbd->keycode[i] < ATKBD_SPECIAL) { __set_bit(atkbd->keycode[i], input_dev->keybit); } } } static void atkbd_parse_fwnode_data(struct serio *serio) { struct atkbd *atkbd = atkbd_from_serio(serio); struct device *dev = &serio->dev; int n; /* Parse "function-row-physmap" property */ n = device_property_count_u32(dev, "function-row-physmap"); if (n > 0 && n <= VIVALDI_MAX_FUNCTION_ROW_KEYS && !device_property_read_u32_array(dev, "function-row-physmap", atkbd->vdata.function_row_physmap, n)) { atkbd->vdata.num_function_row_keys = n; dev_dbg(dev, "FW reported %d function-row key locations\n", n); } } /* * atkbd_connect() is called when the serio module finds an interface * that isn't handled yet by an appropriate device driver. We check if * there is an AT keyboard out there and if yes, we register ourselves * to the input module. */ static int atkbd_connect(struct serio *serio, struct serio_driver *drv) { struct atkbd *atkbd; struct input_dev *dev; int err = -ENOMEM; atkbd = kzalloc(sizeof(struct atkbd), GFP_KERNEL); dev = input_allocate_device(); if (!atkbd || !dev) goto fail1; atkbd->dev = dev; ps2_init(&atkbd->ps2dev, serio, atkbd_pre_receive_byte, atkbd_receive_byte); INIT_DELAYED_WORK(&atkbd->event_work, atkbd_event_work); mutex_init(&atkbd->mutex); switch (serio->id.type) { case SERIO_8042_XL: atkbd->translated = true; fallthrough; case SERIO_8042: if (serio->write) atkbd->write = true; break; } atkbd->softraw = atkbd_softraw; atkbd->softrepeat = atkbd_softrepeat; atkbd->scroll = atkbd_scroll; if (atkbd->softrepeat) atkbd->softraw = true; serio_set_drvdata(serio, atkbd); err = serio_open(serio, drv); if (err) goto fail2; if (atkbd->write) { if (atkbd_probe(atkbd)) { err = -ENODEV; goto fail3; } atkbd->set = atkbd_select_set(atkbd, atkbd_set, atkbd_extra); atkbd_reset_state(atkbd); } else { atkbd->set = 2; atkbd->id = 0xab00; } atkbd_parse_fwnode_data(serio); atkbd_set_keycode_table(atkbd); atkbd_set_device_attrs(atkbd); atkbd_enable(atkbd); if (serio->write) atkbd_activate(atkbd); err = input_register_device(atkbd->dev); if (err) goto fail3; return 0; fail3: serio_close(serio); fail2: serio_set_drvdata(serio, NULL); fail1: input_free_device(dev); kfree(atkbd); return err; } /* * atkbd_reconnect() tries to restore keyboard into a sane state and is * most likely called on resume. 
*/ static int atkbd_reconnect(struct serio *serio) { struct atkbd *atkbd = atkbd_from_serio(serio); struct serio_driver *drv = serio->drv; int retval = -1; if (!atkbd || !drv) { dev_dbg(&serio->dev, "reconnect request, but serio is disconnected, ignoring...\n"); return -1; } mutex_lock(&atkbd->mutex); atkbd_disable(atkbd); if (atkbd->write) { if (atkbd_probe(atkbd)) goto out; if (atkbd->set != atkbd_select_set(atkbd, atkbd->set, atkbd->extra)) goto out; /* * Restore LED state and repeat rate. While input core * will do this for us at resume time reconnect may happen * because user requested it via sysfs or simply because * keyboard was unplugged and plugged in again so we need * to do it ourselves here. */ atkbd_set_leds(atkbd); if (!atkbd->softrepeat) atkbd_set_repeat_rate(atkbd); } /* * Reset our state machine in case reconnect happened in the middle * of multi-byte scancode. */ atkbd->xl_bit = 0; atkbd->emul = 0; atkbd_enable(atkbd); if (atkbd->write) atkbd_activate(atkbd); retval = 0; out: mutex_unlock(&atkbd->mutex); return retval; } static const struct serio_device_id atkbd_serio_ids[] = { { .type = SERIO_8042, .proto = SERIO_ANY, .id = SERIO_ANY, .extra = SERIO_ANY, }, { .type = SERIO_8042_XL, .proto = SERIO_ANY, .id = SERIO_ANY, .extra = SERIO_ANY, }, { .type = SERIO_RS232, .proto = SERIO_PS2SER, .id = SERIO_ANY, .extra = SERIO_ANY, }, { 0 } }; MODULE_DEVICE_TABLE(serio, atkbd_serio_ids); static struct serio_driver atkbd_drv = { .driver = { .name = "atkbd", .dev_groups = atkbd_attribute_groups, }, .description = DRIVER_DESC, .id_table = atkbd_serio_ids, .interrupt = ps2_interrupt, .connect = atkbd_connect, .reconnect = atkbd_reconnect, .disconnect = atkbd_disconnect, .cleanup = atkbd_cleanup, }; static ssize_t atkbd_attr_show_helper(struct device *dev, char *buf, ssize_t (*handler)(struct atkbd *, char *)) { struct serio *serio = to_serio_port(dev); struct atkbd *atkbd = atkbd_from_serio(serio); return handler(atkbd, buf); } static ssize_t atkbd_attr_set_helper(struct device *dev, const char *buf, size_t count, ssize_t (*handler)(struct atkbd *, const char *, size_t)) { struct serio *serio = to_serio_port(dev); struct atkbd *atkbd = atkbd_from_serio(serio); int retval; retval = mutex_lock_interruptible(&atkbd->mutex); if (retval) return retval; atkbd_disable(atkbd); retval = handler(atkbd, buf, count); atkbd_enable(atkbd); mutex_unlock(&atkbd->mutex); return retval; } static ssize_t atkbd_show_extra(struct atkbd *atkbd, char *buf) { return sprintf(buf, "%d\n", atkbd->extra ? 1 : 0); } static ssize_t atkbd_set_extra(struct atkbd *atkbd, const char *buf, size_t count) { struct input_dev *old_dev, *new_dev; unsigned int value; int err; bool old_extra; unsigned char old_set; if (!atkbd->write) return -EIO; err = kstrtouint(buf, 10, &value); if (err) return err; if (value > 1) return -EINVAL; if (atkbd->extra != value) { /* * Since device's properties will change we need to * unregister old device. But allocate and register * new one first to make sure we have it. 
*/ old_dev = atkbd->dev; old_extra = atkbd->extra; old_set = atkbd->set; new_dev = input_allocate_device(); if (!new_dev) return -ENOMEM; atkbd->dev = new_dev; atkbd->set = atkbd_select_set(atkbd, atkbd->set, value); atkbd_reset_state(atkbd); atkbd_activate(atkbd); atkbd_set_keycode_table(atkbd); atkbd_set_device_attrs(atkbd); err = input_register_device(atkbd->dev); if (err) { input_free_device(new_dev); atkbd->dev = old_dev; atkbd->set = atkbd_select_set(atkbd, old_set, old_extra); atkbd_set_keycode_table(atkbd); atkbd_set_device_attrs(atkbd); return err; } input_unregister_device(old_dev); } return count; } static ssize_t atkbd_show_force_release(struct atkbd *atkbd, char *buf) { size_t len = scnprintf(buf, PAGE_SIZE - 1, "%*pbl", ATKBD_KEYMAP_SIZE, atkbd->force_release_mask); buf[len++] = '\n'; buf[len] = '\0'; return len; } static ssize_t atkbd_set_force_release(struct atkbd *atkbd, const char *buf, size_t count) { /* 64 bytes on stack should be acceptable */ DECLARE_BITMAP(new_mask, ATKBD_KEYMAP_SIZE); int err; err = bitmap_parselist(buf, new_mask, ATKBD_KEYMAP_SIZE); if (err) return err; memcpy(atkbd->force_release_mask, new_mask, sizeof(atkbd->force_release_mask)); return count; } static ssize_t atkbd_show_scroll(struct atkbd *atkbd, char *buf) { return sprintf(buf, "%d\n", atkbd->scroll ? 1 : 0); } static ssize_t atkbd_set_scroll(struct atkbd *atkbd, const char *buf, size_t count) { struct input_dev *old_dev, *new_dev; unsigned int value; int err; bool old_scroll; err = kstrtouint(buf, 10, &value); if (err) return err; if (value > 1) return -EINVAL; if (atkbd->scroll != value) { old_dev = atkbd->dev; old_scroll = atkbd->scroll; new_dev = input_allocate_device(); if (!new_dev) return -ENOMEM; atkbd->dev = new_dev; atkbd->scroll = value; atkbd_set_keycode_table(atkbd); atkbd_set_device_attrs(atkbd); err = input_register_device(atkbd->dev); if (err) { input_free_device(new_dev); atkbd->scroll = old_scroll; atkbd->dev = old_dev; atkbd_set_keycode_table(atkbd); atkbd_set_device_attrs(atkbd); return err; } input_unregister_device(old_dev); } return count; } static ssize_t atkbd_show_set(struct atkbd *atkbd, char *buf) { return sprintf(buf, "%d\n", atkbd->set); } static ssize_t atkbd_set_set(struct atkbd *atkbd, const char *buf, size_t count) { struct input_dev *old_dev, *new_dev; unsigned int value; int err; unsigned char old_set; bool old_extra; if (!atkbd->write) return -EIO; err = kstrtouint(buf, 10, &value); if (err) return err; if (value != 2 && value != 3) return -EINVAL; if (atkbd->set != value) { old_dev = atkbd->dev; old_extra = atkbd->extra; old_set = atkbd->set; new_dev = input_allocate_device(); if (!new_dev) return -ENOMEM; atkbd->dev = new_dev; atkbd->set = atkbd_select_set(atkbd, value, atkbd->extra); atkbd_reset_state(atkbd); atkbd_activate(atkbd); atkbd_set_keycode_table(atkbd); atkbd_set_device_attrs(atkbd); err = input_register_device(atkbd->dev); if (err) { input_free_device(new_dev); atkbd->dev = old_dev; atkbd->set = atkbd_select_set(atkbd, old_set, old_extra); atkbd_set_keycode_table(atkbd); atkbd_set_device_attrs(atkbd); return err; } input_unregister_device(old_dev); } return count; } static ssize_t atkbd_show_softrepeat(struct atkbd *atkbd, char *buf) { return sprintf(buf, "%d\n", atkbd->softrepeat ? 
1 : 0); } static ssize_t atkbd_set_softrepeat(struct atkbd *atkbd, const char *buf, size_t count) { struct input_dev *old_dev, *new_dev; unsigned int value; int err; bool old_softrepeat, old_softraw; if (!atkbd->write) return -EIO; err = kstrtouint(buf, 10, &value); if (err) return err; if (value > 1) return -EINVAL; if (atkbd->softrepeat != value) { old_dev = atkbd->dev; old_softrepeat = atkbd->softrepeat; old_softraw = atkbd->softraw; new_dev = input_allocate_device(); if (!new_dev) return -ENOMEM; atkbd->dev = new_dev; atkbd->softrepeat = value; if (atkbd->softrepeat) atkbd->softraw = true; atkbd_set_device_attrs(atkbd); err = input_register_device(atkbd->dev); if (err) { input_free_device(new_dev); atkbd->dev = old_dev; atkbd->softrepeat = old_softrepeat; atkbd->softraw = old_softraw; atkbd_set_device_attrs(atkbd); return err; } input_unregister_device(old_dev); } return count; } static ssize_t atkbd_show_softraw(struct atkbd *atkbd, char *buf) { return sprintf(buf, "%d\n", atkbd->softraw ? 1 : 0); } static ssize_t atkbd_set_softraw(struct atkbd *atkbd, const char *buf, size_t count) { struct input_dev *old_dev, *new_dev; unsigned int value; int err; bool old_softraw; err = kstrtouint(buf, 10, &value); if (err) return err; if (value > 1) return -EINVAL; if (atkbd->softraw != value) { old_dev = atkbd->dev; old_softraw = atkbd->softraw; new_dev = input_allocate_device(); if (!new_dev) return -ENOMEM; atkbd->dev = new_dev; atkbd->softraw = value; atkbd_set_device_attrs(atkbd); err = input_register_device(atkbd->dev); if (err) { input_free_device(new_dev); atkbd->dev = old_dev; atkbd->softraw = old_softraw; atkbd_set_device_attrs(atkbd); return err; } input_unregister_device(old_dev); } return count; } static ssize_t atkbd_show_err_count(struct atkbd *atkbd, char *buf) { return sprintf(buf, "%lu\n", atkbd->err_count); } static int __init atkbd_setup_forced_release(const struct dmi_system_id *id) { atkbd_platform_fixup = atkbd_apply_forced_release_keylist; atkbd_platform_fixup_data = id->driver_data; return 1; } static int __init atkbd_setup_scancode_fixup(const struct dmi_system_id *id) { atkbd_platform_scancode_fixup = id->driver_data; return 1; } static int __init atkbd_deactivate_fixup(const struct dmi_system_id *id) { atkbd_skip_deactivate = true; return 1; } /* * NOTE: do not add any more "force release" quirks to this table. The * task of adjusting list of keys that should be "released" automatically * by the driver is now delegated to userspace tools, such as udev, so * submit such quirks there. 
*/ static const struct dmi_system_id atkbd_dmi_quirk_table[] __initconst = { { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_CHASSIS_TYPE, "8"), /* Portable */ }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_dell_laptop_forced_release_keys, }, { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_CHASSIS_TYPE, "8"), /* Portable */ }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_dell_laptop_forced_release_keys, }, { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), DMI_MATCH(DMI_PRODUCT_NAME, "HP 2133"), }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_hp_forced_release_keys, }, { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), DMI_MATCH(DMI_PRODUCT_NAME, "Pavilion ZV6100"), }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_volume_forced_release_keys, }, { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), DMI_MATCH(DMI_PRODUCT_NAME, "Presario R4000"), }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_volume_forced_release_keys, }, { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), DMI_MATCH(DMI_PRODUCT_NAME, "Presario R4100"), }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_volume_forced_release_keys, }, { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), DMI_MATCH(DMI_PRODUCT_NAME, "Presario R4200"), }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_volume_forced_release_keys, }, { /* Inventec Symphony */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "INVENTEC"), DMI_MATCH(DMI_PRODUCT_NAME, "SYMPHONY 6.0/7.0"), }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_volume_forced_release_keys, }, { /* Samsung NC10 */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), DMI_MATCH(DMI_PRODUCT_NAME, "NC10"), }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_samsung_forced_release_keys, }, { /* Samsung NC20 */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), DMI_MATCH(DMI_PRODUCT_NAME, "NC20"), }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_samsung_forced_release_keys, }, { /* Samsung SQ45S70S */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), DMI_MATCH(DMI_PRODUCT_NAME, "SQ45S70S"), }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_samsung_forced_release_keys, }, { /* Fujitsu Amilo PA 1510 */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), DMI_MATCH(DMI_PRODUCT_NAME, "AMILO Pa 1510"), }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_volume_forced_release_keys, }, { /* Fujitsu Amilo Pi 3525 */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), DMI_MATCH(DMI_PRODUCT_NAME, "AMILO Pi 3525"), }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_amilo_pi3525_forced_release_keys, }, { /* Fujitsu Amilo Xi 3650 */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), DMI_MATCH(DMI_PRODUCT_NAME, "AMILO Xi 3650"), }, .callback = atkbd_setup_forced_release, .driver_data = atkbd_amilo_xi3650_forced_release_keys, }, { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Soltech Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "TA12"), }, .callback = atkbd_setup_forced_release, .driver_data = atkdb_soltech_ta12_forced_release_keys, }, { /* OQO Model 01+ */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "OQO"), DMI_MATCH(DMI_PRODUCT_NAME, "ZEPTO"), }, .callback = atkbd_setup_scancode_fixup, .driver_data = atkbd_oqo_01plus_scancode_fixup, }, { .matches = { 
DMI_MATCH(DMI_SYS_VENDOR, "LG Electronics"), }, .callback = atkbd_deactivate_fixup, }, { } }; static int __init atkbd_init(void) { dmi_check_system(atkbd_dmi_quirk_table); return serio_register_driver(&atkbd_drv); } static void __exit atkbd_exit(void) { serio_unregister_driver(&atkbd_drv); } module_init(atkbd_init); module_exit(atkbd_exit);
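The forced-release key lists above only populate atkbd->force_release_mask; the synthetic key-up itself is emitted from the driver's receive path. As a rough illustration of that idea only (the helper name below is hypothetical, this is not the driver's actual code, and it assumes the atkbd context shown in this file):

/*
 * Illustrative sketch, not part of the driver: show how a scancode flagged
 * in force_release_mask could be followed by a synthetic key-up event.
 */
static void atkbd_maybe_force_release(struct atkbd *atkbd, unsigned int code)
{
	unsigned short keycode = atkbd->keycode[code];

	if (test_bit(code, atkbd->force_release_mask)) {
		/* report the release the hardware never sent */
		input_report_key(atkbd->dev, keycode, 0);
		input_sync(atkbd->dev);
	}
}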
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz> * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon */ #include <linux/module.h> #include "../../dccp.h" #include "tfrc.h" #define TFRC_CALC_X_ARRSIZE 500 #define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */ #define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE) /* TFRC TCP Reno Throughput Equation Lookup Table for f(p) The following two-column lookup table implements a part of the TCP throughput equation from [RFC 3448, sec.
3.1]:

	X_calc = s / (R * sqrt(2*b*p/3) + 3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3))

 Where:
	X      is the transmit rate in bytes/second
	s      is the packet size in bytes
	R      is the round trip time in seconds
	p      is the loss event rate, between 0 and 1.0, of the number of loss
	       events as a fraction of the number of packets transmitted
	t_RTO  is the TCP retransmission timeout value in seconds
	b      is the number of packets acknowledged by a single TCP ACK

 We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes:

	X_calc = s / (R * sqrt(p*2/3) + 12 * R * sqrt(p*3/8) * (p + 32*p^3))

 which we can break down into:

	X_calc = s / (R * f(p))

 where f(p) is given for 0 < p <= 1 by:

	f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3)

 Since this is kernel code, floating-point arithmetic is avoided in favour of
 integer arithmetic. This means that nearly all fractional parameters are
 scaled by 1000000:
	* the parameters p and R
	* the return result f(p)
 The lookup table therefore actually tabulates the following function g(q):

	g(q) = 1000000 * f(q/1000000)

 Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer
 granularity for the practically more relevant case of small values of p (up
 to 5%), the second column is used; the first one ranges up to 100%. This
 split corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time
 this also determines the smallest resolution possible with this lookup table:

	TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE

 The entire table is generated by:
	for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) {
		lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE);
		lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE);
	}

 With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
	lookup[0][0] = g(1000000/(M+1))      = 1000000 * f(0.2%)
	lookup[M][0] = g(1000000)            = 1000000 * f(100%)
	lookup[0][1] = g(TFRC_SMALLEST_P)    = 1000000 * f(0.01%)
	lookup[M][1] = g(TFRC_CALC_X_SPLIT)  = 1000000 * f(5%)

 In summary, the two columns represent f(p) for the following ranges:
	* The first column is for  0.002  <= p <= 1.0
	* The second column is for 0.0001 <= p <= 0.05
 Where the columns overlap, the second (finer-grained) is given preference,
 i.e. the first column is used only for p >= 0.05.
*/ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { { 37172, 8172 }, { 53499, 11567 }, { 66664, 14180 }, { 78298, 16388 }, { 89021, 18339 }, { 99147, 20108 }, { 108858, 21738 }, { 118273, 23260 }, { 127474, 24693 }, { 136520, 26052 }, { 145456, 27348 }, { 154316, 28589 }, { 163130, 29783 }, { 171919, 30935 }, { 180704, 32049 }, { 189502, 33130 }, { 198328, 34180 }, { 207194, 35202 }, { 216114, 36198 }, { 225097, 37172 }, { 234153, 38123 }, { 243294, 39055 }, { 252527, 39968 }, { 261861, 40864 }, { 271305, 41743 }, { 280866, 42607 }, { 290553, 43457 }, { 300372, 44293 }, { 310333, 45117 }, { 320441, 45929 }, { 330705, 46729 }, { 341131, 47518 }, { 351728, 48297 }, { 362501, 49066 }, { 373460, 49826 }, { 384609, 50577 }, { 395958, 51320 }, { 407513, 52054 }, { 419281, 52780 }, { 431270, 53499 }, { 443487, 54211 }, { 455940, 54916 }, { 468635, 55614 }, { 481581, 56306 }, { 494785, 56991 }, { 508254, 57671 }, { 521996, 58345 }, { 536019, 59014 }, { 550331, 59677 }, { 564939, 60335 }, { 579851, 60988 }, { 595075, 61636 }, { 610619, 62279 }, { 626491, 62918 }, { 642700, 63553 }, { 659253, 64183 }, { 676158, 64809 }, { 693424, 65431 }, { 711060, 66050 }, { 729073, 66664 }, { 747472, 67275 }, { 766266, 67882 }, { 785464, 68486 }, { 805073, 69087 }, { 825103, 69684 }, { 845562, 70278 }, { 866460, 70868 }, { 887805, 71456 }, { 909606, 72041 }, { 931873, 72623 }, { 954614, 73202 }, { 977839, 73778 }, { 1001557, 74352 }, { 1025777, 74923 }, { 1050508, 75492 }, { 1075761, 76058 }, { 1101544, 76621 }, { 1127867, 77183 }, { 1154739, 77741 }, { 1182172, 78298 }, { 1210173, 78852 }, { 1238753, 79405 }, { 1267922, 79955 }, { 1297689, 80503 }, { 1328066, 81049 }, { 1359060, 81593 }, { 1390684, 82135 }, { 1422947, 82675 }, { 1455859, 83213 }, { 1489430, 83750 }, { 1523671, 84284 }, { 1558593, 84817 }, { 1594205, 85348 }, { 1630518, 85878 }, { 1667543, 86406 }, { 1705290, 86932 }, { 1743770, 87457 }, { 1782994, 87980 }, { 1822973, 88501 }, { 1863717, 89021 }, { 1905237, 89540 }, { 1947545, 90057 }, { 1990650, 90573 }, { 2034566, 91087 }, { 2079301, 91600 }, { 2124869, 92111 }, { 2171279, 92622 }, { 2218543, 93131 }, { 2266673, 93639 }, { 2315680, 94145 }, { 2365575, 94650 }, { 2416371, 95154 }, { 2468077, 95657 }, { 2520707, 96159 }, { 2574271, 96660 }, { 2628782, 97159 }, { 2684250, 97658 }, { 2740689, 98155 }, { 2798110, 98651 }, { 2856524, 99147 }, { 2915944, 99641 }, { 2976382, 100134 }, { 3037850, 100626 }, { 3100360, 101117 }, { 3163924, 101608 }, { 3228554, 102097 }, { 3294263, 102586 }, { 3361063, 103073 }, { 3428966, 103560 }, { 3497984, 104045 }, { 3568131, 104530 }, { 3639419, 105014 }, { 3711860, 105498 }, { 3785467, 105980 }, { 3860253, 106462 }, { 3936229, 106942 }, { 4013410, 107422 }, { 4091808, 107902 }, { 4171435, 108380 }, { 4252306, 108858 }, { 4334431, 109335 }, { 4417825, 109811 }, { 4502501, 110287 }, { 4588472, 110762 }, { 4675750, 111236 }, { 4764349, 111709 }, { 4854283, 112182 }, { 4945564, 112654 }, { 5038206, 113126 }, { 5132223, 113597 }, { 5227627, 114067 }, { 5324432, 114537 }, { 5422652, 115006 }, { 5522299, 115474 }, { 5623389, 115942 }, { 5725934, 116409 }, { 5829948, 116876 }, { 5935446, 117342 }, { 6042439, 117808 }, { 6150943, 118273 }, { 6260972, 118738 }, { 6372538, 119202 }, { 6485657, 119665 }, { 6600342, 120128 }, { 6716607, 120591 }, { 6834467, 121053 }, { 6953935, 121514 }, { 7075025, 121976 }, { 7197752, 122436 }, { 7322131, 122896 }, { 7448175, 123356 }, { 7575898, 123815 }, { 7705316, 124274 }, { 7836442, 124733 }, { 7969291, 125191 }, { 8103877, 
125648 }, { 8240216, 126105 }, { 8378321, 126562 }, { 8518208, 127018 }, { 8659890, 127474 }, { 8803384, 127930 }, { 8948702, 128385 }, { 9095861, 128840 }, { 9244875, 129294 }, { 9395760, 129748 }, { 9548529, 130202 }, { 9703198, 130655 }, { 9859782, 131108 }, { 10018296, 131561 }, { 10178755, 132014 }, { 10341174, 132466 }, { 10505569, 132917 }, { 10671954, 133369 }, { 10840345, 133820 }, { 11010757, 134271 }, { 11183206, 134721 }, { 11357706, 135171 }, { 11534274, 135621 }, { 11712924, 136071 }, { 11893673, 136520 }, { 12076536, 136969 }, { 12261527, 137418 }, { 12448664, 137867 }, { 12637961, 138315 }, { 12829435, 138763 }, { 13023101, 139211 }, { 13218974, 139658 }, { 13417071, 140106 }, { 13617407, 140553 }, { 13819999, 140999 }, { 14024862, 141446 }, { 14232012, 141892 }, { 14441465, 142339 }, { 14653238, 142785 }, { 14867346, 143230 }, { 15083805, 143676 }, { 15302632, 144121 }, { 15523842, 144566 }, { 15747453, 145011 }, { 15973479, 145456 }, { 16201939, 145900 }, { 16432847, 146345 }, { 16666221, 146789 }, { 16902076, 147233 }, { 17140429, 147677 }, { 17381297, 148121 }, { 17624696, 148564 }, { 17870643, 149007 }, { 18119154, 149451 }, { 18370247, 149894 }, { 18623936, 150336 }, { 18880241, 150779 }, { 19139176, 151222 }, { 19400759, 151664 }, { 19665007, 152107 }, { 19931936, 152549 }, { 20201564, 152991 }, { 20473907, 153433 }, { 20748982, 153875 }, { 21026807, 154316 }, { 21307399, 154758 }, { 21590773, 155199 }, { 21876949, 155641 }, { 22165941, 156082 }, { 22457769, 156523 }, { 22752449, 156964 }, { 23049999, 157405 }, { 23350435, 157846 }, { 23653774, 158287 }, { 23960036, 158727 }, { 24269236, 159168 }, { 24581392, 159608 }, { 24896521, 160049 }, { 25214642, 160489 }, { 25535772, 160929 }, { 25859927, 161370 }, { 26187127, 161810 }, { 26517388, 162250 }, { 26850728, 162690 }, { 27187165, 163130 }, { 27526716, 163569 }, { 27869400, 164009 }, { 28215234, 164449 }, { 28564236, 164889 }, { 28916423, 165328 }, { 29271815, 165768 }, { 29630428, 166208 }, { 29992281, 166647 }, { 30357392, 167087 }, { 30725779, 167526 }, { 31097459, 167965 }, { 31472452, 168405 }, { 31850774, 168844 }, { 32232445, 169283 }, { 32617482, 169723 }, { 33005904, 170162 }, { 33397730, 170601 }, { 33792976, 171041 }, { 34191663, 171480 }, { 34593807, 171919 }, { 34999428, 172358 }, { 35408544, 172797 }, { 35821174, 173237 }, { 36237335, 173676 }, { 36657047, 174115 }, { 37080329, 174554 }, { 37507197, 174993 }, { 37937673, 175433 }, { 38371773, 175872 }, { 38809517, 176311 }, { 39250924, 176750 }, { 39696012, 177190 }, { 40144800, 177629 }, { 40597308, 178068 }, { 41053553, 178507 }, { 41513554, 178947 }, { 41977332, 179386 }, { 42444904, 179825 }, { 42916290, 180265 }, { 43391509, 180704 }, { 43870579, 181144 }, { 44353520, 181583 }, { 44840352, 182023 }, { 45331092, 182462 }, { 45825761, 182902 }, { 46324378, 183342 }, { 46826961, 183781 }, { 47333531, 184221 }, { 47844106, 184661 }, { 48358706, 185101 }, { 48877350, 185541 }, { 49400058, 185981 }, { 49926849, 186421 }, { 50457743, 186861 }, { 50992759, 187301 }, { 51531916, 187741 }, { 52075235, 188181 }, { 52622735, 188622 }, { 53174435, 189062 }, { 53730355, 189502 }, { 54290515, 189943 }, { 54854935, 190383 }, { 55423634, 190824 }, { 55996633, 191265 }, { 56573950, 191706 }, { 57155606, 192146 }, { 57741621, 192587 }, { 58332014, 193028 }, { 58926806, 193470 }, { 59526017, 193911 }, { 60129666, 194352 }, { 60737774, 194793 }, { 61350361, 195235 }, { 61967446, 195677 }, { 62589050, 196118 }, { 63215194, 196560 }, { 63845897, 197002 }, { 64481179, 
197444 }, { 65121061, 197886 }, { 65765563, 198328 }, { 66414705, 198770 }, { 67068508, 199213 }, { 67726992, 199655 }, { 68390177, 200098 }, { 69058085, 200540 }, { 69730735, 200983 }, { 70408147, 201426 }, { 71090343, 201869 }, { 71777343, 202312 }, { 72469168, 202755 }, { 73165837, 203199 }, { 73867373, 203642 }, { 74573795, 204086 }, { 75285124, 204529 }, { 76001380, 204973 }, { 76722586, 205417 }, { 77448761, 205861 }, { 78179926, 206306 }, { 78916102, 206750 }, { 79657310, 207194 }, { 80403571, 207639 }, { 81154906, 208084 }, { 81911335, 208529 }, { 82672880, 208974 }, { 83439562, 209419 }, { 84211402, 209864 }, { 84988421, 210309 }, { 85770640, 210755 }, { 86558080, 211201 }, { 87350762, 211647 }, { 88148708, 212093 }, { 88951938, 212539 }, { 89760475, 212985 }, { 90574339, 213432 }, { 91393551, 213878 }, { 92218133, 214325 }, { 93048107, 214772 }, { 93883493, 215219 }, { 94724314, 215666 }, { 95570590, 216114 }, { 96422343, 216561 }, { 97279594, 217009 }, { 98142366, 217457 }, { 99010679, 217905 }, { 99884556, 218353 }, { 100764018, 218801 }, { 101649086, 219250 }, { 102539782, 219698 }, { 103436128, 220147 }, { 104338146, 220596 }, { 105245857, 221046 }, { 106159284, 221495 }, { 107078448, 221945 }, { 108003370, 222394 }, { 108934074, 222844 }, { 109870580, 223294 }, { 110812910, 223745 }, { 111761087, 224195 }, { 112715133, 224646 }, { 113675069, 225097 }, { 114640918, 225548 }, { 115612702, 225999 }, { 116590442, 226450 }, { 117574162, 226902 }, { 118563882, 227353 }, { 119559626, 227805 }, { 120561415, 228258 }, { 121569272, 228710 }, { 122583219, 229162 }, { 123603278, 229615 }, { 124629471, 230068 }, { 125661822, 230521 }, { 126700352, 230974 }, { 127745083, 231428 }, { 128796039, 231882 }, { 129853241, 232336 }, { 130916713, 232790 }, { 131986475, 233244 }, { 133062553, 233699 }, { 134144966, 234153 }, { 135233739, 234608 }, { 136328894, 235064 }, { 137430453, 235519 }, { 138538440, 235975 }, { 139652876, 236430 }, { 140773786, 236886 }, { 141901190, 237343 }, { 143035113, 237799 }, { 144175576, 238256 }, { 145322604, 238713 }, { 146476218, 239170 }, { 147636442, 239627 }, { 148803298, 240085 }, { 149976809, 240542 }, { 151156999, 241000 }, { 152343890, 241459 }, { 153537506, 241917 }, { 154737869, 242376 }, { 155945002, 242835 }, { 157158929, 243294 }, { 158379673, 243753 }, { 159607257, 244213 }, { 160841704, 244673 }, { 162083037, 245133 }, { 163331279, 245593 }, { 164586455, 246054 }, { 165848586, 246514 }, { 167117696, 246975 }, { 168393810, 247437 }, { 169676949, 247898 }, { 170967138, 248360 }, { 172264399, 248822 }, { 173568757, 249284 }, { 174880235, 249747 }, { 176198856, 250209 }, { 177524643, 250672 }, { 178857621, 251136 }, { 180197813, 251599 }, { 181545242, 252063 }, { 182899933, 252527 }, { 184261908, 252991 }, { 185631191, 253456 }, { 187007807, 253920 }, { 188391778, 254385 }, { 189783129, 254851 }, { 191181884, 255316 }, { 192588065, 255782 }, { 194001698, 256248 }, { 195422805, 256714 }, { 196851411, 257181 }, { 198287540, 257648 }, { 199731215, 258115 }, { 201182461, 258582 }, { 202641302, 259050 }, { 204107760, 259518 }, { 205581862, 259986 }, { 207063630, 260454 }, { 208553088, 260923 }, { 210050262, 261392 }, { 211555174, 261861 }, { 213067849, 262331 }, { 214588312, 262800 }, { 216116586, 263270 }, { 217652696, 263741 }, { 219196666, 264211 }, { 220748520, 264682 }, { 222308282, 265153 }, { 223875978, 265625 }, { 225451630, 266097 }, { 227035265, 266569 }, { 228626905, 267041 }, { 230226576, 267514 }, { 231834302, 267986 }, { 233450107, 268460 }, { 
235074016, 268933 }, { 236706054, 269407 }, { 238346244, 269881 }, { 239994613, 270355 }, { 241651183, 270830 }, { 243315981, 271305 } }; /* return largest index i such that fval <= lookup[i][small] */ static inline u32 tfrc_binsearch(u32 fval, u8 small) { u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1; while (low < high) { try = (low + high) / 2; if (fval <= tfrc_calc_x_lookup[try][small]) high = try; else low = try + 1; } return high; } /** * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448 * @s: packet size in bytes * @R: RTT scaled by 1000000 (i.e., microseconds) * @p: loss ratio estimate scaled by 1000000 * * Returns X_calc in bytes per second (not scaled). */ u32 tfrc_calc_x(u16 s, u32 R, u32 p) { u16 index; u32 f; u64 result; /* check against invalid parameters and divide-by-zero */ BUG_ON(p > 1000000); /* p must not exceed 100% */ BUG_ON(p == 0); /* f(0) = 0, divide by zero */ if (R == 0) { /* possible divide by zero */ DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc."); return ~0U; } if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ DCCP_WARN("Value of p (%d) below resolution. " "Substituting %d\n", p, TFRC_SMALLEST_P); index = 0; } else /* 0.0001 <= p <= 0.05 */ index = p/TFRC_SMALLEST_P - 1; f = tfrc_calc_x_lookup[index][1]; } else { /* 0.05 < p <= 1.00 */ index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1; f = tfrc_calc_x_lookup[index][0]; } /* * Compute X = s/(R*f(p)) in bytes per second. * Since f(p) and R are both scaled by 1000000, we need to multiply by * 1000000^2. To avoid overflow, the result is computed in two stages. * This works under almost all reasonable operational conditions, for a * wide range of parameters. Yet, should some strange combination of * parameters result in overflow, the use of scaled_div32 will catch * this and return UINT_MAX - which is a logically adequate consequence. */ result = scaled_div(s, R); return scaled_div32(result, f); } /** * tfrc_calc_x_reverse_lookup - try to find p given f(p) * @fvalue: function value to match, scaled by 1000000 * * Returns closest match for p, also scaled by 1000000 */ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) { int index; if (fvalue == 0) /* f(p) = 0 whenever p = 0 */ return 0; /* Error cases. */ if (fvalue < tfrc_calc_x_lookup[0][1]) { DCCP_WARN("fvalue %u smaller than resolution\n", fvalue); return TFRC_SMALLEST_P; } if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) { DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue); return 1000000; } if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) { index = tfrc_binsearch(fvalue, 1); return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; } /* else ... it must be in the coarse-grained column */ index = tfrc_binsearch(fvalue, 0); return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } /** * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% * @loss_event_rate: loss event rate to invert * When @loss_event_rate is large, there is a chance that p is truncated to 0. * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. */ u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) { if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ return 0; if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ return 1000000; return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); }
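The lookup-table comment near the top of this file describes how the two columns are generated from g(q) = 1000000 * f(q/1000000). As a sanity check, that rule can be reproduced in ordinary floating point; the following standalone user-space sketch (not part of the kernel build, assuming only the f(p) definition quoted above) prints the first entry of each column for comparison with tfrc_calc_x_lookup:

/*
 * Standalone sketch: recompute the scaled f(p) values that seed the
 * lookup table.  Build with: cc sketch.c -lm
 */
#include <math.h>
#include <stdio.h>

static double f(double p)
{
	return sqrt(2.0 * p / 3.0) +
	       12.0 * sqrt(3.0 * p / 8.0) * (p + 32.0 * p * p * p);
}

int main(void)
{
	/* coarse column, first entry: p = 1/500 = 0.2%, expect ~37172 */
	printf("g(0.002)  = %.0f\n", 1e6 * f(1.0 / 500.0));
	/* fine column, first entry: p = 0.0001 = 0.01%, expect ~8172 */
	printf("g(0.0001) = %.0f\n", 1e6 * f(0.0001));
	return 0;
}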
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * pm_wakeup.h - Power management wakeup interface * * Copyright (C) 2008 Alan Stern * Copyright (C) 2010 Rafael J. Wysocki, Novell Inc. */ #ifndef _LINUX_PM_WAKEUP_H #define _LINUX_PM_WAKEUP_H #ifndef _DEVICE_H_ # error "please don't include this file directly" #endif #include <linux/types.h> struct wake_irq; /** * struct wakeup_source - Representation of wakeup sources * * @name: Name of the wakeup source * @id: Wakeup source id * @entry: Wakeup source list entry * @lock: Wakeup source lock * @wakeirq: Optional device specific wakeirq * @timer: Wakeup timer list * @timer_expires: Wakeup timer expiration * @total_time: Total time this wakeup source has been active. * @max_time: Maximum time this wakeup source has been continuously active. * @last_time: Monotonic clock when the wakeup source was last touched. * @prevent_sleep_time: Total time this source has been preventing autosleep. * @event_count: Number of signaled wakeup events. * @active_count: Number of times the wakeup source was activated. * @relax_count: Number of times the wakeup source was deactivated. * @expire_count: Number of times the wakeup source's timeout has expired. * @wakeup_count: Number of times the wakeup source might abort suspend. * @dev: Struct device for sysfs statistics about the wakeup source. * @active: Status of the wakeup source. * @autosleep_enabled: Autosleep is active, so update @prevent_sleep_time. */ struct wakeup_source { const char *name; int id; struct list_head entry; spinlock_t lock; struct wake_irq *wakeirq; struct timer_list timer; unsigned long timer_expires; ktime_t total_time; ktime_t max_time; ktime_t last_time; ktime_t start_prevent_time; ktime_t prevent_sleep_time; unsigned long event_count; unsigned long active_count; unsigned long relax_count; unsigned long expire_count; unsigned long wakeup_count; struct device *dev; bool active:1; bool autosleep_enabled:1; }; #define for_each_wakeup_source(ws) \ for ((ws) = wakeup_sources_walk_start(); \ (ws); \ (ws) = wakeup_sources_walk_next((ws))) #ifdef CONFIG_PM_SLEEP /* * Changes to device_may_wakeup take effect on the next pm state change.
*/ static inline bool device_can_wakeup(struct device *dev) { return dev->power.can_wakeup; } static inline bool device_may_wakeup(struct device *dev) { return dev->power.can_wakeup && !!dev->power.wakeup; } static inline bool device_wakeup_path(struct device *dev) { return dev->power.wakeup_path; } static inline void device_set_wakeup_path(struct device *dev) { dev->power.wakeup_path = true; } /* drivers/base/power/wakeup.c */ extern struct wakeup_source *wakeup_source_create(const char *name); extern void wakeup_source_destroy(struct wakeup_source *ws); extern void wakeup_source_add(struct wakeup_source *ws); extern void wakeup_source_remove(struct wakeup_source *ws); extern struct wakeup_source *wakeup_source_register(struct device *dev, const char *name); extern void wakeup_source_unregister(struct wakeup_source *ws); extern int wakeup_sources_read_lock(void); extern void wakeup_sources_read_unlock(int idx); extern struct wakeup_source *wakeup_sources_walk_start(void); extern struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws); extern int device_wakeup_enable(struct device *dev); extern int device_wakeup_disable(struct device *dev); extern void device_set_wakeup_capable(struct device *dev, bool capable); extern int device_set_wakeup_enable(struct device *dev, bool enable); extern void __pm_stay_awake(struct wakeup_source *ws); extern void pm_stay_awake(struct device *dev); extern void __pm_relax(struct wakeup_source *ws); extern void pm_relax(struct device *dev); extern void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard); extern void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard); #else /* !CONFIG_PM_SLEEP */ static inline void device_set_wakeup_capable(struct device *dev, bool capable) { dev->power.can_wakeup = capable; } static inline bool device_can_wakeup(struct device *dev) { return dev->power.can_wakeup; } static inline struct wakeup_source *wakeup_source_create(const char *name) { return NULL; } static inline void wakeup_source_destroy(struct wakeup_source *ws) {} static inline void wakeup_source_add(struct wakeup_source *ws) {} static inline void wakeup_source_remove(struct wakeup_source *ws) {} static inline struct wakeup_source *wakeup_source_register(struct device *dev, const char *name) { return NULL; } static inline void wakeup_source_unregister(struct wakeup_source *ws) {} static inline int device_wakeup_enable(struct device *dev) { dev->power.should_wakeup = true; return 0; } static inline int device_wakeup_disable(struct device *dev) { dev->power.should_wakeup = false; return 0; } static inline int device_set_wakeup_enable(struct device *dev, bool enable) { dev->power.should_wakeup = enable; return 0; } static inline bool device_may_wakeup(struct device *dev) { return dev->power.can_wakeup && dev->power.should_wakeup; } static inline bool device_wakeup_path(struct device *dev) { return false; } static inline void device_set_wakeup_path(struct device *dev) {} static inline void __pm_stay_awake(struct wakeup_source *ws) {} static inline void pm_stay_awake(struct device *dev) {} static inline void __pm_relax(struct wakeup_source *ws) {} static inline void pm_relax(struct device *dev) {} static inline void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard) {} static inline void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard) {} #endif /* !CONFIG_PM_SLEEP */ static inline bool device_awake_path(struct device *dev) { return device_wakeup_path(dev); } static 
inline void device_set_awake_path(struct device *dev) { device_set_wakeup_path(dev); } static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) { return pm_wakeup_ws_event(ws, msec, false); } static inline void pm_wakeup_event(struct device *dev, unsigned int msec) { return pm_wakeup_dev_event(dev, msec, false); } static inline void pm_wakeup_hard_event(struct device *dev) { return pm_wakeup_dev_event(dev, 0, true); } /** * device_init_wakeup - Device wakeup initialization. * @dev: Device to handle. * @enable: Whether or not to enable @dev as a wakeup device. * * By default, most devices should leave wakeup disabled. The exceptions are * devices that everyone expects to be wakeup sources: keyboards, power buttons, * possibly network interfaces, etc. Also, devices that don't generate their * own wakeup requests but merely forward requests from one bus to another * (like PCI bridges) should have wakeup enabled by default. */ static inline int device_init_wakeup(struct device *dev, bool enable) { if (enable) { device_set_wakeup_capable(dev, true); return device_wakeup_enable(dev); } else { device_wakeup_disable(dev); device_set_wakeup_capable(dev, false); return 0; } } #endif /* _LINUX_PM_WAKEUP_H */
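As a usage sketch (hypothetical driver code, not taken from this header): a driver whose interrupt should be able to abort or wake a system suspend typically marks the device wakeup-capable at probe time with device_init_wakeup() and reports activity with pm_wakeup_event() from its handler. Note that, because of the #error guard above, the declarations are intended to be picked up via <linux/device.h> rather than by including this header directly.

/*
 * Hypothetical driver snippet (illustration only): wakeup-capable device
 * whose interrupt handler should keep the system from suspending.
 */
#include <linux/device.h>	/* pulls in the wakeup declarations above */
#include <linux/interrupt.h>

static int example_probe_wakeup(struct device *dev)
{
	/* Keyboards, power buttons and the like come up wakeup-enabled. */
	return device_init_wakeup(dev, true);
}

static irqreturn_t example_wake_irq(int irq, void *data)
{
	struct device *dev = data;

	/* Report a wakeup event so an in-progress suspend can be aborted. */
	if (device_may_wakeup(dev))
		pm_wakeup_event(dev, 0);

	return IRQ_HANDLED;
}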
2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 
3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 
// SPDX-License-Identifier: GPL-2.0

#include <linux/sizes.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
#include "space-info.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"

#ifdef CONFIG_BTRFS_DEBUG
int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;

	return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
		block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
	       (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
		block_group->flags & BTRFS_BLOCK_GROUP_DATA);
}
#endif

/*
 * Return target flags in extended format or 0 if restripe for this chunk_type
 * is not in progress
 *
 * Should be called with balance_lock held
 */
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	u64 target = 0;

	if (!bctl)
		return 0;

	if (flags & BTRFS_BLOCK_GROUP_DATA &&
	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
	}

	return target;
}

/*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Return reduced profile in chunk format. If profile changing is in progress
 * (either running or paused) picks the target profile (if it's already
 * available), otherwise falls back to plain reducing.
 */
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 num_devices = fs_info->fs_devices->rw_devices;
	u64 target;
	u64 raid_type;
	u64 allowed = 0;

	/*
	 * See if restripe for this chunk_type is in progress, if so try to
	 * reduce to the target profile
	 */
	spin_lock(&fs_info->balance_lock);
	target = get_restripe_target(fs_info, flags);
	if (target) {
		spin_unlock(&fs_info->balance_lock);
		return extended_to_chunk(target);
	}
	spin_unlock(&fs_info->balance_lock);

	/* First, mask out the RAID levels which aren't possible */
	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
			allowed |= btrfs_raid_array[raid_type].bg_flag;
	}
	allowed &= flags;

	/*
	 * Select the highest-redundancy RAID level.
	 */
	if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
		allowed = BTRFS_BLOCK_GROUP_RAID1C4;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
		allowed = BTRFS_BLOCK_GROUP_RAID6;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
		allowed = BTRFS_BLOCK_GROUP_RAID1C3;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
		allowed = BTRFS_BLOCK_GROUP_RAID5;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
		allowed = BTRFS_BLOCK_GROUP_RAID10;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
		allowed = BTRFS_BLOCK_GROUP_RAID1;
	else if (allowed & BTRFS_BLOCK_GROUP_DUP)
		allowed = BTRFS_BLOCK_GROUP_DUP;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
		allowed = BTRFS_BLOCK_GROUP_RAID0;

	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;

	return extended_to_chunk(flags | allowed);
}

u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
	unsigned seq;
	u64 flags;

	do {
		flags = orig_flags;
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			flags |= fs_info->avail_data_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
			flags |= fs_info->avail_system_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
			flags |= fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	return btrfs_reduce_alloc_profile(fs_info, flags);
}

void btrfs_get_block_group(struct btrfs_block_group *cache)
{
	refcount_inc(&cache->refs);
}

void btrfs_put_block_group(struct btrfs_block_group *cache)
{
	if (refcount_dec_and_test(&cache->refs)) {
		WARN_ON(cache->pinned > 0);
		/*
		 * If there was a failure to cleanup a log tree, very likely due
		 * to an IO failure on a writeback attempt of one or more of its
		 * extent buffers, we could not do proper (and cheap) unaccounting
		 * of their reserved space, so don't warn on reserved > 0 in that
		 * case.
		 */
		if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
		    !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
			WARN_ON(cache->reserved > 0);

		/*
		 * A block_group shouldn't be on the discard_list anymore.
		 * Remove the block_group from the discard_list to prevent us
		 * from causing a panic due to NULL pointer dereference.
*/ if (WARN_ON(!list_empty(&cache->discard_list))) btrfs_discard_cancel_work(&cache->fs_info->discard_ctl, cache); kfree(cache->free_space_ctl); btrfs_free_chunk_map(cache->physical_map); kfree(cache); } } /* * This adds the block group to the fs_info rb tree for the block group cache */ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, struct btrfs_block_group *block_group) { struct rb_node **p; struct rb_node *parent = NULL; struct btrfs_block_group *cache; bool leftmost = true; ASSERT(block_group->length != 0); write_lock(&info->block_group_cache_lock); p = &info->block_group_cache_tree.rb_root.rb_node; while (*p) { parent = *p; cache = rb_entry(parent, struct btrfs_block_group, cache_node); if (block_group->start < cache->start) { p = &(*p)->rb_left; } else if (block_group->start > cache->start) { p = &(*p)->rb_right; leftmost = false; } else { write_unlock(&info->block_group_cache_lock); return -EEXIST; } } rb_link_node(&block_group->cache_node, parent, p); rb_insert_color_cached(&block_group->cache_node, &info->block_group_cache_tree, leftmost); write_unlock(&info->block_group_cache_lock); return 0; } /* * This will return the block group at or after bytenr if contains is 0, else * it will return the block group that contains the bytenr */ static struct btrfs_block_group *block_group_cache_tree_search( struct btrfs_fs_info *info, u64 bytenr, int contains) { struct btrfs_block_group *cache, *ret = NULL; struct rb_node *n; u64 end, start; read_lock(&info->block_group_cache_lock); n = info->block_group_cache_tree.rb_root.rb_node; while (n) { cache = rb_entry(n, struct btrfs_block_group, cache_node); end = cache->start + cache->length - 1; start = cache->start; if (bytenr < start) { if (!contains && (!ret || start < ret->start)) ret = cache; n = n->rb_left; } else if (bytenr > start) { if (contains && bytenr <= end) { ret = cache; break; } n = n->rb_right; } else { ret = cache; break; } } if (ret) btrfs_get_block_group(ret); read_unlock(&info->block_group_cache_lock); return ret; } /* * Return the block group that starts at or after bytenr */ struct btrfs_block_group *btrfs_lookup_first_block_group( struct btrfs_fs_info *info, u64 bytenr) { return block_group_cache_tree_search(info, bytenr, 0); } /* * Return the block group that contains the given bytenr */ struct btrfs_block_group *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr) { return block_group_cache_tree_search(info, bytenr, 1); } struct btrfs_block_group *btrfs_next_block_group( struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; struct rb_node *node; read_lock(&fs_info->block_group_cache_lock); /* If our block group was removed, we need a full search. */ if (RB_EMPTY_NODE(&cache->cache_node)) { const u64 next_bytenr = cache->start + cache->length; read_unlock(&fs_info->block_group_cache_lock); btrfs_put_block_group(cache); return btrfs_lookup_first_block_group(fs_info, next_bytenr); } node = rb_next(&cache->cache_node); btrfs_put_block_group(cache); if (node) { cache = rb_entry(node, struct btrfs_block_group, cache_node); btrfs_get_block_group(cache); } else cache = NULL; read_unlock(&fs_info->block_group_cache_lock); return cache; } /* * Check if we can do a NOCOW write for a given extent. * * @fs_info: The filesystem information object. * @bytenr: Logical start address of the extent. 
* * Check if we can do a NOCOW write for the given extent, and increments the * number of NOCOW writers in the block group that contains the extent, as long * as the block group exists and it's currently not in read-only mode. * * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller * is responsible for calling btrfs_dec_nocow_writers() later. * * Or NULL if we can not do a NOCOW write */ struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_block_group *bg; bool can_nocow = true; bg = btrfs_lookup_block_group(fs_info, bytenr); if (!bg) return NULL; spin_lock(&bg->lock); if (bg->ro) can_nocow = false; else atomic_inc(&bg->nocow_writers); spin_unlock(&bg->lock); if (!can_nocow) { btrfs_put_block_group(bg); return NULL; } /* No put on block group, done by btrfs_dec_nocow_writers(). */ return bg; } /* * Decrement the number of NOCOW writers in a block group. * * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), * and on the block group returned by that call. Typically this is called after * creating an ordered extent for a NOCOW write, to prevent races with scrub and * relocation. * * After this call, the caller should not use the block group anymore. It it wants * to use it, then it should get a reference on it before calling this function. */ void btrfs_dec_nocow_writers(struct btrfs_block_group *bg) { if (atomic_dec_and_test(&bg->nocow_writers)) wake_up_var(&bg->nocow_writers); /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */ btrfs_put_block_group(bg); } void btrfs_wait_nocow_writers(struct btrfs_block_group *bg) { wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); } void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start) { struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, start); ASSERT(bg); if (atomic_dec_and_test(&bg->reservations)) wake_up_var(&bg->reservations); btrfs_put_block_group(bg); } void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg) { struct btrfs_space_info *space_info = bg->space_info; ASSERT(bg->ro); if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) return; /* * Our block group is read only but before we set it to read only, * some task might have had allocated an extent from it already, but it * has not yet created a respective ordered extent (and added it to a * root's list of ordered extents). * Therefore wait for any task currently allocating extents, since the * block group's reservations counter is incremented while a read lock * on the groups' semaphore is held and decremented after releasing * the read access on that semaphore and creating the ordered extent. */ down_write(&space_info->groups_sem); up_write(&space_info->groups_sem); wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); } struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache) { struct btrfs_caching_control *ctl; spin_lock(&cache->lock); if (!cache->caching_ctl) { spin_unlock(&cache->lock); return NULL; } ctl = cache->caching_ctl; refcount_inc(&ctl->count); spin_unlock(&cache->lock); return ctl; } void btrfs_put_caching_control(struct btrfs_caching_control *ctl) { if (refcount_dec_and_test(&ctl->count)) kfree(ctl); } /* * When we wait for progress in the block group caching, its because our * allocation attempt failed at least once. So, we must sleep and let some * progress happen before we try again. 
* * This function will sleep at least once waiting for new free space to show * up, and then it will check the block group free space numbers for our min * num_bytes. Another option is to have it go ahead and look in the rbtree for * a free extent of a given size, but this is a good start. * * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using * any of the information in this block group. */ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes) { struct btrfs_caching_control *caching_ctl; int progress; caching_ctl = btrfs_get_caching_control(cache); if (!caching_ctl) return; /* * We've already failed to allocate from this block group, so even if * there's enough space in the block group it isn't contiguous enough to * allow for an allocation, so wait for at least the next wakeup tick, * or for the thing to be done. */ progress = atomic_read(&caching_ctl->progress); wait_event(caching_ctl->wait, btrfs_block_group_done(cache) || (progress != atomic_read(&caching_ctl->progress) && (cache->free_space_ctl->free_space >= num_bytes))); btrfs_put_caching_control(caching_ctl); } static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache, struct btrfs_caching_control *caching_ctl) { wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0; } static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) { struct btrfs_caching_control *caching_ctl; int ret; caching_ctl = btrfs_get_caching_control(cache); if (!caching_ctl) return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); btrfs_put_caching_control(caching_ctl); return ret; } #ifdef CONFIG_BTRFS_DEBUG static void fragment_free_space(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; u64 start = block_group->start; u64 len = block_group->length; u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? fs_info->nodesize : fs_info->sectorsize; u64 step = chunk << 1; while (len > chunk) { btrfs_remove_free_space(block_group, start, chunk); start += step; if (len < step) len = 0; else len -= step; } } #endif /* * Add a free space range to the in memory free space cache of a block group. * This checks if the range contains super block locations and any such * locations are not added to the free space cache. * * @block_group: The target block group. * @start: Start offset of the range. * @end: End offset of the range (exclusive). * @total_added_ret: Optional pointer to return the total amount of space * added to the block group's free space cache. * * Returns 0 on success or < 0 on error. 
*/ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end, u64 *total_added_ret) { struct btrfs_fs_info *info = block_group->fs_info; u64 extent_start, extent_end, size; int ret; if (total_added_ret) *total_added_ret = 0; while (start < end) { if (!find_first_extent_bit(&info->excluded_extents, start, &extent_start, &extent_end, EXTENT_DIRTY | EXTENT_UPTODATE, NULL)) break; if (extent_start <= start) { start = extent_end + 1; } else if (extent_start > start && extent_start < end) { size = extent_start - start; ret = btrfs_add_free_space_async_trimmed(block_group, start, size); if (ret) return ret; if (total_added_ret) *total_added_ret += size; start = extent_end + 1; } else { break; } } if (start < end) { size = end - start; ret = btrfs_add_free_space_async_trimmed(block_group, start, size); if (ret) return ret; if (total_added_ret) *total_added_ret += size; } return 0; } /* * Get an arbitrary extent item index / max_index through the block group * * @block_group the block group to sample from * @index: the integral step through the block group to grab from * @max_index: the granularity of the sampling * @key: return value parameter for the item we find * * Pre-conditions on indices: * 0 <= index <= max_index * 0 < max_index * * Returns: 0 on success, 1 if the search didn't yield a useful item, negative * error code on error. */ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, struct btrfs_block_group *block_group, int index, int max_index, struct btrfs_key *found_key) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root; u64 search_offset; u64 search_end = block_group->start + block_group->length; struct btrfs_path *path; struct btrfs_key search_key; int ret = 0; ASSERT(index >= 0); ASSERT(index <= max_index); ASSERT(max_index > 0); lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); path = btrfs_alloc_path(); if (!path) return -ENOMEM; extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET)); path->skip_locking = 1; path->search_commit_root = 1; path->reada = READA_FORWARD; search_offset = index * div_u64(block_group->length, max_index); search_key.objectid = block_group->start + search_offset; search_key.type = BTRFS_EXTENT_ITEM_KEY; search_key.offset = 0; btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) { /* Success; sampled an extent item in the block group */ if (found_key->type == BTRFS_EXTENT_ITEM_KEY && found_key->objectid >= block_group->start && found_key->objectid + found_key->offset <= search_end) break; /* We can't possibly find a valid extent item anymore */ if (found_key->objectid >= search_end) { ret = 1; break; } } lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); btrfs_free_path(path); return ret; } /* * Best effort attempt to compute a block group's size class while caching it. * * @block_group: the block group we are caching * * We cannot infer the size class while adding free space extents, because that * logic doesn't care about contiguous file extents (it doesn't differentiate * between a 100M extent and 100 contiguous 1M extents). So we need to read the * file extent items. Reading all of them is quite wasteful, because usually * only a handful are enough to give a good answer. Therefore, we just grab 5 of * them at even steps through the block group and pick the smallest size class * we see. 
Since size class is best effort, and not guaranteed in general, * inaccuracy is acceptable. * * To be more explicit about why this algorithm makes sense: * * If we are caching in a block group from disk, then there are three major cases * to consider: * 1. the block group is well behaved and all extents in it are the same size * class. * 2. the block group is mostly one size class with rare exceptions for last * ditch allocations * 3. the block group was populated before size classes and can have a totally * arbitrary mix of size classes. * * In case 1, looking at any extent in the block group will yield the correct * result. For the mixed cases, taking the minimum size class seems like a good * approximation, since gaps from frees will be usable to the size class. For * 2., a small handful of file extents is likely to yield the right answer. For * 3, we can either read every file extent, or admit that this is best effort * anyway and try to stay fast. * * Returns: 0 on success, negative error code on error. */ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_key key; int i; u64 min_size = block_group->length; enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; int ret; if (!btrfs_block_group_should_use_size_class(block_group)) return 0; lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); for (i = 0; i < 5; ++i) { ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); if (ret < 0) goto out; if (ret > 0) continue; min_size = min_t(u64, min_size, key.offset); size_class = btrfs_calc_block_group_size_class(min_size); } if (size_class != BTRFS_BG_SZ_NONE) { spin_lock(&block_group->lock); block_group->size_class = size_class; spin_unlock(&block_group->lock); } out: return ret; } static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; u64 total_found = 0; u64 last = 0; u32 nritems; int ret; bool wakeup = true; path = btrfs_alloc_path(); if (!path) return -ENOMEM; last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET); extent_root = btrfs_extent_root(fs_info, last); #ifdef CONFIG_BTRFS_DEBUG /* * If we're fragmenting we don't want to make anybody think we can * allocate from this block group until we've had a chance to fragment * the free space. */ if (btrfs_should_fragment_free_space(block_group)) wakeup = false; #endif /* * We don't want to deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent * root to add free space. 
So we skip locking and search the commit * root, since its read-only */ path->skip_locking = 1; path->search_commit_root = 1; path->reada = READA_FORWARD; key.objectid = last; key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); while (1) { if (btrfs_fs_closing(fs_info) > 1) { last = (u64)-1; break; } if (path->slots[0] < nritems) { btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); } else { ret = btrfs_find_next_key(extent_root, path, &key, 0, 0); if (ret) break; if (need_resched() || rwsem_is_contended(&fs_info->commit_root_sem)) { btrfs_release_path(path); up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); cond_resched(); mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); goto next; } ret = btrfs_next_leaf(extent_root, path); if (ret < 0) goto out; if (ret) break; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); continue; } if (key.objectid < last) { key.objectid = last; key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; btrfs_release_path(path); goto next; } if (key.objectid < block_group->start) { path->slots[0]++; continue; } if (key.objectid >= block_group->start + block_group->length) break; if (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY) { u64 space_added; ret = btrfs_add_new_free_space(block_group, last, key.objectid, &space_added); if (ret) goto out; total_found += space_added; if (key.type == BTRFS_METADATA_ITEM_KEY) last = key.objectid + fs_info->nodesize; else last = key.objectid + key.offset; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; if (wakeup) { atomic_inc(&caching_ctl->progress); wake_up(&caching_ctl->wait); } } } path->slots[0]++; } ret = btrfs_add_new_free_space(block_group, last, block_group->start + block_group->length, NULL); out: btrfs_free_path(path); return ret; } static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) { clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, bg->start + bg->length - 1, EXTENT_UPTODATE); } static noinline void caching_thread(struct btrfs_work *work) { struct btrfs_block_group *block_group; struct btrfs_fs_info *fs_info; struct btrfs_caching_control *caching_ctl; int ret; caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; fs_info = block_group->fs_info; mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); load_block_group_size_class(caching_ctl, block_group); if (btrfs_test_opt(fs_info, SPACE_CACHE)) { ret = load_free_space_cache(block_group); if (ret == 1) { ret = 0; goto done; } /* * We failed to load the space cache, set ourselves to * CACHE_STARTED and carry on. */ spin_lock(&block_group->lock); block_group->cached = BTRFS_CACHE_STARTED; spin_unlock(&block_group->lock); wake_up(&caching_ctl->wait); } /* * If we are in the transaction that populated the free space tree we * can't actually cache from the free space tree as our commit root and * real root are the same, so we could change the contents of the blocks * while caching. Instead do the slow caching in this case, and after * the transaction has committed we will be safe. 
*/ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags))) ret = load_free_space_tree(caching_ctl); else ret = load_extent_tree_free(caching_ctl); done: spin_lock(&block_group->lock); block_group->caching_ctl = NULL; block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); #ifdef CONFIG_BTRFS_DEBUG if (btrfs_should_fragment_free_space(block_group)) { u64 bytes_used; spin_lock(&block_group->space_info->lock); spin_lock(&block_group->lock); bytes_used = block_group->length - block_group->used; block_group->space_info->bytes_used += bytes_used >> 1; spin_unlock(&block_group->lock); spin_unlock(&block_group->space_info->lock); fragment_free_space(block_group); } #endif up_read(&fs_info->commit_root_sem); btrfs_free_excluded_extents(block_group); mutex_unlock(&caching_ctl->mutex); wake_up(&caching_ctl->wait); btrfs_put_caching_control(caching_ctl); btrfs_put_block_group(block_group); } int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl = NULL; int ret = 0; /* Allocator for zoned filesystems does not use the cache at all */ if (btrfs_is_zoned(fs_info)) return 0; caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); if (!caching_ctl) return -ENOMEM; INIT_LIST_HEAD(&caching_ctl->list); mutex_init(&caching_ctl->mutex); init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; refcount_set(&caching_ctl->count, 2); atomic_set(&caching_ctl->progress, 0); btrfs_init_work(&caching_ctl->work, caching_thread, NULL); spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { kfree(caching_ctl); caching_ctl = cache->caching_ctl; if (caching_ctl) refcount_inc(&caching_ctl->count); spin_unlock(&cache->lock); goto out; } WARN_ON(cache->caching_ctl); cache->caching_ctl = caching_ctl; cache->cached = BTRFS_CACHE_STARTED; spin_unlock(&cache->lock); write_lock(&fs_info->block_group_cache_lock); refcount_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); write_unlock(&fs_info->block_group_cache_lock); btrfs_get_block_group(cache); btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); out: if (wait && caching_ctl) ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); if (caching_ctl) btrfs_put_caching_control(caching_ctl); return ret; } static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { u64 extra_flags = chunk_to_extended(flags) & BTRFS_EXTENDED_PROFILE_MASK; write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) fs_info->avail_system_alloc_bits &= ~extra_flags; write_sequnlock(&fs_info->profiles_lock); } /* * Clear incompat bits for the following feature(s): * * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group * in the whole filesystem * * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups */ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) { bool found_raid56 = false; bool found_raid1c34 = false; if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) || (flags & BTRFS_BLOCK_GROUP_RAID1C3) || (flags & BTRFS_BLOCK_GROUP_RAID1C4)) { struct list_head *head = &fs_info->space_info; struct btrfs_space_info *sinfo; list_for_each_entry_rcu(sinfo, head, list) 
{ down_read(&sinfo->groups_sem); if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) found_raid56 = true; if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) found_raid56 = true; if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3])) found_raid1c34 = true; if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4])) found_raid1c34 = true; up_read(&sinfo->groups_sem); } if (!found_raid56) btrfs_clear_fs_incompat(fs_info, RAID56); if (!found_raid1c34) btrfs_clear_fs_incompat(fs_info, RAID1C34); } } static int remove_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root; struct btrfs_key key; int ret; root = btrfs_block_group_root(fs_info); key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = block_group->length; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) ret = -ENOENT; if (ret < 0) return ret; ret = btrfs_del_item(trans, root, path); return ret; } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path; struct btrfs_block_group *block_group; struct btrfs_free_cluster *cluster; struct inode *inode; struct kobject *kobj = NULL; int ret; int index; int factor; struct btrfs_caching_control *caching_ctl = NULL; bool remove_map; bool remove_rsv = false; block_group = btrfs_lookup_block_group(fs_info, map->start); BUG_ON(!block_group); BUG_ON(!block_group->ro); trace_btrfs_remove_block_group(block_group); /* * Free the reserved super bytes from this block group before * remove it. */ btrfs_free_excluded_extents(block_group); btrfs_free_ref_tree_range(fs_info, block_group->start, block_group->length); index = btrfs_bg_flags_to_raid_index(block_group->flags); factor = btrfs_bg_type_to_factor(block_group->flags); /* make sure this block group isn't part of an allocation cluster */ cluster = &fs_info->data_alloc_cluster; spin_lock(&cluster->refill_lock); btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&cluster->refill_lock); /* * make sure this block group isn't part of a metadata * allocation cluster */ cluster = &fs_info->meta_alloc_cluster; spin_lock(&cluster->refill_lock); btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&cluster->refill_lock); btrfs_clear_treelog_bg(block_group); btrfs_clear_data_reloc_bg(block_group); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } /* * get the inode first so any iput calls done for the io_list * aren't the final iput (no unlinks allowed now) */ inode = lookup_free_space_inode(block_group, path); mutex_lock(&trans->transaction->cache_write_mutex); /* * Make sure our free space cache IO is done before removing the * free space inode */ spin_lock(&trans->transaction->dirty_bgs_lock); if (!list_empty(&block_group->io_list)) { list_del_init(&block_group->io_list); WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); spin_unlock(&trans->transaction->dirty_bgs_lock); btrfs_wait_cache_io(trans, block_group, path); btrfs_put_block_group(block_group); spin_lock(&trans->transaction->dirty_bgs_lock); } if (!list_empty(&block_group->dirty_list)) { list_del_init(&block_group->dirty_list); remove_rsv = true; btrfs_put_block_group(block_group); } spin_unlock(&trans->transaction->dirty_bgs_lock); mutex_unlock(&trans->transaction->cache_write_mutex); ret = btrfs_remove_free_space_inode(trans, 
inode, block_group); if (ret) goto out; write_lock(&fs_info->block_group_cache_lock); rb_erase_cached(&block_group->cache_node, &fs_info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); /* Once for the block groups rbtree */ btrfs_put_block_group(block_group); write_unlock(&fs_info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); /* * we must use list_del_init so people can check to see if they * are still on the list after taking the semaphore */ list_del_init(&block_group->list); if (list_empty(&block_group->space_info->block_groups[index])) { kobj = block_group->space_info->block_group_kobjs[index]; block_group->space_info->block_group_kobjs[index] = NULL; clear_avail_alloc_bits(fs_info, block_group->flags); } up_write(&block_group->space_info->groups_sem); clear_incompat_bg_bits(fs_info, block_group->flags); if (kobj) { kobject_del(kobj); kobject_put(kobj); } if (block_group->cached == BTRFS_CACHE_STARTED) btrfs_wait_block_group_cache_done(block_group); write_lock(&fs_info->block_group_cache_lock); caching_ctl = btrfs_get_caching_control(block_group); if (!caching_ctl) { struct btrfs_caching_control *ctl; list_for_each_entry(ctl, &fs_info->caching_block_groups, list) { if (ctl->block_group == block_group) { caching_ctl = ctl; refcount_inc(&caching_ctl->count); break; } } } if (caching_ctl) list_del_init(&caching_ctl->list); write_unlock(&fs_info->block_group_cache_lock); if (caching_ctl) { /* Once for the caching bgs list and once for us. */ btrfs_put_caching_control(caching_ctl); btrfs_put_caching_control(caching_ctl); } spin_lock(&trans->transaction->dirty_bgs_lock); WARN_ON(!list_empty(&block_group->dirty_list)); WARN_ON(!list_empty(&block_group->io_list)); spin_unlock(&trans->transaction->dirty_bgs_lock); btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { WARN_ON(block_group->space_info->total_bytes < block_group->length); WARN_ON(block_group->space_info->bytes_readonly < block_group->length - block_group->zone_unusable); WARN_ON(block_group->space_info->bytes_zone_unusable < block_group->zone_unusable); WARN_ON(block_group->space_info->disk_total < block_group->length * factor); } block_group->space_info->total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); block_group->space_info->bytes_zone_unusable -= block_group->zone_unusable; block_group->space_info->disk_total -= block_group->length * factor; spin_unlock(&block_group->space_info->lock); /* * Remove the free space for the block group from the free space tree * and the block group's item from the extent tree before marking the * block group as removed. This is to prevent races with tasks that * freeze and unfreeze a block group, this task and another task * allocating a new block group - the unfreeze task ends up removing * the block group's extent map before the task calling this function * deletes the block group item from the extent tree, allowing for * another task to attempt to create another block group with the same * item key (and failing with -EEXIST and a transaction abort). 
*/ ret = remove_block_group_free_space(trans, block_group); if (ret) goto out; ret = remove_block_group_item(trans, path, block_group); if (ret < 0) goto out; spin_lock(&block_group->lock); set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags); /* * At this point trimming or scrub can't start on this block group, * because we removed the block group from the rbtree * fs_info->block_group_cache_tree so no one can't find it anymore and * even if someone already got this block group before we removed it * from the rbtree, they have already incremented block_group->frozen - * if they didn't, for the trimming case they won't find any free space * entries because we already removed them all when we called * btrfs_remove_free_space_cache(). * * And we must not remove the chunk map from the fs_info->mapping_tree * to prevent the same logical address range and physical device space * ranges from being reused for a new block group. This is needed to * avoid races with trimming and scrub. * * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is * completely transactionless, so while it is trimming a range the * currently running transaction might finish and a new one start, * allowing for new block groups to be created that can reuse the same * physical device locations unless we take this special care. * * There may also be an implicit trim operation if the file system * is mounted with -odiscard. The same protections must remain * in place until the extents have been discarded completely when * the transaction commit has completed. */ remove_map = (atomic_read(&block_group->frozen) == 0); spin_unlock(&block_group->lock); if (remove_map) btrfs_remove_chunk_map(fs_info, map); out: /* Once for the lookup reference */ btrfs_put_block_group(block_group); if (remove_rsv) btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); btrfs_free_path(path); return ret; } struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset) { struct btrfs_root *root = btrfs_block_group_root(fs_info); struct btrfs_chunk_map *map; unsigned int num_items; map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); ASSERT(map != NULL); ASSERT(map->start == chunk_offset); /* * We need to reserve 3 + N units from the metadata space info in order * to remove a block group (done at btrfs_remove_chunk() and at * btrfs_remove_block_group()), which are used for: * * 1 unit for adding the free space inode's orphan (located in the tree * of tree roots). * 1 unit for deleting the block group item (located in the extent * tree). * 1 unit for deleting the free space item (located in tree of tree * roots). * N units for deleting N device extent items corresponding to each * stripe (located in the device tree). * * In order to remove a block group we also need to reserve units in the * system space info in order to update the chunk tree (update one or * more device items and remove one chunk item), but this is done at * btrfs_remove_chunk() through a call to check_system_chunk(). */ num_items = 3 + map->num_stripes; btrfs_free_chunk_map(map); return btrfs_start_transaction_fallback_global_rsv(root, num_items); } /* * Mark block group @cache read-only, so later write won't happen to block * group @cache. * * If @force is not set, this function will only mark the block group readonly * if we have enough free space (1M) in other metadata/system block groups. * If @force is not set, this function will mark the block group readonly * without checking free space. 
* * NOTE: This function doesn't care if other block groups can contain all the * data in this block group. That check should be done by relocation routine, * not this function. */ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; int ret = -ENOSPC; spin_lock(&sinfo->lock); spin_lock(&cache->lock); if (cache->swap_extents) { ret = -ETXTBSY; goto out; } if (cache->ro) { cache->ro++; ret = 0; goto out; } num_bytes = cache->length - cache->reserved - cache->pinned - cache->bytes_super - cache->zone_unusable - cache->used; /* * Data never overcommits, even in mixed mode, so do just the straight * check of left over space in how much we have allocated. */ if (force) { ret = 0; } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) { u64 sinfo_used = btrfs_space_info_used(sinfo, true); /* * Here we make sure if we mark this bg RO, we still have enough * free space as buffer. */ if (sinfo_used + num_bytes <= sinfo->total_bytes) ret = 0; } else { /* * We overcommit metadata, so we need to do the * btrfs_can_overcommit check here, and we need to pass in * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of * leeway to allow us to mark this block group as read only. */ if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes, BTRFS_RESERVE_NO_FLUSH)) ret = 0; } if (!ret) { sinfo->bytes_readonly += num_bytes; if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes to readonly */ sinfo->bytes_readonly += cache->zone_unusable; sinfo->bytes_zone_unusable -= cache->zone_unusable; cache->zone_unusable = 0; } cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); } out: spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start); btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); } return ret; } static bool clean_pinned_extents(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; struct btrfs_transaction *prev_trans = NULL; const u64 start = bg->start; const u64 end = start + bg->length - 1; int ret; spin_lock(&fs_info->trans_lock); if (trans->transaction->list.prev != &fs_info->trans_list) { prev_trans = list_last_entry(&trans->transaction->list, struct btrfs_transaction, list); refcount_inc(&prev_trans->use_count); } spin_unlock(&fs_info->trans_lock); /* * Hold the unused_bg_unpin_mutex lock to avoid racing with * btrfs_finish_extent_commit(). If we are at transaction N, another * task might be running finish_extent_commit() for the previous * transaction N - 1, and have seen a range belonging to the block * group in pinned_extents before we were able to clear the whole block * group range from pinned_extents. This means that task can lookup for * the block group after we unpinned it from pinned_extents and removed * it, leading to a BUG_ON() at unpin_extent_range(). */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) { ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, EXTENT_DIRTY); if (ret) goto out; } ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, EXTENT_DIRTY); out: mutex_unlock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) btrfs_put_transaction(prev_trans); return ret == 0; } /* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them. 
*/ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) { struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC); int ret = 0; if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; if (btrfs_fs_closing(fs_info)) return; /* * Long running balances can keep us blocked here for eternity, so * simply skip deletion if we're unable to get the mutex. */ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) return; spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { int trimming; block_group = list_first_entry(&fs_info->unused_bgs, struct btrfs_block_group, bg_list); list_del_init(&block_group->bg_list); space_info = block_group->space_info; if (ret || btrfs_mixed_space_info(space_info)) { btrfs_put_block_group(block_group); continue; } spin_unlock(&fs_info->unused_bgs_lock); btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); /* * Async discard moves the final block group discard to be prior * to the unused_bgs code path. Therefore, if it's not fully * trimmed, punt it back to the async discard lists. */ if (btrfs_test_opt(fs_info, DISCARD_ASYNC) && !btrfs_is_free_space_trimmed(block_group)) { trace_btrfs_skip_unused_block_group(block_group); up_write(&space_info->groups_sem); /* Requeue if we failed because of async discard */ btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); goto next; } spin_lock(&block_group->lock); if (block_group->reserved || block_group->pinned || block_group->used || block_group->ro || list_is_singular(&block_group->list)) { /* * We want to bail if we made new allocations or have * outstanding allocations in this block group. We do * the ro check in case balance is currently acting on * this block group. */ trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); up_write(&space_info->groups_sem); goto next; } spin_unlock(&block_group->lock); /* We don't want to force the issue, only flip if it's ok. */ ret = inc_block_group_ro(block_group, 0); up_write(&space_info->groups_sem); if (ret < 0) { ret = 0; goto next; } ret = btrfs_zone_finish(block_group); if (ret < 0) { btrfs_dec_block_group_ro(block_group); if (ret == -EAGAIN) ret = 0; goto next; } /* * Want to do this before we do anything else so we can recover * properly if we fail to join the transaction. */ trans = btrfs_start_trans_remove_block_group(fs_info, block_group->start); if (IS_ERR(trans)) { btrfs_dec_block_group_ro(block_group); ret = PTR_ERR(trans); goto next; } /* * We could have pending pinned extents for this block group, * just delete them, we don't care about them anymore. */ if (!clean_pinned_extents(trans, block_group)) { btrfs_dec_block_group_ro(block_group); goto end_trans; } /* * At this point, the block_group is read only and should fail * new allocations. However, btrfs_finish_extent_commit() can * cause this block_group to be placed back on the discard * lists because now the block_group isn't fully discarded. * Bail here and try again later after discarding everything. 
*/ spin_lock(&fs_info->discard_ctl.lock); if (!list_empty(&block_group->discard_list)) { spin_unlock(&fs_info->discard_ctl.lock); btrfs_dec_block_group_ro(block_group); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); goto end_trans; } spin_unlock(&fs_info->discard_ctl.lock); /* Reset pinned so btrfs_put_block_group doesn't complain */ spin_lock(&space_info->lock); spin_lock(&block_group->lock); btrfs_space_info_update_bytes_pinned(fs_info, space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; block_group->pinned = 0; spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); /* * The normal path here is an unused block group is passed here, * then trimming is handled in the transaction commit path. * Async discard interposes before this to do the trimming * before coming down the unused block group path as trimming * will no longer be done later in the transaction commit path. */ if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)) goto flip_async; /* * DISCARD can flip during remount. On zoned filesystems, we * need to reset sequential-required zones. */ trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) || btrfs_is_zoned(fs_info); /* Implicit trim during transaction commit. */ if (trimming) btrfs_freeze_block_group(block_group); /* * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong. */ ret = btrfs_remove_chunk(trans, block_group->start); if (ret) { if (trimming) btrfs_unfreeze_block_group(block_group); goto end_trans; } /* * If we're not mounted with -odiscard, we can just forget * about this block group. Otherwise we'll need to wait * until transaction commit to do the actual discard. */ if (trimming) { spin_lock(&fs_info->unused_bgs_lock); /* * A concurrent scrub might have added us to the list * fs_info->unused_bgs, so use a list_move operation * to add the block group to the deleted_bgs list. */ list_move(&block_group->bg_list, &trans->transaction->deleted_bgs); spin_unlock(&fs_info->unused_bgs_lock); btrfs_get_block_group(block_group); } end_trans: btrfs_end_transaction(trans); next: btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); return; flip_async: btrfs_end_transaction(trans); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_put_block_group(block_group); btrfs_discard_punt_unused_bgs_list(fs_info); } void btrfs_mark_bg_unused(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; spin_lock(&fs_info->unused_bgs_lock); if (list_empty(&bg->bg_list)) { btrfs_get_block_group(bg); trace_btrfs_add_unused_block_group(bg); list_add_tail(&bg->bg_list, &fs_info->unused_bgs); } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) { /* Pull out the block group from the reclaim_bgs list. */ trace_btrfs_add_unused_block_group(bg); list_move_tail(&bg->bg_list, &fs_info->unused_bgs); } spin_unlock(&fs_info->unused_bgs_lock); } /* * We want block groups with a low number of used bytes to be in the beginning * of the list, so they will get reclaimed first. 
*/ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, const struct list_head *b) { const struct btrfs_block_group *bg1, *bg2; bg1 = list_entry(a, struct btrfs_block_group, bg_list); bg2 = list_entry(b, struct btrfs_block_group, bg_list); return bg1->used > bg2->used; } static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) { if (btrfs_is_zoned(fs_info)) return btrfs_zoned_should_reclaim(fs_info); return true; } static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) { const struct btrfs_space_info *space_info = bg->space_info; const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); const u64 new_val = bg->used; const u64 old_val = new_val + bytes_freed; u64 thresh; if (reclaim_thresh == 0) return false; thresh = mult_perc(bg->length, reclaim_thresh); /* * If we were below the threshold before don't reclaim, we are likely a * brand new block group and we don't want to relocate new block groups. */ if (old_val < thresh) return false; if (new_val >= thresh) return false; return true; } void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; if (btrfs_fs_closing(fs_info)) return; if (!btrfs_should_reclaim(fs_info)) return; sb_start_write(fs_info->sb); if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { sb_end_write(fs_info->sb); return; } /* * Long running balances can keep us blocked here for eternity, so * simply skip reclaim if we're unable to get the mutex. */ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { btrfs_exclop_finish(fs_info); sb_end_write(fs_info->sb); return; } spin_lock(&fs_info->unused_bgs_lock); /* * Sort happens under lock because we can't simply splice it and sort. * The block groups might still be in use and reachable via bg_list, * and their presence in the reclaim_bgs list must be preserved. */ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { u64 zone_unusable; int ret = 0; bg = list_first_entry(&fs_info->reclaim_bgs, struct btrfs_block_group, bg_list); list_del_init(&bg->bg_list); space_info = bg->space_info; spin_unlock(&fs_info->unused_bgs_lock); /* Don't race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); spin_lock(&bg->lock); if (bg->reserved || bg->pinned || bg->ro) { /* * We want to bail if we made new allocations or have * outstanding allocations in this block group. We do * the ro check in case balance is currently acting on * this block group. */ spin_unlock(&bg->lock); up_write(&space_info->groups_sem); goto next; } if (bg->used == 0) { /* * It is possible that we trigger relocation on a block * group as its extents are deleted and it first goes * below the threshold, then shortly after goes empty. * * In this case, relocating it does delete it, but has * some overhead in relocation specific metadata, looking * for the non-existent extents and running some extra * transactions, which we can avoid by using one of the * other mechanisms for dealing with empty block groups. 
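* * Namely btrfs_mark_bg_unused() below when async discard is disabled; with DISCARD_ASYNC the discard code is expected to hand fully trimmed unused block groups over to the deletion path instead.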
*/ if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_mark_bg_unused(bg); spin_unlock(&bg->lock); up_write(&space_info->groups_sem); goto next; } /* * The block group might no longer meet the reclaim condition by * the time we get around to reclaiming it, so to avoid * reclaiming overly full block_groups, skip reclaiming them. * * Since the decision making process also depends on the amount * being freed, pass in a fake giant value to skip that extra * check, which is more meaningful when adding to the list in * the first place. */ if (!should_reclaim_block_group(bg, bg->length)) { spin_unlock(&bg->lock); up_write(&space_info->groups_sem); goto next; } spin_unlock(&bg->lock); /* * Get out fast, in case we're read-only or unmounting the * filesystem. It is OK to drop block groups from the list even * for the read-only case. As we did sb_start_write(), * "mount -o remount,ro" won't happen and read-only filesystem * means it is forced read-only due to a fatal error. So, it * never gets back to read-write to let us reclaim again. */ if (btrfs_need_cleaner_sleep(fs_info)) { up_write(&space_info->groups_sem); goto next; } /* * Cache the zone_unusable value before turning the block group * to read only. As soon as the block group is read only, its * zone_unusable value gets moved to the block group's read-only * bytes and isn't available for calculations anymore. */ zone_unusable = bg->zone_unusable; ret = inc_block_group_ro(bg, 0); up_write(&space_info->groups_sem); if (ret < 0) goto next; btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used %llu%% unusable", bg->start, div64_u64(bg->used * 100, bg->length), div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); ret = btrfs_relocate_chunk(fs_info, bg->start); if (ret) { btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", bg->start); } next: if (ret) btrfs_mark_bg_to_reclaim(bg); btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); /* * Reclaiming all the block groups in the list can take really * long. Prioritize cleaning up unused block groups. */ btrfs_delete_unused_bgs(fs_info); /* * If we are interrupted by a balance, we can just bail out. The * cleaner thread will restart it again if necessary.
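* Balance relocates chunks with fs_info->reclaim_bgs_lock held, which is why the trylock below can fail while a balance is running.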
*/ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) goto end; spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); end: btrfs_exclop_finish(fs_info); sb_end_write(fs_info->sb); } void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) { spin_lock(&fs_info->unused_bgs_lock); if (!list_empty(&fs_info->reclaim_bgs)) queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); spin_unlock(&fs_info->unused_bgs_lock); } void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; spin_lock(&fs_info->unused_bgs_lock); if (list_empty(&bg->bg_list)) { btrfs_get_block_group(bg); trace_btrfs_add_reclaim_block_group(bg); list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); } spin_unlock(&fs_info->unused_bgs_lock); } static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct btrfs_path *path) { struct btrfs_chunk_map *map; struct btrfs_block_group_item bg; struct extent_buffer *leaf; int slot; u64 flags; int ret = 0; slot = path->slots[0]; leaf = path->nodes[0]; map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset); if (!map) { btrfs_err(fs_info, "logical %llu len %llu found bg but no related chunk", key->objectid, key->offset); return -ENOENT; } if (map->start != key->objectid || map->chunk_len != key->offset) { btrfs_err(fs_info, "block group %llu len %llu mismatch with chunk %llu len %llu", key->objectid, key->offset, map->start, map->chunk_len); ret = -EUCLEAN; goto out_free_map; } read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot), sizeof(bg)); flags = btrfs_stack_block_group_flags(&bg) & BTRFS_BLOCK_GROUP_TYPE_MASK; if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(fs_info, "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", key->objectid, key->offset, flags, (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type)); ret = -EUCLEAN; } out_free_map: btrfs_free_chunk_map(map); return ret; } static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_key *key) { struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; struct btrfs_key found_key; btrfs_for_each_slot(root, key, &found_key, path, ret) { if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { return read_bg_from_eb(fs_info, &found_key, path); } } return ret; } static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { u64 extra_flags = chunk_to_extended(flags) & BTRFS_EXTENDED_PROFILE_MASK; write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) fs_info->avail_system_alloc_bits |= extra_flags; write_sequnlock(&fs_info->profiles_lock); } /* * Map a physical disk address to a list of logical addresses. * * @fs_info: the filesystem * @chunk_start: logical address of block group * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical * @stripe_len: size of IO stripe for the given block group * * Maps a particular @physical disk address to a list of @logical addresses. * Used primarily to exclude those portions of a block group that contain super * block copies. 
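* * Example: on a RAID0 chunk with two stripes and the 64K BTRFS_STRIPE_LEN, a @physical that lies 64K into the second device stripe (i == 1) gives stripe_nr = 1, remapped to 1 * 2 + 1 = 3, so the reported logical address is @chunk_start + 3 * 64K plus the offset within that stripe.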
*/ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 **logical, int *naddrs, int *stripe_len) { struct btrfs_chunk_map *map; u64 *buf; u64 bytenr; u64 data_stripe_length; u64 io_stripe_size; int i, nr = 0; int ret = 0; map = btrfs_get_chunk_map(fs_info, chunk_start, 1); if (IS_ERR(map)) return -EIO; data_stripe_length = map->stripe_size; io_stripe_size = BTRFS_STRIPE_LEN; chunk_start = map->start; /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); if (!buf) { ret = -ENOMEM; goto out; } for (i = 0; i < map->num_stripes; i++) { bool already_inserted = false; u32 stripe_nr; u32 offset; int j; if (!in_range(physical, map->stripes[i].physical, data_stripe_length)) continue; stripe_nr = (physical - map->stripes[i].physical) >> BTRFS_STRIPE_LEN_SHIFT; offset = (physical - map->stripes[i].physical) & BTRFS_STRIPE_LEN_MASK; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) stripe_nr = div_u64(stripe_nr * map->num_stripes + i, map->sub_stripes); /* * The remaining case would be for RAID56, multiply by * nr_data_stripes(). Alternatively, just use rmap_len below * instead of map->stripe_len */ bytenr = chunk_start + stripe_nr * io_stripe_size + offset; /* Ensure we don't add duplicate addresses */ for (j = 0; j < nr; j++) { if (buf[j] == bytenr) { already_inserted = true; break; } } if (!already_inserted) buf[nr++] = bytenr; } *logical = buf; *naddrs = nr; *stripe_len = io_stripe_size; out: btrfs_free_chunk_map(map); return ret; } static int exclude_super_stripes(struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; const bool zoned = btrfs_is_zoned(fs_info); u64 bytenr; u64 *logical; int stripe_len; int i, nr, ret; if (cache->start < BTRFS_SUPER_INFO_OFFSET) { stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; cache->bytes_super += stripe_len; ret = set_extent_bit(&fs_info->excluded_extents, cache->start, cache->start + stripe_len - 1, EXTENT_UPTODATE, NULL); if (ret) return ret; } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); ret = btrfs_rmap_block(fs_info, cache->start, bytenr, &logical, &nr, &stripe_len); if (ret) return ret; /* Shouldn't have super stripes in sequential zones */ if (zoned && nr) { kfree(logical); btrfs_err(fs_info, "zoned: block group %llu must not contain super block", cache->start); return -EUCLEAN; } while (nr--) { u64 len = min_t(u64, stripe_len, cache->start + cache->length - logical[nr]); cache->bytes_super += len; ret = set_extent_bit(&fs_info->excluded_extents, logical[nr], logical[nr] + len - 1, EXTENT_UPTODATE, NULL); if (ret) { kfree(logical); return ret; } } kfree(logical); } return 0; } static struct btrfs_block_group *btrfs_create_block_group_cache( struct btrfs_fs_info *fs_info, u64 start) { struct btrfs_block_group *cache; cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) return NULL; cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), GFP_NOFS); if (!cache->free_space_ctl) { kfree(cache); return NULL; } cache->start = start; cache->fs_info = fs_info; cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; refcount_set(&cache->refs, 1); spin_lock_init(&cache->lock); init_rwsem(&cache->data_rwsem); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); 
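/* bg_list is reused to link the group on fs_info->unused_bgs, fs_info->reclaim_bgs, trans->new_bgs or the transaction's deleted_bgs list. */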
INIT_LIST_HEAD(&cache->ro_list); INIT_LIST_HEAD(&cache->discard_list); INIT_LIST_HEAD(&cache->dirty_list); INIT_LIST_HEAD(&cache->io_list); INIT_LIST_HEAD(&cache->active_bg_list); btrfs_init_free_space_ctl(cache, cache->free_space_ctl); atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); return cache; } /* * Iterate all chunks and verify that each of them has the corresponding block * group */ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) { u64 start = 0; int ret = 0; while (1) { struct btrfs_chunk_map *map; struct btrfs_block_group *bg; /* * btrfs_find_chunk_map() will return the first chunk map * intersecting the range, so setting @length to 1 is enough to * get the first chunk. */ map = btrfs_find_chunk_map(fs_info, start, 1); if (!map) break; bg = btrfs_lookup_block_group(fs_info, map->start); if (!bg) { btrfs_err(fs_info, "chunk start=%llu len=%llu doesn't have corresponding block group", map->start, map->chunk_len); ret = -EUCLEAN; btrfs_free_chunk_map(map); break; } if (bg->start != map->start || bg->length != map->chunk_len || (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(fs_info, "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", map->start, map->chunk_len, map->type & BTRFS_BLOCK_GROUP_TYPE_MASK, bg->start, bg->length, bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); ret = -EUCLEAN; btrfs_free_chunk_map(map); btrfs_put_block_group(bg); break; } start = map->start + map->chunk_len; btrfs_free_chunk_map(map); btrfs_put_block_group(bg); } return ret; } static int read_one_block_group(struct btrfs_fs_info *info, struct btrfs_block_group_item *bgi, const struct btrfs_key *key, int need_clear) { struct btrfs_block_group *cache; const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); int ret; ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); cache = btrfs_create_block_group_cache(info, key->objectid); if (!cache) return -ENOMEM; cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); cache->commit_used = cache->used; cache->flags = btrfs_stack_block_group_flags(bgi); cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); set_free_space_tree_thresholds(cache); if (need_clear) { /* * When we mount with old space cache, we need to * set BTRFS_DC_CLEAR and set dirty flag. * * a) Setting 'BTRFS_DC_CLEAR' makes sure that we * truncate the old free space cache inode and * setup a new one. * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ if (btrfs_test_opt(info, SPACE_CACHE)) cache->disk_cache_state = BTRFS_DC_CLEAR; } if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { btrfs_err(info, "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", cache->start); ret = -EINVAL; goto error; } ret = btrfs_load_block_group_zone_info(cache, false); if (ret) { btrfs_err(info, "zoned: failed to load zone info of bg %llu", cache->start); goto error; } /* * We need to exclude the super stripes now so that the space info has * super bytes accounted for, otherwise we'll think we have more space * than we actually do. */ ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case. */ btrfs_free_excluded_extents(cache); goto error; } /* * For zoned filesystem, space after the allocation offset is the only * free space for a block group. 
So, we don't need any caching work. * btrfs_calc_zone_unusable() will set the amount of free space and * zone_unusable space. * * For regular filesystem, check for two cases, either we are full, and * therefore don't need to bother with the caching work since we won't * find any space, or we are empty, and we can just add all the space * in and be done with it. This saves us _a_lot_ of time, particularly * in the full case. */ if (btrfs_is_zoned(info)) { btrfs_calc_zone_unusable(cache); /* Should not have any excluded extents. Just in case, though. */ btrfs_free_excluded_extents(cache); } else if (cache->length == cache->used) { cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { cache->cached = BTRFS_CACHE_FINISHED; ret = btrfs_add_new_free_space(cache, cache->start, cache->start + cache->length, NULL); btrfs_free_excluded_extents(cache); if (ret) goto error; } ret = btrfs_add_block_group_cache(info, cache); if (ret) { btrfs_remove_free_space_cache(cache); goto error; } trace_btrfs_add_block_group(info, cache, 0); btrfs_add_bg_to_space_info(info, cache); set_avail_alloc_bits(info, cache->flags); if (btrfs_chunk_writeable(info, cache->start)) { if (cache->used == 0) { ASSERT(list_empty(&cache->bg_list)); if (btrfs_test_opt(info, DISCARD_ASYNC)) btrfs_discard_queue_work(&info->discard_ctl, cache); else btrfs_mark_bg_unused(cache); } } else { inc_block_group_ro(cache, 1); } return 0; error: btrfs_put_block_group(cache); return ret; } static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) { struct rb_node *node; int ret = 0; for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { struct btrfs_chunk_map *map; struct btrfs_block_group *bg; map = rb_entry(node, struct btrfs_chunk_map, rb_node); bg = btrfs_create_block_group_cache(fs_info, map->start); if (!bg) { ret = -ENOMEM; break; } /* Fill dummy cache as FULL */ bg->length = map->chunk_len; bg->flags = map->type; bg->cached = BTRFS_CACHE_FINISHED; bg->used = map->chunk_len; bg->flags = map->type; ret = btrfs_add_block_group_cache(fs_info, bg); /* * We may have some valid block group cache added already, in * that case we skip to the next one. */ if (ret == -EEXIST) { ret = 0; btrfs_put_block_group(bg); continue; } if (ret) { btrfs_remove_free_space_cache(bg); btrfs_put_block_group(bg); break; } btrfs_add_bg_to_space_info(fs_info, bg); set_avail_alloc_bits(fs_info, bg->flags); } if (!ret) btrfs_init_global_block_rsv(fs_info); return ret; } int btrfs_read_block_groups(struct btrfs_fs_info *info) { struct btrfs_root *root = btrfs_block_group_root(info); struct btrfs_path *path; int ret; struct btrfs_block_group *cache; struct btrfs_space_info *space_info; struct btrfs_key key; int need_clear = 0; u64 cache_gen; /* * Either no extent root (with ibadroots rescue option) or we have * unsupported RO options. The fs can never be mounted read-write, so no * need to waste time searching block group items. * * This also allows new extent tree related changes to be RO compat, * no need for a full incompat flag. 
*/ if (!root || (btrfs_super_compat_ro_flags(info->super_copy) & ~BTRFS_FEATURE_COMPAT_RO_SUPP)) return fill_dummy_bgs(info); key.objectid = 0; key.offset = 0; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; path = btrfs_alloc_path(); if (!path) return -ENOMEM; cache_gen = btrfs_super_cache_generation(info->super_copy); if (btrfs_test_opt(info, SPACE_CACHE) && btrfs_super_generation(info->super_copy) != cache_gen) need_clear = 1; if (btrfs_test_opt(info, CLEAR_CACHE)) need_clear = 1; while (1) { struct btrfs_block_group_item bgi; struct extent_buffer *leaf; int slot; ret = find_first_block_group(info, path, &key); if (ret > 0) break; if (ret != 0) goto error; leaf = path->nodes[0]; slot = path->slots[0]; read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), sizeof(bgi)); btrfs_item_key_to_cpu(leaf, &key, slot); btrfs_release_path(path); ret = read_one_block_group(info, &bgi, &key, need_clear); if (ret < 0) goto error; key.objectid += key.offset; key.offset = 0; } btrfs_release_path(path); list_for_each_entry(space_info, &info->space_info, list) { int i; for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { if (list_empty(&space_info->block_groups[i])) continue; cache = list_first_entry(&space_info->block_groups[i], struct btrfs_block_group, list); btrfs_sysfs_add_block_group_type(cache); } if (!(btrfs_get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID56_MASK | BTRFS_BLOCK_GROUP_DUP))) continue; /* * Avoid allocating from un-mirrored block group if there are * mirrored block groups. */ list_for_each_entry(cache, &space_info->block_groups[BTRFS_RAID_RAID0], list) inc_block_group_ro(cache, 1); list_for_each_entry(cache, &space_info->block_groups[BTRFS_RAID_SINGLE], list) inc_block_group_ro(cache, 1); } btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); error: btrfs_free_path(path); /* * We've hit some error while reading the extent tree, and have * rescue=ibadroots mount option. * Try to fill the tree using dummy block groups so that the user can * continue to mount and grab their data. */ if (ret && btrfs_test_opt(info, IGNOREBADROOTS)) ret = fill_dummy_bgs(info); return ret; } /* * This function, insert_block_group_item(), belongs to the phase 2 of chunk * allocation. * * See the comment at btrfs_chunk_alloc() for details about the chunk allocation * phases. 
*/ static int insert_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_item bgi; struct btrfs_root *root = btrfs_block_group_root(fs_info); struct btrfs_key key; u64 old_commit_used; int ret; spin_lock(&block_group->lock); btrfs_set_stack_block_group_used(&bgi, block_group->used); btrfs_set_stack_block_group_chunk_objectid(&bgi, block_group->global_root_id); btrfs_set_stack_block_group_flags(&bgi, block_group->flags); old_commit_used = block_group->commit_used; block_group->commit_used = block_group->used; key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = block_group->length; spin_unlock(&block_group->lock); ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); if (ret < 0) { spin_lock(&block_group->lock); block_group->commit_used = old_commit_used; spin_unlock(&block_group->lock); } return ret; } static int insert_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 chunk_offset, u64 start, u64 num_bytes) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; struct btrfs_path *path; struct btrfs_dev_extent *extent; struct extent_buffer *leaf; struct btrfs_key key; int ret; WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = device->devid; key.type = BTRFS_DEV_EXTENT_KEY; key.offset = start; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); if (ret) goto out; leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID); btrfs_set_dev_extent_chunk_objectid(leaf, extent, BTRFS_FIRST_CHUNK_TREE_OBJECTID); btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); btrfs_set_dev_extent_length(leaf, extent, num_bytes); btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; } /* * This function belongs to phase 2. * * See the comment at btrfs_chunk_alloc() for details about the chunk allocation * phases. */ static int insert_dev_extents(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 chunk_size) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_device *device; struct btrfs_chunk_map *map; u64 dev_offset; int i; int ret = 0; map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); if (IS_ERR(map)) return PTR_ERR(map); /* * Take the device list mutex to prevent races with the final phase of * a device replace operation that replaces the device object associated * with the map's stripes, because the device object's id can change * at any time during that final phase of the device replace operation * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, * resulting in persisting a device extent item with such ID. 
*/ mutex_lock(&fs_info->fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { device = map->stripes[i].dev; dev_offset = map->stripes[i].physical; ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, map->stripe_size); if (ret) break; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_free_chunk_map(map); return ret; } /* * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of * chunk allocation. * * See the comment at btrfs_chunk_alloc() for details about the chunk allocation * phases. */ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *block_group; int ret = 0; while (!list_empty(&trans->new_bgs)) { int index; block_group = list_first_entry(&trans->new_bgs, struct btrfs_block_group, bg_list); if (ret) goto next; index = btrfs_bg_flags_to_raid_index(block_group->flags); ret = insert_block_group_item(trans, block_group); if (ret) btrfs_abort_transaction(trans, ret); if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &block_group->runtime_flags)) { mutex_lock(&fs_info->chunk_mutex); ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group); mutex_unlock(&fs_info->chunk_mutex); if (ret) btrfs_abort_transaction(trans, ret); } ret = insert_dev_extents(trans, block_group->start, block_group->length); if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); /* * If we restriped during balance, we may have added a new raid * type, so now add the sysfs entries when it is safe to do so. * We don't have to worry about locking here as it's handled in * btrfs_sysfs_add_block_group_type. */ if (block_group->space_info->block_group_kobjs[index] == NULL) btrfs_sysfs_add_block_group_type(block_group); /* Already aborted the transaction if it failed. */ next: btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); } btrfs_trans_release_chunk_metadata(trans); } /* * For extent tree v2 we use the block_group_item->chunk_offset to point at our * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. */ static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) { u64 div = SZ_1G; u64 index; if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) return BTRFS_FIRST_CHUNK_TREE_OBJECTID; /* If we have a smaller fs index based on 128MiB. */ if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) div = SZ_128M; offset = div64_u64(offset, div); div64_u64_rem(offset, fs_info->nr_global_roots, &index); return index; } struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 type, u64 chunk_offset, u64 size) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache; int ret; btrfs_set_log_full_commit(trans); cache = btrfs_create_block_group_cache(fs_info, chunk_offset); if (!cache) return ERR_PTR(-ENOMEM); /* * Mark it as new before adding it to the rbtree of block groups or any * list, so that no other task finds it and calls btrfs_mark_bg_unused() * before the new flag is set. 
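* Once the group is linked on the transaction's new_bgs list, btrfs_mark_bg_unused() sees this flag and leaves it there instead of moving it to the unused_bgs list.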
*/ set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags); cache->length = size; set_free_space_tree_thresholds(cache); cache->flags = type; cache->cached = BTRFS_CACHE_FINISHED; cache->global_root_id = calculate_global_root_id(fs_info, cache->start); if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); ret = btrfs_load_block_group_zone_info(cache, true); if (ret) { btrfs_put_block_group(cache); return ERR_PTR(ret); } ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ btrfs_free_excluded_extents(cache); btrfs_put_block_group(cache); return ERR_PTR(ret); } ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); btrfs_free_excluded_extents(cache); if (ret) { btrfs_put_block_group(cache); return ERR_PTR(ret); } /* * Ensure the corresponding space_info object is created and * assigned to our block group. We want our bg to be added to the rbtree * with its ->space_info set. */ cache->space_info = btrfs_find_space_info(fs_info, cache->flags); ASSERT(cache->space_info); ret = btrfs_add_block_group_cache(fs_info, cache); if (ret) { btrfs_remove_free_space_cache(cache); btrfs_put_block_group(cache); return ERR_PTR(ret); } /* * Now that our block group has its ->space_info set and is inserted in * the rbtree, update the space info's counters. */ trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_add_bg_to_space_info(fs_info, cache); btrfs_update_global_block_rsv(fs_info); #ifdef CONFIG_BTRFS_DEBUG if (btrfs_should_fragment_free_space(cache)) { cache->space_info->bytes_used += size >> 1; fragment_free_space(cache); } #endif list_add_tail(&cache->bg_list, &trans->new_bgs); btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info); set_avail_alloc_bits(fs_info, type); return cache; } /* * Mark one block group RO, can be called several times for the same block * group. * * @cache: the destination block group * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to * ensure we still have some free space after marking this * block group RO. */ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, bool do_chunk_alloc) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_block_group_root(fs_info); u64 alloc_flags; int ret; bool dirty_bg_running; /* * This can only happen when we are doing read-only scrub on read-only * mount. * In that case we should not start a new transaction on read-only fs. * Thus here we skip all chunk allocations. */ if (sb_rdonly(fs_info->sb)) { mutex_lock(&fs_info->ro_block_group_mutex); ret = inc_block_group_ro(cache, 0); mutex_unlock(&fs_info->ro_block_group_mutex); return ret; } do { trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); dirty_bg_running = false; /* * We're not allowed to set block groups readonly after the dirty * block group cache has started writing. If it already started, * back off and let this transaction commit. */ mutex_lock(&fs_info->ro_block_group_mutex); if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { u64 transid = trans->transid; mutex_unlock(&fs_info->ro_block_group_mutex); btrfs_end_transaction(trans); ret = btrfs_wait_for_commit(fs_info, transid); if (ret) return ret; dirty_bg_running = true; } } while (dirty_bg_running); if (do_chunk_alloc) { /* * If we are changing raid levels, try to allocate a * corresponding block group with the new raid level. 
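* For example, while a balance is converting block groups to a new RAID profile, btrfs_get_alloc_profile() typically already reports the conversion target, so it differs from cache->flags and we pre-allocate a chunk with the new profile here.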
*/ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); if (alloc_flags != cache->flags) { ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); /* * ENOSPC is allowed here, we may have enough space * already allocated at the new raid level to carry on */ if (ret == -ENOSPC) ret = 0; if (ret < 0) goto out; } } ret = inc_block_group_ro(cache, 0); if (!ret) goto out; if (ret == -ETXTBSY) goto unlock_out; /* * Skip chunk allocation if the bg is SYSTEM, this is to avoid system * chunk allocation storm to exhaust the system chunk array. Otherwise * we still want to try our best to mark the block group read-only. */ if (!do_chunk_alloc && ret == -ENOSPC && (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM)) goto unlock_out; alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; /* * We have allocated a new chunk. We also need to activate that chunk to * grant metadata tickets for zoned filesystem. */ ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); if (ret < 0) goto out; ret = inc_block_group_ro(cache, 0); if (ret == -ETXTBSY) goto unlock_out; out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); mutex_lock(&fs_info->chunk_mutex); check_system_chunk(trans, alloc_flags); mutex_unlock(&fs_info->chunk_mutex); } unlock_out: mutex_unlock(&fs_info->ro_block_group_mutex); btrfs_end_transaction(trans); return ret; } void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; BUG_ON(!cache->ro); spin_lock(&sinfo->lock); spin_lock(&cache->lock); if (!--cache->ro) { if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes back */ cache->zone_unusable = (cache->alloc_offset - cache->used) + (cache->length - cache->zone_capacity); sinfo->bytes_zone_unusable += cache->zone_unusable; sinfo->bytes_readonly -= cache->zone_unusable; } num_bytes = cache->length - cache->reserved - cache->pinned - cache->bytes_super - cache->zone_unusable - cache->used; sinfo->bytes_readonly -= num_bytes; list_del_init(&cache->ro_list); } spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); } static int update_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_root *root = btrfs_block_group_root(fs_info); unsigned long bi; struct extent_buffer *leaf; struct btrfs_block_group_item bgi; struct btrfs_key key; u64 old_commit_used; u64 used; /* * Block group items update can be triggered out of commit transaction * critical section, thus we need a consistent view of used bytes. * We cannot use cache->used directly outside of the spin lock, as it * may be changed. */ spin_lock(&cache->lock); old_commit_used = cache->commit_used; used = cache->used; /* No change in used bytes, can safely skip it. 
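* cache->commit_used mirrors the used bytes value last written to the on-disk block group item (set by read_one_block_group(), insert_block_group_item() and this function).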
*/ if (cache->commit_used == used) { spin_unlock(&cache->lock); return 0; } cache->commit_used = used; spin_unlock(&cache->lock); key.objectid = cache->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = cache->length; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret) { if (ret > 0) ret = -ENOENT; goto fail; } leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); btrfs_set_stack_block_group_used(&bgi, used); btrfs_set_stack_block_group_chunk_objectid(&bgi, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); btrfs_mark_buffer_dirty(trans, leaf); fail: btrfs_release_path(path); /* * We didn't update the block group item, need to revert commit_used * unless the block group item didn't exist yet - this is to prevent a * race with a concurrent insertion of the block group item, with * insert_block_group_item(), that happened just after we attempted to * update. In that case we would reset commit_used to 0 just after the * insertion set it to a value greater than 0 - if the block group later * becomes with 0 used bytes, we would incorrectly skip its update. */ if (ret < 0 && ret != -ENOENT) { spin_lock(&cache->lock); cache->commit_used = old_commit_used; spin_unlock(&cache->lock); } return ret; } static int cache_save_setup(struct btrfs_block_group *block_group, struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct inode *inode = NULL; struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; u64 cache_size = 0; int retries = 0; int ret = 0; if (!btrfs_test_opt(fs_info, SPACE_CACHE)) return 0; /* * If this block group is smaller than 100 megs don't bother caching the * block group. */ if (block_group->length < (100 * SZ_1M)) { spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); return 0; } if (TRANS_ABORTED(trans)) return 0; again: inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ret = PTR_ERR(inode); btrfs_release_path(path); goto out; } if (IS_ERR(inode)) { BUG_ON(retries); retries++; if (block_group->ro) goto out_free; ret = create_free_space_inode(trans, block_group, path); if (ret) goto out_free; goto again; } /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next * time. */ BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { /* * So theoretically we could recover from this, simply set the * super cache generation to 0 so we know to invalidate the * cache, but then we'd have to keep track of the block groups * that fail this way so we know we _have_ to reset this cache * before the next commit or risk reading stale cache. So to * limit our exposure to horrible edge cases lets just abort the * transaction, this only happens in really bad situations * anyway. 
*/ btrfs_abort_transaction(trans, ret); goto out_put; } WARN_ON(ret); /* We've already setup this transaction, go ahead and exit */ if (block_group->cache_generation == trans->transid && i_size_read(inode)) { dcs = BTRFS_DC_SETUP; goto out_put; } if (i_size_read(inode) > 0) { ret = btrfs_check_trunc_cache_free_space(fs_info, &fs_info->global_block_rsv); if (ret) goto out_put; ret = btrfs_truncate_free_space_cache(trans, NULL, inode); if (ret) goto out_put; } spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || !btrfs_test_opt(fs_info, SPACE_CACHE)) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, * b) we're with nospace_cache mount option, * c) we're with v2 space_cache (FREE_SPACE_TREE). */ dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); goto out_put; } spin_unlock(&block_group->lock); /* * We hit an ENOSPC when setting up the cache in this transaction, just * skip doing the setup, we've already cleared the cache so we're safe. */ if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { ret = -ENOSPC; goto out_put; } /* * Try to preallocate enough space based on how big the block group is. * Keep in mind this has to include any pinned space which could end up * taking up quite a bit since it's not folded into the other space * cache. */ cache_size = div_u64(block_group->length, SZ_256M); if (!cache_size) cache_size = 1; cache_size *= 16; cache_size *= fs_info->sectorsize; ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, cache_size, false); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size, cache_size, cache_size, &alloc_hint); /* * Our cache requires contiguous chunks so that we don't modify a bunch * of metadata or split extents when writing the cache out, which means * we can enospc if we are heavily fragmented in addition to just normal * out of space conditions. So if we hit this just skip setting up any * other block groups for this transaction, maybe we'll unpin enough * space the next time around. */ if (!ret) dcs = BTRFS_DC_SETUP; else if (ret == -ENOSPC) set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); out_put: iput(inode); out_free: btrfs_release_path(path); out: spin_lock(&block_group->lock); if (!ret && dcs == BTRFS_DC_SETUP) block_group->cache_generation = trans->transid; block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); extent_changeset_free(data_reserved); return ret; } int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache, *tmp; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_path *path; if (list_empty(&cur_trans->dirty_bgs) || !btrfs_test_opt(fs_info, SPACE_CACHE)) return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* Could add new block groups, use _safe just in case */ list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, dirty_list) { if (cache->disk_cache_state == BTRFS_DC_CLEAR) cache_save_setup(cache, trans, path); } btrfs_free_path(path); return 0; } /* * Transaction commit does final block group cache writeback during a critical * section where nothing is allowed to change the FS. This is required in * order for the cache to actually match the block group, but can introduce a * lot of latency into the commit. * * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO. 
* There's a chance we'll have to redo some of it if the block group changes * again during the commit, but it greatly reduces the commit latency by * getting rid of the easy block groups while we're still allowing others to * join the commit. */ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; struct btrfs_path *path = NULL; LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; int loops = 0; spin_lock(&cur_trans->dirty_bgs_lock); if (list_empty(&cur_trans->dirty_bgs)) { spin_unlock(&cur_trans->dirty_bgs_lock); return 0; } list_splice_init(&cur_trans->dirty_bgs, &dirty); spin_unlock(&cur_trans->dirty_bgs_lock); again: /* Make sure all the block groups on our dirty list actually exist */ btrfs_create_pending_block_groups(trans); if (!path) { path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } } /* * cache_write_mutex is here only to save us from balance or automatic * removal of empty block groups deleting this block group while we are * writing out the cache */ mutex_lock(&trans->transaction->cache_write_mutex); while (!list_empty(&dirty)) { bool drop_reserve = true; cache = list_first_entry(&dirty, struct btrfs_block_group, dirty_list); /* * This can happen if something re-dirties a block group that * is already under IO. Just wait for it to finish and then do * it all again */ if (!list_empty(&cache->io_list)) { list_del_init(&cache->io_list); btrfs_wait_cache_io(trans, cache, path); btrfs_put_block_group(cache); } /* * btrfs_wait_cache_io uses the cache->dirty_list to decide if * it should update the cache_state. Don't delete until after * we wait. * * Since we're not running in the commit critical section * we need the dirty_bgs_lock to protect from update_block_group */ spin_lock(&cur_trans->dirty_bgs_lock); list_del_init(&cache->dirty_list); spin_unlock(&cur_trans->dirty_bgs_lock); should_put = 1; cache_save_setup(cache, trans, path); if (cache->disk_cache_state == BTRFS_DC_SETUP) { cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { should_put = 0; /* * The cache_write_mutex is protecting the * io_list, also refer to the definition of * btrfs_transaction::io_bgs for more details */ list_add_tail(&cache->io_list, io); } else { /* * If we failed to write the cache, the * generation will be bad and life goes on */ ret = 0; } } if (!ret) { ret = update_block_group_item(trans, path, cache); /* * Our block group might still be attached to the list * of new block groups in the transaction handle of some * other task (struct btrfs_trans_handle->new_bgs). This * means its block group item isn't yet in the extent * tree. If this happens ignore the error, as we will * try again later in the critical section of the * transaction commit. */ if (ret == -ENOENT) { ret = 0; spin_lock(&cur_trans->dirty_bgs_lock); if (list_empty(&cache->dirty_list)) { list_add_tail(&cache->dirty_list, &cur_trans->dirty_bgs); btrfs_get_block_group(cache); drop_reserve = false; } spin_unlock(&cur_trans->dirty_bgs_lock); } else if (ret) { btrfs_abort_transaction(trans, ret); } } /* If it's not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); if (drop_reserve) btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); /* * Avoid blocking other tasks for too long. 
It might even save * us from writing caches for block groups that are going to be * removed. */ mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) goto out; mutex_lock(&trans->transaction->cache_write_mutex); } mutex_unlock(&trans->transaction->cache_write_mutex); /* * Go through delayed refs for all the stuff we've just kicked off * and then loop back (just once) */ if (!ret) ret = btrfs_run_delayed_refs(trans, 0); if (!ret && loops == 0) { loops++; spin_lock(&cur_trans->dirty_bgs_lock); list_splice_init(&cur_trans->dirty_bgs, &dirty); /* * dirty_bgs_lock protects us from concurrent block group * deletes too (not just cache_write_mutex). */ if (!list_empty(&dirty)) { spin_unlock(&cur_trans->dirty_bgs_lock); goto again; } spin_unlock(&cur_trans->dirty_bgs_lock); } out: if (ret < 0) { spin_lock(&cur_trans->dirty_bgs_lock); list_splice_init(&dirty, &cur_trans->dirty_bgs); spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_cleanup_dirty_bgs(cur_trans, fs_info); } btrfs_free_path(path); return ret; } int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; struct btrfs_path *path; struct list_head *io = &cur_trans->io_bgs; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * Even though we are in the critical section of the transaction commit, * we can still have concurrent tasks adding elements to this * transaction's list of dirty block groups. These tasks correspond to * endio free space workers started when writeback finishes for a * space cache, which run inode.c:btrfs_finish_ordered_io(), and can * allocate new block groups as a result of COWing nodes of the root * tree when updating the free space inode. The writeback for the space * caches is triggered by an earlier call to * btrfs_start_dirty_block_groups() and iterations of the following * loop. * Also we want to do the cache_save_setup first and then run the * delayed refs to make sure we have the best chance at doing this all * in one shot. */ spin_lock(&cur_trans->dirty_bgs_lock); while (!list_empty(&cur_trans->dirty_bgs)) { cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group, dirty_list); /* * This can happen if cache_save_setup re-dirties a block group * that is already under IO. 
Just wait for it to finish and * then do it all again */ if (!list_empty(&cache->io_list)) { spin_unlock(&cur_trans->dirty_bgs_lock); list_del_init(&cache->io_list); btrfs_wait_cache_io(trans, cache, path); btrfs_put_block_group(cache); spin_lock(&cur_trans->dirty_bgs_lock); } /* * Don't remove from the dirty list until after we've waited on * any pending IO */ list_del_init(&cache->dirty_list); spin_unlock(&cur_trans->dirty_bgs_lock); should_put = 1; cache_save_setup(cache, trans, path); if (!ret) ret = btrfs_run_delayed_refs(trans, U64_MAX); if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { should_put = 0; list_add_tail(&cache->io_list, io); } else { /* * If we failed to write the cache, the * generation will be bad and life goes on */ ret = 0; } } if (!ret) { ret = update_block_group_item(trans, path, cache); /* * One of the free space endio workers might have * created a new block group while updating a free space * cache's inode (at inode.c:btrfs_finish_ordered_io()) * and hasn't released its transaction handle yet, in * which case the new block group is still attached to * its transaction handle and its creation has not * finished yet (no block group item in the extent tree * yet, etc). If this is the case, wait for all free * space endio workers to finish and retry. This is a * very rare case so no need for a more efficient and * complex approach. */ if (ret == -ENOENT) { wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); ret = update_block_group_item(trans, path, cache); } if (ret) btrfs_abort_transaction(trans, ret); } /* If its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); /* * Refer to the definition of io_bgs member for details why it's safe * to use it without any locking */ while (!list_empty(io)) { cache = list_first_entry(io, struct btrfs_block_group, io_list); list_del_init(&cache->io_list); btrfs_wait_cache_io(trans, cache, path); btrfs_put_block_group(cache); } btrfs_free_path(path); return ret; } int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_space_info *space_info; struct btrfs_block_group *cache; u64 old_val; bool reclaim = false; bool bg_already_dirty = true; int factor; /* Block accounting for super block */ spin_lock(&info->delalloc_root_lock); old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_root_lock); cache = btrfs_lookup_block_group(info, bytenr); if (!cache) return -ENOENT; /* An extent can not span multiple block groups. */ ASSERT(bytenr + num_bytes <= cache->start + cache->length); space_info = cache->space_info; factor = btrfs_bg_type_to_factor(cache->flags); /* * If this block group has free space cache written out, we need to make * sure to load it if we are removing space. This is because we need * the unpinning stage to actually add the space back to the block group, * otherwise we will leak space. 
*/ if (!alloc && !btrfs_block_group_done(cache)) btrfs_cache_block_group(cache, true); spin_lock(&space_info->lock); spin_lock(&cache->lock); if (btrfs_test_opt(info, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; old_val = cache->used; if (alloc) { old_val += num_bytes; cache->used = old_val; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; space_info->bytes_used += num_bytes; space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); spin_unlock(&space_info->lock); } else { old_val -= num_bytes; cache->used = old_val; cache->pinned += num_bytes; btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes); space_info->bytes_used -= num_bytes; space_info->disk_used -= num_bytes * factor; reclaim = should_reclaim_block_group(cache, num_bytes); spin_unlock(&cache->lock); spin_unlock(&space_info->lock); set_extent_bit(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); } spin_lock(&trans->transaction->dirty_bgs_lock); if (list_empty(&cache->dirty_list)) { list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); bg_already_dirty = false; btrfs_get_block_group(cache); } spin_unlock(&trans->transaction->dirty_bgs_lock); /* * No longer have used bytes in this block group, queue it for deletion. * We do this after adding the block group to the dirty list to avoid * races between cleaner kthread and space cache writeout. */ if (!alloc && old_val == 0) { if (!btrfs_test_opt(info, DISCARD_ASYNC)) btrfs_mark_bg_unused(cache); } else if (!alloc && reclaim) { btrfs_mark_bg_to_reclaim(cache); } btrfs_put_block_group(cache); /* Modified block groups are accounted for in the delayed_refs_rsv. */ if (!bg_already_dirty) btrfs_inc_delayed_refs_rsv_bg_updates(info); return 0; } /* * Update the block_group and space info counters. * * @cache: The cache we are manipulating * @ram_bytes: The number of bytes of file content, and will be same to * @num_bytes except for the compress path. * @num_bytes: The number of bytes in question * @delalloc: The blocks are allocated for the delalloc write * * This is called by the allocator when it reserves space. If this is a * reservation and the block group has become read only we cannot make the * reservation and return -EAGAIN, otherwise this function always succeeds. */ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, u64 ram_bytes, u64 num_bytes, int delalloc, bool force_wrong_size_class) { struct btrfs_space_info *space_info = cache->space_info; enum btrfs_block_group_size_class size_class; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); if (cache->ro) { ret = -EAGAIN; goto out; } if (btrfs_block_group_should_use_size_class(cache)) { size_class = btrfs_calc_block_group_size_class(num_bytes); ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); if (ret) goto out; } cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; trace_btrfs_space_reservation(cache->fs_info, "space_info", space_info->flags, num_bytes, 1); btrfs_space_info_update_bytes_may_use(cache->fs_info, space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; /* * Compression can use less space than we reserved, so wake tickets if * that happens. */ if (num_bytes < ram_bytes) btrfs_try_granting_tickets(cache->fs_info, space_info); out: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; } /* * Update the block_group and space info counters. 
* * @cache: The cache we are manipulating * @num_bytes: The number of bytes in question * @delalloc: The blocks are allocated for the delalloc write * * This is called by somebody who is freeing space that was never actually used * on disk. For example if you reserve some space for a new leaf in transaction * A and before transaction A commits you free that leaf, you call this with * reserve set to 0 in order to clear the reservation. */ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; spin_lock(&space_info->lock); spin_lock(&cache->lock); if (cache->ro) space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; space_info->max_extent_size = 0; if (delalloc) cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); btrfs_try_granting_tickets(cache->fs_info, space_info); spin_unlock(&space_info->lock); } static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; struct btrfs_space_info *found; list_for_each_entry(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_METADATA) found->force_alloc = CHUNK_ALLOC_FORCE; } } static int should_alloc_chunk(struct btrfs_fs_info *fs_info, struct btrfs_space_info *sinfo, int force) { u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; if (force == CHUNK_ALLOC_FORCE) return 1; /* * in limited mode, we want to have some free space up to * about 1% of the FS size. */ if (force == CHUNK_ALLOC_LIMITED) { thresh = btrfs_super_total_bytes(fs_info->super_copy); thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1)); if (sinfo->total_bytes - bytes_used < thresh) return 1; } if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80)) return 0; return 1; } int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) { u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); } static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) { struct btrfs_block_group *bg; int ret; /* * Check if we have enough space in the system space info because we * will need to update device items in the chunk btree and insert a new * chunk item in the chunk btree as well. This will allocate a new * system block group if needed. */ check_system_chunk(trans, flags); bg = btrfs_create_chunk(trans, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); goto out; } ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); /* * Normally we are not expected to fail with -ENOSPC here, since we have * previously reserved space in the system space_info and allocated one * new system chunk if necessary. However there are three exceptions: * * 1) We may have enough free space in the system space_info but all the * existing system block groups have a profile which can not be used * for extent allocation. * * This happens when mounting in degraded mode. For example we have a * RAID1 filesystem with 2 devices, lose one device and mount the fs * using the other device in degraded mode. If we then allocate a chunk, * we may have enough free space in the existing system space_info, but * none of the block groups can be used for extent allocation since they * have a RAID1 profile, and because we are in degraded mode with a * single device, we are forced to allocate a new system chunk with a * SINGLE profile. 
Making check_system_chunk() iterate over all system * block groups and check if they have a usable profile and enough space * can be slow on very large filesystems, so we tolerate the -ENOSPC and * try again after forcing allocation of a new system chunk. Like this * we avoid paying the cost of that search in normal circumstances, when * we were not mounted in degraded mode; * * 2) We had enough free space info the system space_info, and one suitable * block group to allocate from when we called check_system_chunk() * above. However right after we called it, the only system block group * with enough free space got turned into RO mode by a running scrub, * and in this case we have to allocate a new one and retry. We only * need do this allocate and retry once, since we have a transaction * handle and scrub uses the commit root to search for block groups; * * 3) We had one system block group with enough free space when we called * check_system_chunk(), but after that, right before we tried to * allocate the last extent buffer we needed, a discard operation came * in and it temporarily removed the last free space entry from the * block group (discard removes a free space entry, discards it, and * then adds back the entry to the block group cache). */ if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); struct btrfs_block_group *sys_bg; sys_bg = btrfs_create_chunk(trans, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } else if (ret) { btrfs_abort_transaction(trans, ret); goto out; } out: btrfs_trans_release_chunk_metadata(trans); if (ret) return ERR_PTR(ret); btrfs_get_block_group(bg); return bg; } /* * Chunk allocation is done in 2 phases: * * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for * the chunk, the chunk mapping, create its block group and add the items * that belong in the chunk btree to it - more specifically, we need to * update device items in the chunk btree and add a new chunk item to it. * * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block * group item to the extent btree and the device extent items to the devices * btree. * * This is done to prevent deadlocks. For example when COWing a node from the * extent btree we are holding a write lock on the node's parent and if we * trigger chunk allocation and attempted to insert the new block group item * in the extent btree right way, we could deadlock because the path for the * insertion can include that parent node. At first glance it seems impossible * to trigger chunk allocation after starting a transaction since tasks should * reserve enough transaction units (metadata space), however while that is true * most of the time, chunk allocation may still be triggered for several reasons: * * 1) When reserving metadata, we check if there is enough free space in the * metadata space_info and therefore don't trigger allocation of a new chunk. 
* However later when the task actually tries to COW an extent buffer from * the extent btree or from the device btree for example, it is forced to * allocate a new block group (chunk) because the only one that had enough * free space was just turned to RO mode by a running scrub for example (or * device replace, block group reclaim thread, etc), so we can not use it * for allocating an extent and end up being forced to allocate a new one; * * 2) Because we only check that the metadata space_info has enough free bytes, * we end up not allocating a new metadata chunk in that case. However if * the filesystem was mounted in degraded mode, none of the existing block * groups might be suitable for extent allocation due to their incompatible * profile (for e.g. mounting a 2 devices filesystem, where all block groups * use a RAID1 profile, in degraded mode using a single device). In this case * when the task attempts to COW some extent buffer of the extent btree for * example, it will trigger allocation of a new metadata block group with a * suitable profile (SINGLE profile in the example of the degraded mount of * the RAID1 filesystem); * * 3) The task has reserved enough transaction units / metadata space, but when * it attempts to COW an extent buffer from the extent or device btree for * example, it does not find any free extent in any metadata block group, * therefore forced to try to allocate a new metadata block group. * This is because some other task allocated all available extents in the * meanwhile - this typically happens with tasks that don't reserve space * properly, either intentionally or as a bug. One example where this is * done intentionally is fsync, as it does not reserve any transaction units * and ends up allocating a variable number of metadata extents for log * tree extent buffers; * * 4) The task has reserved enough transaction units / metadata space, but right * before it tries to allocate the last extent buffer it needs, a discard * operation comes in and, temporarily, removes the last free space entry from * the only metadata block group that had free space (discard starts by * removing a free space entry from a block group, then does the discard * operation and, once it's done, it adds back the free space entry to the * block group). * * We also need this 2 phases setup when adding a device to a filesystem with * a seed device - we must create new metadata and system chunks without adding * any of the block group items to the chunk, extent and device btrees. If we * did not do it this way, we would get ENOSPC when attempting to update those * btrees, since all the chunks from the seed device are read-only. * * Phase 1 does the updates and insertions to the chunk btree because if we had * it done in phase 2 and have a thundering herd of tasks allocating chunks in * parallel, we risk having too many system chunks allocated by many tasks if * many tasks reach phase 1 without the previous ones completing phase 2. In the * extreme case this leads to exhaustion of the system chunk array in the * superblock. This is easier to trigger if using a btree node/leaf size of 64K * and with RAID filesystems (so we have more device items in the chunk btree). * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of * the system chunk array due to concurrent allocations") provides more details. * * Allocation of system chunks does not happen through this function. 
A task that * needs to update the chunk btree (the only btree that uses system chunks), must * preallocate chunk space by calling either check_system_chunk() or * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or * metadata chunk or when removing a chunk, while the later is used before doing * a modification to the chunk btree - use cases for the later are adding, * removing and resizing a device as well as relocation of a system chunk. * See the comment below for more details. * * The reservation of system space, done through check_system_chunk(), as well * as all the updates and insertions into the chunk btree must be done while * holding fs_info->chunk_mutex. This is important to guarantee that while COWing * an extent buffer from the chunks btree we never trigger allocation of a new * system chunk, which would result in a deadlock (trying to lock twice an * extent buffer of the chunk btree, first time before triggering the chunk * allocation and the second time during chunk allocation while attempting to * update the chunks btree). The system chunk array is also updated while holding * that mutex. The same logic applies to removing chunks - we must reserve system * space, update the chunk btree and the system chunk array in the superblock * while holding fs_info->chunk_mutex. * * This function, btrfs_chunk_alloc(), belongs to phase 1. * * If @force is CHUNK_ALLOC_FORCE: * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. * If @force is NOT CHUNK_ALLOC_FORCE: * - return 0 if it doesn't need to allocate a new chunk, * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. */ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, enum btrfs_chunk_alloc_enum force) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *space_info; struct btrfs_block_group *ret_bg; bool wait_for_alloc = false; bool should_alloc = false; bool from_extent_allocation = false; int ret = 0; if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) { from_extent_allocation = true; force = CHUNK_ALLOC_FORCE; } /* Don't re-enter if we're already allocating a chunk */ if (trans->allocating_chunk) return -ENOSPC; /* * Allocation of system chunks can not happen through this path, as we * could end up in a deadlock if we are allocating a data or metadata * chunk and there is another task modifying the chunk btree. * * This is because while we are holding the chunk mutex, we will attempt * to add the new chunk item to the chunk btree or update an existing * device item in the chunk btree, while the other task that is modifying * the chunk btree is attempting to COW an extent buffer while holding a * lock on it and on its parent - if the COW operation triggers a system * chunk allocation, then we can deadlock because we are holding the * chunk mutex and we may need to access that extent buffer or its parent * in order to add the chunk item or update a device item. * * Tasks that want to modify the chunk tree should reserve system space * before updating the chunk btree, by calling either * btrfs_reserve_chunk_metadata() or check_system_chunk(). * It's possible that after a task reserves the space, it still ends up * here - this happens in the cases described above at do_chunk_alloc(). * The task will have to either retry or fail. 
*/ if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return -ENOSPC; space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); do { spin_lock(&space_info->lock); if (force < space_info->force_alloc) force = space_info->force_alloc; should_alloc = should_alloc_chunk(fs_info, space_info, force); if (space_info->full) { /* No more free physical space */ if (should_alloc) ret = -ENOSPC; else ret = 0; spin_unlock(&space_info->lock); return ret; } else if (!should_alloc) { spin_unlock(&space_info->lock); return 0; } else if (space_info->chunk_alloc) { /* * Someone is already allocating, so we need to block * until this someone is finished and then loop to * recheck if we should continue with our allocation * attempt. */ wait_for_alloc = true; force = CHUNK_ALLOC_NO_FORCE; spin_unlock(&space_info->lock); mutex_lock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->chunk_mutex); } else { /* Proceed with allocation */ space_info->chunk_alloc = 1; wait_for_alloc = false; spin_unlock(&space_info->lock); } cond_resched(); } while (wait_for_alloc); mutex_lock(&fs_info->chunk_mutex); trans->allocating_chunk = true; /* * If we have mixed data/metadata chunks we want to make sure we keep * allocating mixed chunks instead of individual chunks. */ if (btrfs_mixed_space_info(space_info)) flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); /* * if we're doing a data chunk, go ahead and make sure that * we keep a reasonable number of metadata chunks allocated in the * FS as well. */ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { fs_info->data_chunk_allocations++; if (!(fs_info->data_chunk_allocations % fs_info->metadata_ratio)) force_metadata_allocation(fs_info); } ret_bg = do_chunk_alloc(trans, flags); trans->allocating_chunk = false; if (IS_ERR(ret_bg)) { ret = PTR_ERR(ret_bg); } else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) { /* * New block group is likely to be used soon. Try to activate * it now. Failure is OK for now. */ btrfs_zone_activate(ret_bg); } if (!ret) btrfs_put_block_group(ret_bg); spin_lock(&space_info->lock); if (ret < 0) { if (ret == -ENOSPC) space_info->full = 1; else goto out; } else { ret = 1; space_info->max_extent_size = 0; } space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; out: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); mutex_unlock(&fs_info->chunk_mutex); return ret; } static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) { u64 num_dev; num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; if (!num_dev) num_dev = fs_info->fs_devices->rw_devices; return num_dev; } static void reserve_chunk_space(struct btrfs_trans_handle *trans, u64 bytes, u64 type) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *info; u64 left; int ret = 0; /* * Needed because we can end up allocating a system chunk and for an * atomic and race free space reservation in the chunk block reserve. */ lockdep_assert_held(&fs_info->chunk_mutex); info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); left = info->total_bytes - btrfs_space_info_used(info, true); spin_unlock(&info->lock); if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", left, bytes, type); btrfs_dump_space_info(fs_info, info, 0, 0); } if (left < bytes) { u64 flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *bg; /* * Ignore failure to create system chunk. 
We might end up not * needing it, as we might not need to COW all nodes/leafs from * the paths we visit in the chunk tree (they were already COWed * or created in the current transaction for example). */ bg = btrfs_create_chunk(trans, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); } else { /* * We have a new chunk. We also need to activate it for * zoned filesystem. */ ret = btrfs_zoned_activate_one_bg(fs_info, info, true); if (ret < 0) return; /* * If we fail to add the chunk item here, we end up * trying again at phase 2 of chunk allocation, at * btrfs_create_pending_block_groups(). So ignore * any error here. An ENOSPC here could happen, due to * the cases described at do_chunk_alloc() - the system * block group we just created was just turned into RO * mode by a scrub for example, or a running discard * temporarily removed its free space entries, etc. */ btrfs_chunk_alloc_add_chunk_item(trans, bg); } } if (!ret) { ret = btrfs_block_rsv_add(fs_info, &fs_info->chunk_block_rsv, bytes, BTRFS_RESERVE_NO_FLUSH); if (!ret) trans->chunk_bytes_reserved += bytes; } } /* * Reserve space in the system space for allocating or removing a chunk. * The caller must be holding fs_info->chunk_mutex. */ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) { struct btrfs_fs_info *fs_info = trans->fs_info; const u64 num_devs = get_profile_num_devs(fs_info, type); u64 bytes; /* num_devs device items to update and 1 chunk item to add or remove. */ bytes = btrfs_calc_metadata_size(fs_info, num_devs) + btrfs_calc_insert_metadata_size(fs_info, 1); reserve_chunk_space(trans, bytes, type); } /* * Reserve space in the system space, if needed, for doing a modification to the * chunk btree. * * @trans: A transaction handle. * @is_item_insertion: Indicate if the modification is for inserting a new item * in the chunk btree or if it's for the deletion or update * of an existing item. * * This is used in a context where we need to update the chunk btree outside * block group allocation and removal, to avoid a deadlock with a concurrent * task that is allocating a metadata or data block group and therefore needs to * update the chunk btree while holding the chunk mutex. After the update to the * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called. * */ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, bool is_item_insertion) { struct btrfs_fs_info *fs_info = trans->fs_info; u64 bytes; if (is_item_insertion) bytes = btrfs_calc_insert_metadata_size(fs_info, 1); else bytes = btrfs_calc_metadata_size(fs_info, 1); mutex_lock(&fs_info->chunk_mutex); reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM); mutex_unlock(&fs_info->chunk_mutex); } void btrfs_put_block_group_cache(struct btrfs_fs_info *info) { struct btrfs_block_group *block_group; block_group = btrfs_lookup_first_block_group(info, 0); while (block_group) { btrfs_wait_block_group_cache_done(block_group); spin_lock(&block_group->lock); if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { struct inode *inode = block_group->inode; block_group->inode = NULL; spin_unlock(&block_group->lock); ASSERT(block_group->io_ctl.inode == NULL); iput(inode); } else { spin_unlock(&block_group->lock); } block_group = btrfs_next_block_group(block_group); } } /* * Must be called only after stopping all workers, since we could have block * group caching kthreads running, and therefore they could race with us if we * freed the block groups before stopping them. 
*/ int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_caching_control *caching_ctl; struct rb_node *n; if (btrfs_is_zoned(info)) { if (info->active_meta_bg) { btrfs_put_block_group(info->active_meta_bg); info->active_meta_bg = NULL; } if (info->active_system_bg) { btrfs_put_block_group(info->active_system_bg); info->active_system_bg = NULL; } } write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); btrfs_put_caching_control(caching_ctl); } write_unlock(&info->block_group_cache_lock); spin_lock(&info->unused_bgs_lock); while (!list_empty(&info->unused_bgs)) { block_group = list_first_entry(&info->unused_bgs, struct btrfs_block_group, bg_list); list_del_init(&block_group->bg_list); btrfs_put_block_group(block_group); } while (!list_empty(&info->reclaim_bgs)) { block_group = list_first_entry(&info->reclaim_bgs, struct btrfs_block_group, bg_list); list_del_init(&block_group->bg_list); btrfs_put_block_group(block_group); } spin_unlock(&info->unused_bgs_lock); spin_lock(&info->zone_active_bgs_lock); while (!list_empty(&info->zone_active_bgs)) { block_group = list_first_entry(&info->zone_active_bgs, struct btrfs_block_group, active_bg_list); list_del_init(&block_group->active_bg_list); btrfs_put_block_group(block_group); } spin_unlock(&info->zone_active_bgs_lock); write_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group, cache_node); rb_erase_cached(&block_group->cache_node, &info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); write_unlock(&info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); list_del(&block_group->list); up_write(&block_group->space_info->groups_sem); /* * We haven't cached this block group, which means we could * possibly have excluded extents on this block group. */ if (block_group->cached == BTRFS_CACHE_NO || block_group->cached == BTRFS_CACHE_ERROR) btrfs_free_excluded_extents(block_group); btrfs_remove_free_space_cache(block_group); ASSERT(block_group->cached != BTRFS_CACHE_STARTED); ASSERT(list_empty(&block_group->dirty_list)); ASSERT(list_empty(&block_group->io_list)); ASSERT(list_empty(&block_group->bg_list)); ASSERT(refcount_read(&block_group->refs) == 1); ASSERT(block_group->swap_extents == 0); btrfs_put_block_group(block_group); write_lock(&info->block_group_cache_lock); } write_unlock(&info->block_group_cache_lock); btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); /* * Do not hide this behind enospc_debug, this is actually * important and indicates a real bug if this happens. */ if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0)) btrfs_dump_space_info(info, space_info, 0, 0); /* * If there was a failure to cleanup a log tree, very likely due * to an IO failure on a writeback attempt of one or more of its * extent buffers, we could not do proper (and cheap) unaccounting * of their reserved space, so don't warn on bytes_reserved > 0 in * that case. 
*/ if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { if (WARN_ON(space_info->bytes_reserved > 0)) btrfs_dump_space_info(info, space_info, 0, 0); } WARN_ON(space_info->reclaim_size > 0); list_del(&space_info->list); btrfs_sysfs_remove_space_info(space_info); } return 0; } void btrfs_freeze_block_group(struct btrfs_block_group *cache) { atomic_inc(&cache->frozen); } void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; bool cleanup; spin_lock(&block_group->lock); cleanup = (atomic_dec_and_test(&block_group->frozen) && test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)); spin_unlock(&block_group->lock); if (cleanup) { struct btrfs_chunk_map *map; map = btrfs_find_chunk_map(fs_info, block_group->start, 1); /* Logic error, can't happen. */ ASSERT(map); btrfs_remove_chunk_map(fs_info, map); /* Once for our lookup reference. */ btrfs_free_chunk_map(map); /* * We may have left one free space entry and other possible * tasks trimming this block group have left 1 entry each one. * Free them if any. */ btrfs_remove_free_space_cache(block_group); } } bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg) { bool ret = true; spin_lock(&bg->lock); if (bg->ro) ret = false; else bg->swap_extents++; spin_unlock(&bg->lock); return ret; } void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount) { spin_lock(&bg->lock); ASSERT(!bg->ro); ASSERT(bg->swap_extents >= amount); bg->swap_extents -= amount; spin_unlock(&bg->lock); } enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) { if (size <= SZ_128K) return BTRFS_BG_SZ_SMALL; if (size <= SZ_8M) return BTRFS_BG_SZ_MEDIUM; return BTRFS_BG_SZ_LARGE; } /* * Handle a block group allocating an extent in a size class * * @bg: The block group we allocated in. * @size_class: The size class of the allocation. * @force_wrong_size_class: Whether we are desperate enough to allow * mismatched size classes. * * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the * case of a race that leads to the wrong size class without * force_wrong_size_class set. * * find_free_extent will skip block groups with a mismatched size class until * it really needs to avoid ENOSPC. In that case it will set * force_wrong_size_class. However, if a block group is newly allocated and * doesn't yet have a size class, then it is possible for two allocations of * different sizes to race and both try to use it. The loser is caught here and * has to retry. */ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class) { ASSERT(size_class != BTRFS_BG_SZ_NONE); /* The new allocation is in the right size class, do nothing */ if (bg->size_class == size_class) return 0; /* * The new allocation is in a mismatched size class. * This means one of two things: * * 1. Two tasks in find_free_extent for different size_classes raced * and hit the same empty block_group. Make the loser try again. * 2. A call to find_free_extent got desperate enough to set * 'force_wrong_slab'. Don't change the size_class, but allow the * allocation. */ if (bg->size_class != BTRFS_BG_SZ_NONE) { if (force_wrong_size_class) return 0; return -EAGAIN; } /* * The happy new block group case: the new allocation is the first * one in the block_group so we set size_class. 
*/ bg->size_class = size_class; return 0; } bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) { if (btrfs_is_zoned(bg->fs_info)) return false; if (!btrfs_is_block_group_data_only(bg)) return false; return true; }
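To make the size-class rules above easier to follow, here is a minimal standalone sketch (plain userspace C, built outside the kernel; the enum, struct and function names other than the 128K/8M thresholds are invented for illustration) of the thresholds in btrfs_calc_block_group_size_class() and the accept/claim/-EAGAIN decision in btrfs_use_block_group_size_class():

/*
 * Standalone illustration of the block group size-class rules described
 * above. This is a userspace sketch, not kernel code: the enum and the
 * struct here are stand-ins for the real btrfs types.
 */
#include <stdio.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

enum bg_size_class { BG_SZ_NONE, BG_SZ_SMALL, BG_SZ_MEDIUM, BG_SZ_LARGE };

struct fake_bg {			/* stand-in for struct btrfs_block_group */
	enum bg_size_class size_class;
};

/* Same thresholds as btrfs_calc_block_group_size_class(): 128K and 8M. */
static enum bg_size_class calc_size_class(uint64_t size)
{
	if (size <= 128 * 1024)
		return BG_SZ_SMALL;
	if (size <= 8 * 1024 * 1024)
		return BG_SZ_MEDIUM;
	return BG_SZ_LARGE;
}

/* Mirrors the decision table of btrfs_use_block_group_size_class(). */
static int use_size_class(struct fake_bg *bg, enum bg_size_class size_class,
			  bool force_wrong_size_class)
{
	if (bg->size_class == size_class)
		return 0;			/* right class, nothing to do */
	if (bg->size_class != BG_SZ_NONE)
		return force_wrong_size_class ? 0 : -EAGAIN; /* lost the race */
	bg->size_class = size_class;		/* first allocation claims the class */
	return 0;
}

int main(void)
{
	struct fake_bg bg = { .size_class = BG_SZ_NONE };

	printf("%d\n", use_size_class(&bg, calc_size_class(64 * 1024), false));	/* 0: claims SMALL */
	printf("%d\n", use_size_class(&bg, calc_size_class(16 * 1024 * 1024), false));	/* -EAGAIN: mismatch */
	printf("%d\n", use_size_class(&bg, calc_size_class(16 * 1024 * 1024), true));	/* 0: forced through */
	return 0;
}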
23 23 21 2 2 2 2 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/sun.c * * Code extracted from drivers/block/genhd.c * * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King */ #include "check.h" #define SUN_LABEL_MAGIC 0xDABE #define SUN_VTOC_SANITY 0x600DDEEE enum { SUN_WHOLE_DISK = 5, LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ }; int sun_partition(struct parsed_partitions *state) { int i; __be16 csum; int slot = 1; __be16 *ush; Sector sect; struct sun_disklabel { unsigned char info[128]; /* Informative text string */ struct sun_vtoc { __be32 version; /* Layout version */ char volume[8]; /* Volume name */ __be16 nparts; /* Number of partitions */ struct sun_info { /* Partition hdrs, sec 2 */ __be16 id; __be16 flags; } infos[8]; __be16 padding; /* Alignment padding */ __be32 bootinfo[3]; /* Info needed by mboot */ __be32 sanity; /* To verify vtoc sanity */ __be32 reserved[10]; /* Free space */ __be32 timestamp[8]; /* Partition timestamp */ } vtoc; __be32 write_reinstruct; /* sectors to skip, writes */ __be32 read_reinstruct; /* sectors to skip, reads */ unsigned char spare[148]; /* Padding */ __be16 rspeed; /* Disk rotational speed */ __be16 pcylcount; /* Physical cylinder count */ __be16 sparecyl; /* extra sects per cylinder */ __be16 obs1; /* gap1 */ __be16 obs2; /* gap2 */ __be16 ilfact; /* Interleave factor */ __be16 ncyl; /* Data cylinder count */ __be16 nacyl; /* Alt. cylinder count */ __be16 ntrks; /* Tracks per cylinder */ __be16 nsect; /* Sectors per track */ __be16 obs3; /* bhead - Label head offset */ __be16 obs4; /* ppart - Physical Partition */ struct sun_partition { __be32 start_cylinder; __be32 num_sectors; } partitions[8]; __be16 magic; /* Magic number */ __be16 csum; /* Label xor'd checksum */ } * label; struct sun_partition *p; unsigned long spc; int use_vtoc; int nparts; label = read_part_sector(state, 0, &sect); if (!label) return -1; p = label->partitions; if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { /* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", state->disk->disk_name, be16_to_cpu(label->magic)); */ put_dev_sector(sect); return 0; } /* Look at the checksum */ ush = ((__be16 *) (label+1)) - 1; for (csum = 0; ush >= ((__be16 *) label);) csum ^= *ush--; if (csum) { printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", state->disk->disk_name); put_dev_sector(sect); return 0; } /* Check to see if we can use the VTOC table */ use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) && (be32_to_cpu(label->vtoc.version) == 1) && (be16_to_cpu(label->vtoc.nparts) <= 8)); /* Use 8 partition entries if not specified in validated VTOC */ nparts = (use_vtoc) ? be16_to_cpu(label->vtoc.nparts) : 8; /* * So that old Linux-Sun partitions continue to work, * alow the VTOC to be used under the additional condition ... 
*/ use_vtoc = use_vtoc || !(label->vtoc.sanity || label->vtoc.version || label->vtoc.nparts); spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect); for (i = 0; i < nparts; i++, p++) { unsigned long st_sector; unsigned int num_sectors; st_sector = be32_to_cpu(p->start_cylinder) * spc; num_sectors = be32_to_cpu(p->num_sectors); if (num_sectors) { put_partition(state, slot, st_sector, num_sectors); state->parts[slot].flags = 0; if (use_vtoc) { if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION) state->parts[slot].flags |= ADDPART_FLAG_RAID; else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK) state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK; } } slot++; } strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; }
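The label validation above depends on one convention: the 16-bit XOR of all 256 words in the 512-byte Sun disklabel, including the stored csum word, must be zero. Below is a minimal userspace sketch of that convention (the buffer is plain uint16_t words rather than the __be16 fields of struct sun_disklabel; XOR-to-zero is byte-order independent, so the simplification does not change the check):

/*
 * Userspace sketch of the Sun disklabel checksum convention used above:
 * the XOR of all 256 16-bit words of the 512-byte label must be zero, so
 * a label is "sealed" by storing the XOR of the first 255 words in the
 * final word.
 */
#include <stdio.h>
#include <stdint.h>

#define LABEL_WORDS 256		/* one 512-byte sector as 16-bit words */

static uint16_t label_xor(const uint16_t *label, int nwords)
{
	uint16_t csum = 0;

	for (int i = 0; i < nwords; i++)
		csum ^= label[i];
	return csum;
}

int main(void)
{
	uint16_t label[LABEL_WORDS];

	/* Fill with arbitrary content, then seal the checksum word. */
	for (int i = 0; i < LABEL_WORDS - 1; i++)
		label[i] = (uint16_t)(i * 2654435761u);
	label[LABEL_WORDS - 1] = label_xor(label, LABEL_WORDS - 1);

	/* A valid label XORs to zero, exactly what sun_partition() checks. */
	printf("whole-label checksum: %u\n", (unsigned)label_xor(label, LABEL_WORDS));

	/* Flip some bits and the check fails, as for a corrupted label. */
	label[10] ^= 0xff;
	printf("after corruption: %u\n", (unsigned)label_xor(label, LABEL_WORDS));
	return 0;
}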
/* mpicoder.c - Coder for the external representation of MPIs * Copyright (C) 1998, 1999 Free Software Foundation, Inc. * * This file is part of GnuPG. * * GnuPG is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version.
* * GnuPG is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ #include <linux/bitops.h> #include <linux/count_zeros.h> #include <linux/byteorder/generic.h> #include <linux/scatterlist.h> #include <linux/string.h> #include "mpi-internal.h" #define MAX_EXTERN_SCAN_BYTES (16*1024*1024) #define MAX_EXTERN_MPI_BITS 16384 /** * mpi_read_raw_data - Read a raw byte stream as a positive integer * @xbuffer: The data to read * @nbytes: The amount of data to read */ MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes) { const uint8_t *buffer = xbuffer; int i, j; unsigned nbits, nlimbs; mpi_limb_t a; MPI val = NULL; while (nbytes > 0 && buffer[0] == 0) { buffer++; nbytes--; } nbits = nbytes * 8; if (nbits > MAX_EXTERN_MPI_BITS) { pr_info("MPI: mpi too large (%u bits)\n", nbits); return NULL; } if (nbytes > 0) nbits -= count_leading_zeros(buffer[0]) - (BITS_PER_LONG - 8); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); if (!val) return NULL; val->nbits = nbits; val->sign = 0; val->nlimbs = nlimbs; if (nbytes > 0) { i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; i %= BYTES_PER_MPI_LIMB; for (j = nlimbs; j > 0; j--) { a = 0; for (; i < BYTES_PER_MPI_LIMB; i++) { a <<= 8; a |= *buffer++; } i = 0; val->d[j - 1] = a; } } return val; } EXPORT_SYMBOL_GPL(mpi_read_raw_data); MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) { const uint8_t *buffer = xbuffer; unsigned int nbits, nbytes; MPI val; if (*ret_nread < 2) return ERR_PTR(-EINVAL); nbits = buffer[0] << 8 | buffer[1]; if (nbits > MAX_EXTERN_MPI_BITS) { pr_info("MPI: mpi too large (%u bits)\n", nbits); return ERR_PTR(-EINVAL); } nbytes = DIV_ROUND_UP(nbits, 8); if (nbytes + 2 > *ret_nread) { pr_info("MPI: mpi larger than buffer nbytes=%u ret_nread=%u\n", nbytes, *ret_nread); return ERR_PTR(-EINVAL); } val = mpi_read_raw_data(buffer + 2, nbytes); if (!val) return ERR_PTR(-ENOMEM); *ret_nread = nbytes + 2; return val; } EXPORT_SYMBOL_GPL(mpi_read_from_buffer); /**************** * Fill the mpi VAL from the hex string in STR. */ int mpi_fromstr(MPI val, const char *str) { int sign = 0; int prepend_zero = 0; int i, j, c, c1, c2; unsigned int nbits, nbytes, nlimbs; mpi_limb_t a; if (*str == '-') { sign = 1; str++; } /* Skip optional hex prefix. 
*/ if (*str == '0' && str[1] == 'x') str += 2; nbits = strlen(str); if (nbits > MAX_EXTERN_SCAN_BYTES) { mpi_clear(val); return -EINVAL; } nbits *= 4; if ((nbits % 8)) prepend_zero = 1; nbytes = (nbits+7) / 8; nlimbs = (nbytes+BYTES_PER_MPI_LIMB-1) / BYTES_PER_MPI_LIMB; if (val->alloced < nlimbs) mpi_resize(val, nlimbs); i = BYTES_PER_MPI_LIMB - (nbytes % BYTES_PER_MPI_LIMB); i %= BYTES_PER_MPI_LIMB; j = val->nlimbs = nlimbs; val->sign = sign; for (; j > 0; j--) { a = 0; for (; i < BYTES_PER_MPI_LIMB; i++) { if (prepend_zero) { c1 = '0'; prepend_zero = 0; } else c1 = *str++; if (!c1) { mpi_clear(val); return -EINVAL; } c2 = *str++; if (!c2) { mpi_clear(val); return -EINVAL; } if (c1 >= '0' && c1 <= '9') c = c1 - '0'; else if (c1 >= 'a' && c1 <= 'f') c = c1 - 'a' + 10; else if (c1 >= 'A' && c1 <= 'F') c = c1 - 'A' + 10; else { mpi_clear(val); return -EINVAL; } c <<= 4; if (c2 >= '0' && c2 <= '9') c |= c2 - '0'; else if (c2 >= 'a' && c2 <= 'f') c |= c2 - 'a' + 10; else if (c2 >= 'A' && c2 <= 'F') c |= c2 - 'A' + 10; else { mpi_clear(val); return -EINVAL; } a <<= 8; a |= c; } i = 0; val->d[j-1] = a; } return 0; } EXPORT_SYMBOL_GPL(mpi_fromstr); MPI mpi_scanval(const char *string) { MPI a; a = mpi_alloc(0); if (!a) return NULL; if (mpi_fromstr(a, string)) { mpi_free(a); return NULL; } mpi_normalize(a); return a; } EXPORT_SYMBOL_GPL(mpi_scanval); static int count_lzeros(MPI a) { mpi_limb_t alimb; int i, lzeros = 0; for (i = a->nlimbs - 1; i >= 0; i--) { alimb = a->d[i]; if (alimb == 0) { lzeros += sizeof(mpi_limb_t); } else { lzeros += count_leading_zeros(alimb) / 8; break; } } return lzeros; } /** * mpi_read_buffer() - read MPI to a buffer provided by user (msb first) * * @a: a multi precision integer * @buf: buffer to which the output will be written to. Needs to be at * least mpi_get_size(a) long. * @buf_len: size of the buf. * @nbytes: receives the actual length of the data written on success and * the data to-be-written on -EOVERFLOW in case buf_len was too * small. * @sign: if not NULL, it will be set to the sign of a. * * Return: 0 on success or error code in case of error */ int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes, int *sign) { uint8_t *p; #if BYTES_PER_MPI_LIMB == 4 __be32 alimb; #elif BYTES_PER_MPI_LIMB == 8 __be64 alimb; #else #error please implement for this limb size. #endif unsigned int n = mpi_get_size(a); int i, lzeros; if (!buf || !nbytes) return -EINVAL; if (sign) *sign = a->sign; lzeros = count_lzeros(a); if (buf_len < n - lzeros) { *nbytes = n - lzeros; return -EOVERFLOW; } p = buf; *nbytes = n - lzeros; for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB, lzeros %= BYTES_PER_MPI_LIMB; i >= 0; i--) { #if BYTES_PER_MPI_LIMB == 4 alimb = cpu_to_be32(a->d[i]); #elif BYTES_PER_MPI_LIMB == 8 alimb = cpu_to_be64(a->d[i]); #else #error please implement for this limb size. #endif memcpy(p, (u8 *)&alimb + lzeros, BYTES_PER_MPI_LIMB - lzeros); p += BYTES_PER_MPI_LIMB - lzeros; lzeros = 0; } return 0; } EXPORT_SYMBOL_GPL(mpi_read_buffer); /* * mpi_get_buffer() - Returns an allocated buffer with the MPI (msb first). * Caller must free the return string. * This function does return a 0 byte buffer with nbytes set to zero if the * value of A is zero. * * @a: a multi precision integer. * @nbytes: receives the length of this buffer. * @sign: if not NULL, it will be set to the sign of the a. 
* * Return: Pointer to MPI buffer or NULL on error */ void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign) { uint8_t *buf; unsigned int n; int ret; if (!nbytes) return NULL; n = mpi_get_size(a); if (!n) n++; buf = kmalloc(n, GFP_KERNEL); if (!buf) return NULL; ret = mpi_read_buffer(a, buf, n, nbytes, sign); if (ret) { kfree(buf); return NULL; } return buf; } EXPORT_SYMBOL_GPL(mpi_get_buffer); /** * mpi_write_to_sgl() - Funnction exports MPI to an sgl (msb first) * * This function works in the same way as the mpi_read_buffer, but it * takes an sgl instead of u8 * buf. * * @a: a multi precision integer * @sgl: scatterlist to write to. Needs to be at least * mpi_get_size(a) long. * @nbytes: the number of bytes to write. Leading bytes will be * filled with zero. * @sign: if not NULL, it will be set to the sign of a. * * Return: 0 on success or error code in case of error */ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned nbytes, int *sign) { u8 *p, *p2; #if BYTES_PER_MPI_LIMB == 4 __be32 alimb; #elif BYTES_PER_MPI_LIMB == 8 __be64 alimb; #else #error please implement for this limb size. #endif unsigned int n = mpi_get_size(a); struct sg_mapping_iter miter; int i, x, buf_len; int nents; if (sign) *sign = a->sign; if (nbytes < n) return -EOVERFLOW; nents = sg_nents_for_len(sgl, nbytes); if (nents < 0) return -EINVAL; sg_miter_start(&miter, sgl, nents, SG_MITER_ATOMIC | SG_MITER_TO_SG); sg_miter_next(&miter); buf_len = miter.length; p2 = miter.addr; while (nbytes > n) { i = min_t(unsigned, nbytes - n, buf_len); memset(p2, 0, i); p2 += i; nbytes -= i; buf_len -= i; if (!buf_len) { sg_miter_next(&miter); buf_len = miter.length; p2 = miter.addr; } } for (i = a->nlimbs - 1; i >= 0; i--) { #if BYTES_PER_MPI_LIMB == 4 alimb = a->d[i] ? cpu_to_be32(a->d[i]) : 0; #elif BYTES_PER_MPI_LIMB == 8 alimb = a->d[i] ? cpu_to_be64(a->d[i]) : 0; #else #error please implement for this limb size. #endif p = (u8 *)&alimb; for (x = 0; x < sizeof(alimb); x++) { *p2++ = *p++; if (!--buf_len) { sg_miter_next(&miter); buf_len = miter.length; p2 = miter.addr; } } } sg_miter_stop(&miter); return 0; } EXPORT_SYMBOL_GPL(mpi_write_to_sgl); /* * mpi_read_raw_from_sgl() - Function allocates an MPI and populates it with * data from the sgl * * This function works in the same way as the mpi_read_raw_data, but it * takes an sgl instead of void * buffer. i.e. it allocates * a new MPI and reads the content of the sgl to the MPI. 
* * @sgl: scatterlist to read from * @nbytes: number of bytes to read * * Return: Pointer to a new MPI or NULL on error */ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) { struct sg_mapping_iter miter; unsigned int nbits, nlimbs; int x, j, z, lzeros, ents; unsigned int len; const u8 *buff; mpi_limb_t a; MPI val = NULL; ents = sg_nents_for_len(sgl, nbytes); if (ents < 0) return NULL; sg_miter_start(&miter, sgl, ents, SG_MITER_ATOMIC | SG_MITER_FROM_SG); lzeros = 0; len = 0; while (nbytes > 0) { while (len && !*buff) { lzeros++; len--; buff++; } if (len && *buff) break; sg_miter_next(&miter); buff = miter.addr; len = miter.length; nbytes -= lzeros; lzeros = 0; } miter.consumed = lzeros; nbytes -= lzeros; nbits = nbytes * 8; if (nbits > MAX_EXTERN_MPI_BITS) { sg_miter_stop(&miter); pr_info("MPI: mpi too large (%u bits)\n", nbits); return NULL; } if (nbytes > 0) nbits -= count_leading_zeros(*buff) - (BITS_PER_LONG - 8); sg_miter_stop(&miter); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); if (!val) return NULL; val->nbits = nbits; val->sign = 0; val->nlimbs = nlimbs; if (nbytes == 0) return val; j = nlimbs - 1; a = 0; z = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; z %= BYTES_PER_MPI_LIMB; while (sg_miter_next(&miter)) { buff = miter.addr; len = min_t(unsigned, miter.length, nbytes); nbytes -= len; for (x = 0; x < len; x++) { a <<= 8; a |= *buff++; if (((z + x + 1) % BYTES_PER_MPI_LIMB) == 0) { val->d[j--] = a; a = 0; } } z += x; } return val; } EXPORT_SYMBOL_GPL(mpi_read_raw_from_sgl); /* Perform a two's complement operation on buffer P of size N bytes. */ static void twocompl(unsigned char *p, unsigned int n) { int i; for (i = n-1; i >= 0 && !p[i]; i--) ; if (i >= 0) { if ((p[i] & 0x01)) p[i] = (((p[i] ^ 0xfe) | 0x01) & 0xff); else if ((p[i] & 0x02)) p[i] = (((p[i] ^ 0xfc) | 0x02) & 0xfe); else if ((p[i] & 0x04)) p[i] = (((p[i] ^ 0xf8) | 0x04) & 0xfc); else if ((p[i] & 0x08)) p[i] = (((p[i] ^ 0xf0) | 0x08) & 0xf8); else if ((p[i] & 0x10)) p[i] = (((p[i] ^ 0xe0) | 0x10) & 0xf0); else if ((p[i] & 0x20)) p[i] = (((p[i] ^ 0xc0) | 0x20) & 0xe0); else if ((p[i] & 0x40)) p[i] = (((p[i] ^ 0x80) | 0x40) & 0xc0); else p[i] = 0x80; for (i--; i >= 0; i--) p[i] ^= 0xff; } } int mpi_print(enum gcry_mpi_format format, unsigned char *buffer, size_t buflen, size_t *nwritten, MPI a) { unsigned int nbits = mpi_get_nbits(a); size_t len; size_t dummy_nwritten; int negative; if (!nwritten) nwritten = &dummy_nwritten; /* Libgcrypt does no always care to set clear the sign if the value * is 0. For printing this is a bit of a surprise, in particular * because if some of the formats don't support negative numbers but * should be able to print a zero. Thus we need this extra test * for a negative number. */ if (a->sign && mpi_cmp_ui(a, 0)) negative = 1; else negative = 0; len = buflen; *nwritten = 0; if (format == GCRYMPI_FMT_STD) { unsigned char *tmp; int extra = 0; unsigned int n; tmp = mpi_get_buffer(a, &n, NULL); if (!tmp) return -EINVAL; if (negative) { twocompl(tmp, n); if (!(*tmp & 0x80)) { /* Need to extend the sign. */ n++; extra = 2; } } else if (n && (*tmp & 0x80)) { /* Positive but the high bit of the returned buffer is set. * Thus we need to print an extra leading 0x00 so that the * output is interpreted as a positive number. */ n++; extra = 1; } if (buffer && n > len) { /* The provided buffer is too short. 
*/ kfree(tmp); return -E2BIG; } if (buffer) { unsigned char *s = buffer; if (extra == 1) *s++ = 0; else if (extra) *s++ = 0xff; memcpy(s, tmp, n-!!extra); } kfree(tmp); *nwritten = n; return 0; } else if (format == GCRYMPI_FMT_USG) { unsigned int n = (nbits + 7)/8; /* Note: We ignore the sign for this format. */ /* FIXME: for performance reasons we should put this into * mpi_aprint because we can then use the buffer directly. */ if (buffer && n > len) return -E2BIG; if (buffer) { unsigned char *tmp; tmp = mpi_get_buffer(a, &n, NULL); if (!tmp) return -EINVAL; memcpy(buffer, tmp, n); kfree(tmp); } *nwritten = n; return 0; } else if (format == GCRYMPI_FMT_PGP) { unsigned int n = (nbits + 7)/8; /* The PGP format can only handle unsigned integers. */ if (negative) return -EINVAL; if (buffer && n+2 > len) return -E2BIG; if (buffer) { unsigned char *tmp; unsigned char *s = buffer; s[0] = nbits >> 8; s[1] = nbits; tmp = mpi_get_buffer(a, &n, NULL); if (!tmp) return -EINVAL; memcpy(s+2, tmp, n); kfree(tmp); } *nwritten = n+2; return 0; } else if (format == GCRYMPI_FMT_SSH) { unsigned char *tmp; int extra = 0; unsigned int n; tmp = mpi_get_buffer(a, &n, NULL); if (!tmp) return -EINVAL; if (negative) { twocompl(tmp, n); if (!(*tmp & 0x80)) { /* Need to extend the sign. */ n++; extra = 2; } } else if (n && (*tmp & 0x80)) { n++; extra = 1; } if (buffer && n+4 > len) { kfree(tmp); return -E2BIG; } if (buffer) { unsigned char *s = buffer; *s++ = n >> 24; *s++ = n >> 16; *s++ = n >> 8; *s++ = n; if (extra == 1) *s++ = 0; else if (extra) *s++ = 0xff; memcpy(s, tmp, n-!!extra); } kfree(tmp); *nwritten = 4+n; return 0; } else if (format == GCRYMPI_FMT_HEX) { unsigned char *tmp; int i; int extra = 0; unsigned int n = 0; tmp = mpi_get_buffer(a, &n, NULL); if (!tmp) return -EINVAL; if (!n || (*tmp & 0x80)) extra = 2; if (buffer && 2*n + extra + negative + 1 > len) { kfree(tmp); return -E2BIG; } if (buffer) { unsigned char *s = buffer; if (negative) *s++ = '-'; if (extra) { *s++ = '0'; *s++ = '0'; } for (i = 0; i < n; i++) { unsigned int c = tmp[i]; *s++ = (c >> 4) < 10 ? '0'+(c>>4) : 'A'+(c>>4)-10; c &= 15; *s++ = c < 10 ? '0'+c : 'A'+c-10; } *s++ = 0; *nwritten = s - buffer; } else { *nwritten = 2*n + extra + negative + 1; } kfree(tmp); return 0; } else return -EINVAL; } EXPORT_SYMBOL_GPL(mpi_print);
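The bit chain in twocompl() is just an unrolled way of negating a big-endian byte string: trailing zero bytes stay as they are, the lowest non-zero byte is negated, and every byte above it is inverted, which matches the textbook "invert everything and add one". A standalone userspace sketch of that equivalence follows (the function names are made up for illustration; this is not the kernel helper itself):

/*
 * Userspace sketch: two's complement of an n-byte big-endian buffer,
 * written two equivalent ways. twocompl_simple() is "invert all bytes,
 * then add 1 with carry from the least significant byte";
 * twocompl_skip() mirrors the structure of twocompl() above: trailing
 * zero bytes stay zero, the lowest non-zero byte is negated, every byte
 * above it is inverted.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

static void twocompl_simple(uint8_t *p, unsigned int n)
{
	unsigned int carry = 1;

	for (int i = (int)n - 1; i >= 0; i--) {
		unsigned int v = (uint8_t)~p[i] + carry;

		p[i] = (uint8_t)v;
		carry = v >> 8;
	}
}

static void twocompl_skip(uint8_t *p, unsigned int n)
{
	int i;

	for (i = (int)n - 1; i >= 0 && !p[i]; i--)
		;				/* trailing zero bytes are unchanged */
	if (i < 0)
		return;				/* value is zero: nothing to do */
	p[i] = (uint8_t)(-p[i]);		/* negate the lowest non-zero byte */
	for (i--; i >= 0; i--)
		p[i] ^= 0xff;			/* invert everything above it */
}

int main(void)
{
	uint8_t a[4] = { 0x01, 0x23, 0x45, 0x00 };
	uint8_t b[4];

	memcpy(b, a, sizeof(a));
	twocompl_simple(a, sizeof(a));
	twocompl_skip(b, sizeof(b));
	printf("match: %d\n", memcmp(a, b, sizeof(a)) == 0);		/* expect 1 */
	printf("%02x %02x %02x %02x\n", a[0], a[1], a[2], a[3]);	/* fe dc bb 00 */
	return 0;
}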
103 103 103 11 103 11 103 103 103 103 103 103 103 103 103 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 // SPDX-License-Identifier: GPL-2.0 /* * driver.c - centralized device driver management * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007 Novell Inc. */ #include <linux/device/driver.h> #include <linux/device.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/sysfs.h> #include "base.h" static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct device *dev = NULL; struct device_private *dev_prv; if (n) { dev_prv = to_device_private_driver(n); dev = dev_prv->device; } return dev; } /** * driver_set_override() - Helper to set or clear driver override. * @dev: Device to change * @override: Address of string to change (e.g. &device->driver_override); * The contents will be freed and hold newly allocated override. * @s: NUL-terminated string, new driver name to force a match, pass empty * string to clear it ("" or "\n", where the latter is only for sysfs * interface). * @len: length of @s * * Helper to set or clear driver override in a device, intended for the cases * when the driver_override field is allocated by driver/bus code. * * Returns: 0 on success or a negative error code on failure. */ int driver_set_override(struct device *dev, const char **override, const char *s, size_t len) { const char *new, *old; char *cp; if (!override || !s) return -EINVAL; /* * The stored value will be used in sysfs show callback (sysfs_emit()), * which has a length limit of PAGE_SIZE and adds a trailing newline. * Thus we can store one character less to avoid truncation during sysfs * show. */ if (len >= (PAGE_SIZE - 1)) return -EINVAL; /* * Compute the real length of the string in case userspace sends us a * bunch of \0 characters like python likes to do. */ len = strlen(s); if (!len) { /* Empty string passed - clear override */ device_lock(dev); old = *override; *override = NULL; device_unlock(dev); kfree(old); return 0; } cp = strnchr(s, len, '\n'); if (cp) len = cp - s; new = kstrndup(s, len, GFP_KERNEL); if (!new) return -ENOMEM; device_lock(dev); old = *override; if (cp != s) { *override = new; } else { /* "\n" passed - clear override */ kfree(new); *override = NULL; } device_unlock(dev); kfree(old); return 0; } EXPORT_SYMBOL_GPL(driver_set_override); /** * driver_for_each_device - Iterator for devices bound to a driver. 
* @drv: Driver we're iterating. * @start: Device to begin with * @data: Data to pass to the callback. * @fn: Function to call for each device. * * Iterate over the @drv's list of devices calling @fn for each one. */ int driver_for_each_device(struct device_driver *drv, struct device *start, void *data, int (*fn)(struct device *, void *)) { struct klist_iter i; struct device *dev; int error = 0; if (!drv) return -EINVAL; klist_iter_init_node(&drv->p->klist_devices, &i, start ? &start->p->knode_driver : NULL); while (!error && (dev = next_device(&i))) error = fn(dev, data); klist_iter_exit(&i); return error; } EXPORT_SYMBOL_GPL(driver_for_each_device); /** * driver_find_device - device iterator for locating a particular device. * @drv: The device's driver * @start: Device to begin with * @data: Data to pass to match function * @match: Callback function to check device * * This is similar to the driver_for_each_device() function above, but * it returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. */ struct device *driver_find_device(struct device_driver *drv, struct device *start, const void *data, int (*match)(struct device *dev, const void *data)) { struct klist_iter i; struct device *dev; if (!drv || !drv->p) return NULL; klist_iter_init_node(&drv->p->klist_devices, &i, (start ? &start->p->knode_driver : NULL)); while ((dev = next_device(&i))) if (match(dev, data) && get_device(dev)) break; klist_iter_exit(&i); return dev; } EXPORT_SYMBOL_GPL(driver_find_device); /** * driver_create_file - create sysfs file for driver. * @drv: driver. * @attr: driver attribute descriptor. */ int driver_create_file(struct device_driver *drv, const struct driver_attribute *attr) { int error; if (drv) error = sysfs_create_file(&drv->p->kobj, &attr->attr); else error = -EINVAL; return error; } EXPORT_SYMBOL_GPL(driver_create_file); /** * driver_remove_file - remove sysfs file for driver. * @drv: driver. * @attr: driver attribute descriptor. */ void driver_remove_file(struct device_driver *drv, const struct driver_attribute *attr) { if (drv) sysfs_remove_file(&drv->p->kobj, &attr->attr); } EXPORT_SYMBOL_GPL(driver_remove_file); int driver_add_groups(struct device_driver *drv, const struct attribute_group **groups) { return sysfs_create_groups(&drv->p->kobj, groups); } void driver_remove_groups(struct device_driver *drv, const struct attribute_group **groups) { sysfs_remove_groups(&drv->p->kobj, groups); } /** * driver_register - register driver with bus * @drv: driver to register * * We pass off most of the work to the bus_add_driver() call, * since most of the things we have to do deal with the bus * structures. 
*/ int driver_register(struct device_driver *drv) { int ret; struct device_driver *other; if (!bus_is_registered(drv->bus)) { pr_err("Driver '%s' was unable to register with bus_type '%s' because the bus was not initialized.\n", drv->name, drv->bus->name); return -EINVAL; } if ((drv->bus->probe && drv->probe) || (drv->bus->remove && drv->remove) || (drv->bus->shutdown && drv->shutdown)) pr_warn("Driver '%s' needs updating - please use " "bus_type methods\n", drv->name); other = driver_find(drv->name, drv->bus); if (other) { pr_err("Error: Driver '%s' is already registered, " "aborting...\n", drv->name); return -EBUSY; } ret = bus_add_driver(drv); if (ret) return ret; ret = driver_add_groups(drv, drv->groups); if (ret) { bus_remove_driver(drv); return ret; } kobject_uevent(&drv->p->kobj, KOBJ_ADD); deferred_probe_extend_timeout(); return ret; } EXPORT_SYMBOL_GPL(driver_register); /** * driver_unregister - remove driver from system. * @drv: driver. * * Again, we pass off most of the work to the bus-level call. */ void driver_unregister(struct device_driver *drv) { if (!drv || !drv->p) { WARN(1, "Unexpected driver unregister!\n"); return; } driver_remove_groups(drv, drv->groups); bus_remove_driver(drv); } EXPORT_SYMBOL_GPL(driver_unregister);
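driver_set_override() has two string-handling quirks that are easy to miss: the value is truncated at the first newline (so a sysfs write of "name\n" behaves like "name"), and an empty string or a lone "\n" clears the override. Here is a userspace sketch of just those parsing rules (plain malloc/free and memchr/strndup stand in for kstrndup()/kfree()/strnchr(), device_lock() and the PAGE_SIZE length cap are omitted, and set_override() with its variables are invented names, not the kernel API):

/*
 * Userspace sketch of the string rules implemented by
 * driver_set_override() above: truncate at the first newline, treat ""
 * or "\n" as "clear the override", otherwise store a copy.
 */
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

static int set_override(char **override, const char *s)
{
	size_t len;
	const char *nl;
	char *new;

	if (!override || !s)
		return -EINVAL;

	len = strlen(s);
	if (!len) {				/* "" clears the override */
		free(*override);
		*override = NULL;
		return 0;
	}

	nl = memchr(s, '\n', len);
	if (nl == s) {				/* a lone "\n" also clears it */
		free(*override);
		*override = NULL;
		return 0;
	}
	if (nl)
		len = (size_t)(nl - s);		/* drop the trailing newline */

	new = strndup(s, len);
	if (!new)
		return -ENOMEM;
	free(*override);
	*override = new;
	return 0;
}

int main(void)
{
	char *override = NULL;

	set_override(&override, "vfio-pci\n");
	printf("override = %s\n", override ? override : "(none)");	/* vfio-pci */
	set_override(&override, "\n");
	printf("override = %s\n", override ? override : "(none)");	/* (none) */
	free(override);
	return 0;
}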
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IRQ_H #define _LINUX_IRQ_H /* * Please do not include this file in generic code. There is currently * no requirement for any architecture to implement anything held * within this file. * * Thanks. --rmk */ #include <linux/cache.h> #include <linux/spinlock.h> #include <linux/cpumask.h> #include <linux/irqhandler.h> #include <linux/irqreturn.h> #include <linux/irqnr.h> #include <linux/topology.h> #include <linux/io.h> #include <linux/slab.h> #include <asm/irq.h> #include <asm/ptrace.h> #include <asm/irq_regs.h> struct seq_file; struct module; struct msi_msg; struct irq_affinity_desc; enum irqchip_irq_state; /* * IRQ line status. * * Bits 0-7 are the same as the IRQF_* bits in linux/interrupt.h * * IRQ_TYPE_NONE - default, unspecified type * IRQ_TYPE_EDGE_RISING - rising edge triggered * IRQ_TYPE_EDGE_FALLING - falling edge triggered * IRQ_TYPE_EDGE_BOTH - rising and falling edge triggered * IRQ_TYPE_LEVEL_HIGH - high level triggered * IRQ_TYPE_LEVEL_LOW - low level triggered * IRQ_TYPE_LEVEL_MASK - Mask to filter out the level bits * IRQ_TYPE_SENSE_MASK - Mask for all the above bits * IRQ_TYPE_DEFAULT - For use by some PICs to ask irq_set_type * to setup the HW to a sane default (used * by irqdomain map() callbacks to synchronize * the HW state and SW flags for a newly * allocated descriptor). * * IRQ_TYPE_PROBE - Special flag for probing in progress * * Bits which can be modified via irq_set/clear/modify_status_flags() * IRQ_LEVEL - Interrupt is level type. Will be also * updated in the code when the above trigger * bits are modified via irq_set_irq_type() * IRQ_PER_CPU - Mark an interrupt PER_CPU.
Will protect * it from affinity setting * IRQ_NOPROBE - Interrupt cannot be probed by autoprobing * IRQ_NOREQUEST - Interrupt cannot be requested via * request_irq() * IRQ_NOTHREAD - Interrupt cannot be threaded * IRQ_NOAUTOEN - Interrupt is not automatically enabled in * request/setup_irq() * IRQ_NO_BALANCING - Interrupt cannot be balanced (affinity set) * IRQ_MOVE_PCNTXT - Interrupt can be migrated from process context * IRQ_NESTED_THREAD - Interrupt nests into another thread * IRQ_PER_CPU_DEVID - Dev_id is a per-cpu variable * IRQ_IS_POLLED - Always polled by another interrupt. Exclude * it from the spurious interrupt detection * mechanism and from core side polling. * IRQ_DISABLE_UNLAZY - Disable lazy irq disable * IRQ_HIDDEN - Don't show up in /proc/interrupts * IRQ_NO_DEBUG - Exclude from note_interrupt() debugging */ enum { IRQ_TYPE_NONE = 0x00000000, IRQ_TYPE_EDGE_RISING = 0x00000001, IRQ_TYPE_EDGE_FALLING = 0x00000002, IRQ_TYPE_EDGE_BOTH = (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING), IRQ_TYPE_LEVEL_HIGH = 0x00000004, IRQ_TYPE_LEVEL_LOW = 0x00000008, IRQ_TYPE_LEVEL_MASK = (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH), IRQ_TYPE_SENSE_MASK = 0x0000000f, IRQ_TYPE_DEFAULT = IRQ_TYPE_SENSE_MASK, IRQ_TYPE_PROBE = 0x00000010, IRQ_LEVEL = (1 << 8), IRQ_PER_CPU = (1 << 9), IRQ_NOPROBE = (1 << 10), IRQ_NOREQUEST = (1 << 11), IRQ_NOAUTOEN = (1 << 12), IRQ_NO_BALANCING = (1 << 13), IRQ_MOVE_PCNTXT = (1 << 14), IRQ_NESTED_THREAD = (1 << 15), IRQ_NOTHREAD = (1 << 16), IRQ_PER_CPU_DEVID = (1 << 17), IRQ_IS_POLLED = (1 << 18), IRQ_DISABLE_UNLAZY = (1 << 19), IRQ_HIDDEN = (1 << 20), IRQ_NO_DEBUG = (1 << 21), }; #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \ IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_HIDDEN) #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) /* * Return value for chip->irq_set_affinity() * * IRQ_SET_MASK_OK - OK, core updates irq_common_data.affinity * IRQ_SET_MASK_NOCPY - OK, chip did update irq_common_data.affinity * IRQ_SET_MASK_OK_DONE - Same as IRQ_SET_MASK_OK for core. Special code to * support stacked irqchips, which indicates skipping * all descendant irqchips. */ enum { IRQ_SET_MASK_OK = 0, IRQ_SET_MASK_OK_NOCOPY, IRQ_SET_MASK_OK_DONE, }; struct msi_desc; struct irq_domain; /** * struct irq_common_data - per irq data shared by all irqchips * @state_use_accessors: status information for irq chip functions. * Use accessor functions to deal with it * @node: node index useful for balancing * @handler_data: per-IRQ data for the irq_chip methods * @affinity: IRQ affinity on SMP. If this is an IPI * related irq, then this is the mask of the * CPUs to which an IPI can be sent. * @effective_affinity: The effective IRQ affinity on SMP as some irq * chips do not allow multi CPU destinations. * A subset of @affinity. * @msi_desc: MSI descriptor * @ipi_offset: Offset of first IPI target cpu in @affinity. Optional. 
*/ struct irq_common_data { unsigned int __private state_use_accessors; #ifdef CONFIG_NUMA unsigned int node; #endif void *handler_data; struct msi_desc *msi_desc; #ifdef CONFIG_SMP cpumask_var_t affinity; #endif #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK cpumask_var_t effective_affinity; #endif #ifdef CONFIG_GENERIC_IRQ_IPI unsigned int ipi_offset; #endif }; /** * struct irq_data - per irq chip data passed down to chip functions * @mask: precomputed bitmask for accessing the chip registers * @irq: interrupt number * @hwirq: hardware interrupt number, local to the interrupt domain * @common: point to data shared by all irqchips * @chip: low level interrupt hardware access * @domain: Interrupt translation domain; responsible for mapping * between hwirq number and linux irq number. * @parent_data: pointer to parent struct irq_data to support hierarchy * irq_domain * @chip_data: platform-specific per-chip private data for the chip * methods, to allow shared chip implementations */ struct irq_data { u32 mask; unsigned int irq; unsigned long hwirq; struct irq_common_data *common; struct irq_chip *chip; struct irq_domain *domain; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_data *parent_data; #endif void *chip_data; }; /* * Bit masks for irq_common_data.state_use_accessors * * IRQD_TRIGGER_MASK - Mask for the trigger type bits * IRQD_SETAFFINITY_PENDING - Affinity setting is pending * IRQD_ACTIVATED - Interrupt has already been activated * IRQD_NO_BALANCING - Balancing disabled for this IRQ * IRQD_PER_CPU - Interrupt is per cpu * IRQD_AFFINITY_SET - Interrupt affinity was set * IRQD_LEVEL - Interrupt is level triggered * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup * from suspend * IRQD_MOVE_PCNTXT - Interrupt can be moved in process * context * IRQD_IRQ_DISABLED - Disabled state of the interrupt * IRQD_IRQ_MASKED - Masked state of the interrupt * IRQD_IRQ_INPROGRESS - In progress state of the interrupt * IRQD_WAKEUP_ARMED - Wakeup mode armed * IRQD_FORWARDED_TO_VCPU - The interrupt is forwarded to a VCPU * IRQD_AFFINITY_MANAGED - Affinity is auto-managed by the kernel * IRQD_IRQ_STARTED - Startup state of the interrupt * IRQD_MANAGED_SHUTDOWN - Interrupt was shutdown due to empty affinity * mask. Applies only to affinity managed irqs. * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set * IRQD_CAN_RESERVE - Can use reservation mode * IRQD_HANDLE_ENFORCE_IRQCTX - Enforce that handle_irq_*() is only invoked * from actual interrupt context. * IRQD_AFFINITY_ON_ACTIVATE - Affinity is set on activation. Don't call * irq_chip::irq_set_affinity() when deactivated. * IRQD_IRQ_ENABLED_ON_SUSPEND - Interrupt is enabled on suspend by irq pm if * irqchip have flag IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND set. * IRQD_RESEND_WHEN_IN_PROGRESS - Interrupt may fire when already in progress in which * case it must be resent at the next available opportunity. 
*/ enum { IRQD_TRIGGER_MASK = 0xf, IRQD_SETAFFINITY_PENDING = BIT(8), IRQD_ACTIVATED = BIT(9), IRQD_NO_BALANCING = BIT(10), IRQD_PER_CPU = BIT(11), IRQD_AFFINITY_SET = BIT(12), IRQD_LEVEL = BIT(13), IRQD_WAKEUP_STATE = BIT(14), IRQD_MOVE_PCNTXT = BIT(15), IRQD_IRQ_DISABLED = BIT(16), IRQD_IRQ_MASKED = BIT(17), IRQD_IRQ_INPROGRESS = BIT(18), IRQD_WAKEUP_ARMED = BIT(19), IRQD_FORWARDED_TO_VCPU = BIT(20), IRQD_AFFINITY_MANAGED = BIT(21), IRQD_IRQ_STARTED = BIT(22), IRQD_MANAGED_SHUTDOWN = BIT(23), IRQD_SINGLE_TARGET = BIT(24), IRQD_DEFAULT_TRIGGER_SET = BIT(25), IRQD_CAN_RESERVE = BIT(26), IRQD_HANDLE_ENFORCE_IRQCTX = BIT(27), IRQD_AFFINITY_ON_ACTIVATE = BIT(28), IRQD_IRQ_ENABLED_ON_SUSPEND = BIT(29), IRQD_RESEND_WHEN_IN_PROGRESS = BIT(30), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline bool irqd_is_setaffinity_pending(struct irq_data *d) { return __irqd_to_state(d) & IRQD_SETAFFINITY_PENDING; } static inline bool irqd_is_per_cpu(struct irq_data *d) { return __irqd_to_state(d) & IRQD_PER_CPU; } static inline bool irqd_can_balance(struct irq_data *d) { return !(__irqd_to_state(d) & (IRQD_PER_CPU | IRQD_NO_BALANCING)); } static inline bool irqd_affinity_was_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_SET; } static inline void irqd_mark_affinity_was_set(struct irq_data *d) { __irqd_to_state(d) |= IRQD_AFFINITY_SET; } static inline bool irqd_trigger_type_was_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_DEFAULT_TRIGGER_SET; } static inline u32 irqd_get_trigger_type(struct irq_data *d) { return __irqd_to_state(d) & IRQD_TRIGGER_MASK; } /* * Must only be called inside irq_chip.irq_set_type() functions or * from the DT/ACPI setup code. */ static inline void irqd_set_trigger_type(struct irq_data *d, u32 type) { __irqd_to_state(d) &= ~IRQD_TRIGGER_MASK; __irqd_to_state(d) |= type & IRQD_TRIGGER_MASK; __irqd_to_state(d) |= IRQD_DEFAULT_TRIGGER_SET; } static inline bool irqd_is_level_type(struct irq_data *d) { return __irqd_to_state(d) & IRQD_LEVEL; } /* * Must only be called from irqchip.irq_set_affinity() or low level * hierarchy domain allocation functions.
*/ static inline void irqd_set_single_target(struct irq_data *d) { __irqd_to_state(d) |= IRQD_SINGLE_TARGET; } static inline bool irqd_is_single_target(struct irq_data *d) { return __irqd_to_state(d) & IRQD_SINGLE_TARGET; } static inline void irqd_set_handle_enforce_irqctx(struct irq_data *d) { __irqd_to_state(d) |= IRQD_HANDLE_ENFORCE_IRQCTX; } static inline bool irqd_is_handle_enforce_irqctx(struct irq_data *d) { return __irqd_to_state(d) & IRQD_HANDLE_ENFORCE_IRQCTX; } static inline bool irqd_is_enabled_on_suspend(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_ENABLED_ON_SUSPEND; } static inline bool irqd_is_wakeup_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_WAKEUP_STATE; } static inline bool irqd_can_move_in_process_context(struct irq_data *d) { return __irqd_to_state(d) & IRQD_MOVE_PCNTXT; } static inline bool irqd_irq_disabled(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_DISABLED; } static inline bool irqd_irq_masked(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_MASKED; } static inline bool irqd_irq_inprogress(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_INPROGRESS; } static inline bool irqd_is_wakeup_armed(struct irq_data *d) { return __irqd_to_state(d) & IRQD_WAKEUP_ARMED; } static inline bool irqd_is_forwarded_to_vcpu(struct irq_data *d) { return __irqd_to_state(d) & IRQD_FORWARDED_TO_VCPU; } static inline void irqd_set_forwarded_to_vcpu(struct irq_data *d) { __irqd_to_state(d) |= IRQD_FORWARDED_TO_VCPU; } static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; } static inline bool irqd_affinity_is_managed(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_MANAGED; } static inline bool irqd_is_activated(struct irq_data *d) { return __irqd_to_state(d) & IRQD_ACTIVATED; } static inline void irqd_set_activated(struct irq_data *d) { __irqd_to_state(d) |= IRQD_ACTIVATED; } static inline void irqd_clr_activated(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_ACTIVATED; } static inline bool irqd_is_started(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_STARTED; } static inline bool irqd_is_managed_and_shutdown(struct irq_data *d) { return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN; } static inline void irqd_set_can_reserve(struct irq_data *d) { __irqd_to_state(d) |= IRQD_CAN_RESERVE; } static inline void irqd_clr_can_reserve(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_CAN_RESERVE; } static inline bool irqd_can_reserve(struct irq_data *d) { return __irqd_to_state(d) & IRQD_CAN_RESERVE; } static inline void irqd_set_affinity_on_activate(struct irq_data *d) { __irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE; } static inline bool irqd_affinity_on_activate(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_ON_ACTIVATE; } static inline void irqd_set_resend_when_in_progress(struct irq_data *d) { __irqd_to_state(d) |= IRQD_RESEND_WHEN_IN_PROGRESS; } static inline bool irqd_needs_resend_when_in_progress(struct irq_data *d) { return __irqd_to_state(d) & IRQD_RESEND_WHEN_IN_PROGRESS; } #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) { return d->hwirq; } /** * struct irq_chip - hardware interrupt chip descriptor * * @name: name for /proc/interrupts * @irq_startup: start up the interrupt (defaults to ->enable if NULL) * @irq_shutdown: shut down the interrupt (defaults to ->disable if NULL) * @irq_enable: enable the interrupt (defaults to chip->unmask if NULL) * @irq_disable: disable the 
interrupt * @irq_ack: start of a new interrupt * @irq_mask: mask an interrupt source * @irq_mask_ack: ack and mask an interrupt source * @irq_unmask: unmask an interrupt source * @irq_eoi: end of interrupt * @irq_set_affinity: Set the CPU affinity on SMP machines. If the force * argument is true, it tells the driver to * unconditionally apply the affinity setting. Sanity * checks against the supplied affinity mask are not * required. This is used for CPU hotplug where the * target CPU is not yet set in the cpu_online_mask. * @irq_retrigger: resend an IRQ to the CPU * @irq_set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ * @irq_set_wake: enable/disable power-management wake-on of an IRQ * @irq_bus_lock: function to lock access to slow bus (i2c) chips * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips * @irq_cpu_online: configure an interrupt source for a secondary CPU * @irq_cpu_offline: un-configure an interrupt source for a secondary CPU * @irq_suspend: function called from core code on suspend once per * chip, when one or more interrupts are installed * @irq_resume: function called from core code on resume once per chip, * when one or more interrupts are installed * @irq_pm_shutdown: function called from core code on shutdown once per chip * @irq_calc_mask: Optional function to set irq_data.mask for special cases * @irq_print_chip: optional to print special chip info in show_interrupts * @irq_request_resources: optional to request resources before calling * any other callback related to this irq * @irq_release_resources: optional to release resources acquired with * irq_request_resources * @irq_compose_msi_msg: optional to compose message content for MSI * @irq_write_msi_msg: optional to write message content for MSI * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of an interrupt * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine * @ipi_send_single: send a single IPI to destination cpus * @ipi_send_mask: send an IPI to destination cpus in cpumask * @irq_nmi_setup: function called from core code before enabling an NMI * @irq_nmi_teardown: function called from core code after disabling an NMI * @flags: chip specific flags */ struct irq_chip { const char *name; unsigned int (*irq_startup)(struct irq_data *data); void (*irq_shutdown)(struct irq_data *data); void (*irq_enable)(struct irq_data *data); void (*irq_disable)(struct irq_data *data); void (*irq_ack)(struct irq_data *data); void (*irq_mask)(struct irq_data *data); void (*irq_mask_ack)(struct irq_data *data); void (*irq_unmask)(struct irq_data *data); void (*irq_eoi)(struct irq_data *data); int (*irq_set_affinity)(struct irq_data *data, const struct cpumask *dest, bool force); int (*irq_retrigger)(struct irq_data *data); int (*irq_set_type)(struct irq_data *data, unsigned int flow_type); int (*irq_set_wake)(struct irq_data *data, unsigned int on); void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); #ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE void (*irq_cpu_online)(struct irq_data *data); void (*irq_cpu_offline)(struct irq_data *data); #endif void (*irq_suspend)(struct irq_data *data); void (*irq_resume)(struct irq_data *data); void (*irq_pm_shutdown)(struct irq_data *data); void (*irq_calc_mask)(struct irq_data *data); void (*irq_print_chip)(struct irq_data *data, struct seq_file *p); int (*irq_request_resources)(struct irq_data *data); void
(*irq_release_resources)(struct irq_data *data); void (*irq_compose_msi_msg)(struct irq_data *data, struct msi_msg *msg); void (*irq_write_msi_msg)(struct irq_data *data, struct msi_msg *msg); int (*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state); int (*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state); int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info); void (*ipi_send_single)(struct irq_data *data, unsigned int cpu); void (*ipi_send_mask)(struct irq_data *data, const struct cpumask *dest); int (*irq_nmi_setup)(struct irq_data *data); void (*irq_nmi_teardown)(struct irq_data *data); unsigned long flags; }; /* * irq_chip specific flags * * IRQCHIP_SET_TYPE_MASKED: Mask before calling chip.irq_set_type() * IRQCHIP_EOI_IF_HANDLED: Only issue irq_eoi() when irq was handled * IRQCHIP_MASK_ON_SUSPEND: Mask non wake irqs in the suspend path * IRQCHIP_ONOFFLINE_ENABLED: Only call irq_on/off_line callbacks * when irq enabled * IRQCHIP_SKIP_SET_WAKE: Skip chip.irq_set_wake(), for this irq chip * IRQCHIP_ONESHOT_SAFE: One shot does not require mask/unmask * IRQCHIP_EOI_THREADED: Chip requires eoi() on unmask in threaded mode * IRQCHIP_SUPPORTS_LEVEL_MSI: Chip can provide two doorbells for Level MSIs * IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips * IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs * in the suspend path if they are in disabled state * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup * IRQCHIP_IMMUTABLE: Don't ever change anything in this chip */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), IRQCHIP_EOI_IF_HANDLED = (1 << 1), IRQCHIP_MASK_ON_SUSPEND = (1 << 2), IRQCHIP_ONOFFLINE_ENABLED = (1 << 3), IRQCHIP_SKIP_SET_WAKE = (1 << 4), IRQCHIP_ONESHOT_SAFE = (1 << 5), IRQCHIP_EOI_THREADED = (1 << 6), IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7), IRQCHIP_SUPPORTS_NMI = (1 << 8), IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9), IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), IRQCHIP_IMMUTABLE = (1 << 11), }; #include <linux/irqdesc.h> /* * Pick up the arch-dependent methods: */ #include <asm/hw_irq.h> #ifndef NR_IRQS_LEGACY # define NR_IRQS_LEGACY 0 #endif #ifndef ARCH_IRQ_INIT_FLAGS # define ARCH_IRQ_INIT_FLAGS 0 #endif #define IRQ_DEFAULT_INIT_FLAGS ARCH_IRQ_INIT_FLAGS struct irqaction; extern int setup_percpu_irq(unsigned int irq, struct irqaction *new); extern void remove_percpu_irq(unsigned int irq, struct irqaction *act); #ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE extern void irq_cpu_online(void); extern void irq_cpu_offline(void); #endif extern int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask, bool force); extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_IRQ_MIGRATION) extern void irq_migrate_all_off_this_cpu(void); extern int irq_affinity_online_cpu(unsigned int cpu); #else # define irq_affinity_online_cpu NULL #endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void __irq_move_irq(struct irq_data *data); static inline void irq_move_irq(struct irq_data *data) { if (unlikely(irqd_is_setaffinity_pending(data))) __irq_move_irq(data); } void irq_move_masked_irq(struct irq_data *data); void irq_force_complete_move(struct irq_desc *desc); #else static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } static inline void irq_force_complete_move(struct 
irq_desc *desc) { } #endif extern int no_irq_affinity; #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq); #else static inline int irq_set_parent(int irq, int parent_irq) { return 0; } #endif /* * Built-in IRQ handlers for various IRQ types, * callable via desc->handle_irq() */ extern void handle_level_irq(struct irq_desc *desc); extern void handle_fasteoi_irq(struct irq_desc *desc); extern void handle_edge_irq(struct irq_desc *desc); extern void handle_edge_eoi_irq(struct irq_desc *desc); extern void handle_simple_irq(struct irq_desc *desc); extern void handle_untracked_irq(struct irq_desc *desc); extern void handle_percpu_irq(struct irq_desc *desc); extern void handle_percpu_devid_irq(struct irq_desc *desc); extern void handle_bad_irq(struct irq_desc *desc); extern void handle_nested_irq(unsigned int irq); extern void handle_fasteoi_nmi(struct irq_desc *desc); extern void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc); extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); extern int irq_chip_pm_get(struct irq_data *data); extern int irq_chip_pm_put(struct irq_data *data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY extern void handle_fasteoi_ack_irq(struct irq_desc *desc); extern void handle_fasteoi_mask_irq(struct irq_desc *desc); extern int irq_chip_set_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool val); extern int irq_chip_get_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); extern int irq_chip_retrigger_hierarchy(struct irq_data *data); extern void irq_chip_mask_parent(struct irq_data *data); extern void irq_chip_mask_ack_parent(struct irq_data *data); extern void irq_chip_unmask_parent(struct irq_data *data); extern void irq_chip_eoi_parent(struct irq_data *data); extern int irq_chip_set_affinity_parent(struct irq_data *data, const struct cpumask *dest, bool force); extern int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on); extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info); extern int irq_chip_set_type_parent(struct irq_data *data, unsigned int type); extern int irq_chip_request_resources_parent(struct irq_data *data); extern void irq_chip_release_resources_parent(struct irq_data *data); #endif /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret); /* Enable/disable irq debugging output: */ extern int noirqdebug_setup(char *str); /* Checks whether the interrupt can be requested by request_irq(): */ extern int can_request_irq(unsigned int irq, unsigned long irqflags); /* Dummy irq-chip implementations: */ extern struct irq_chip no_irq_chip; extern struct irq_chip dummy_irq_chip; extern void irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle, const char *name); static inline void irq_set_chip_and_handler(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle) { irq_set_chip_and_handler_name(irq, chip, handle, NULL); } extern int irq_set_percpu_devid(unsigned int irq); extern int irq_set_percpu_devid_partition(unsigned int irq, const struct cpumask *affinity); extern int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity); extern void __irq_set_handler(unsigned int irq, irq_flow_handler_t 
handle, int is_chained, const char *name); static inline void irq_set_handler(unsigned int irq, irq_flow_handler_t handle) { __irq_set_handler(irq, handle, 0, NULL); } /* * Set a highlevel chained flow handler for a given IRQ. * (a chained handler is automatically enabled and set to * IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD) */ static inline void irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle) { __irq_set_handler(irq, handle, 1, NULL); } /* * Set a highlevel chained flow handler and its data for a given IRQ. * (a chained handler is automatically enabled and set to * IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD) */ void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, void *data); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set); static inline void irq_set_status_flags(unsigned int irq, unsigned long set) { irq_modify_status(irq, 0, set); } static inline void irq_clear_status_flags(unsigned int irq, unsigned long clr) { irq_modify_status(irq, clr, 0); } static inline void irq_set_noprobe(unsigned int irq) { irq_modify_status(irq, 0, IRQ_NOPROBE); } static inline void irq_set_probe(unsigned int irq) { irq_modify_status(irq, IRQ_NOPROBE, 0); } static inline void irq_set_nothread(unsigned int irq) { irq_modify_status(irq, 0, IRQ_NOTHREAD); } static inline void irq_set_thread(unsigned int irq) { irq_modify_status(irq, IRQ_NOTHREAD, 0); } static inline void irq_set_nested_thread(unsigned int irq, bool nest) { if (nest) irq_set_status_flags(irq, IRQ_NESTED_THREAD); else irq_clear_status_flags(irq, IRQ_NESTED_THREAD); } static inline void irq_set_percpu_devid_flags(unsigned int irq) { irq_set_status_flags(irq, IRQ_NOAUTOEN | IRQ_PER_CPU | IRQ_NOTHREAD | IRQ_NOPROBE | IRQ_PER_CPU_DEVID); } /* Set/get chip/data for an IRQ: */ extern int irq_set_chip(unsigned int irq, const struct irq_chip *chip); extern int irq_set_handler_data(unsigned int irq, void *data); extern int irq_set_chip_data(unsigned int irq, void *data); extern int irq_set_irq_type(unsigned int irq, unsigned int type); extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry); extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry); extern struct irq_data *irq_get_irq_data(unsigned int irq); static inline struct irq_chip *irq_get_chip(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip : NULL; } static inline struct irq_chip *irq_data_get_irq_chip(struct irq_data *d) { return d->chip; } static inline void *irq_get_chip_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip_data : NULL; } static inline void *irq_data_get_irq_chip_data(struct irq_data *d) { return d->chip_data; } static inline void *irq_get_handler_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->common->handler_data : NULL; } static inline void *irq_data_get_irq_handler_data(struct irq_data *d) { return d->common->handler_data; } static inline struct msi_desc *irq_get_msi_desc(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->common->msi_desc : NULL; } static inline struct msi_desc *irq_data_get_msi_desc(struct irq_data *d) { return d->common->msi_desc; } static inline u32 irq_get_trigger_type(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? 
irqd_get_trigger_type(d) : 0; } static inline int irq_common_data_get_node(struct irq_common_data *d) { #ifdef CONFIG_NUMA return d->node; #else return 0; #endif } static inline int irq_data_get_node(struct irq_data *d) { return irq_common_data_get_node(d->common); } static inline const struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) { #ifdef CONFIG_SMP return d->common->affinity; #else return cpumask_of(0); #endif } static inline void irq_data_update_affinity(struct irq_data *d, const struct cpumask *m) { #ifdef CONFIG_SMP cpumask_copy(d->common->affinity, m); #endif } static inline const struct cpumask *irq_get_affinity_mask(int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? irq_data_get_affinity_mask(d) : NULL; } #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static inline const struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { return d->common->effective_affinity; } static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) { cpumask_copy(d->common->effective_affinity, m); } #else static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) { } static inline const struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { return irq_data_get_affinity_mask(d); } #endif static inline const struct cpumask *irq_get_effective_affinity_mask(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? irq_data_get_effective_affinity_mask(d) : NULL; } unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity); int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity); /* use macros to avoid needing export.h for THIS_MODULE */ #define irq_alloc_descs(irq, from, cnt, node) \ __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE, NULL) #define irq_alloc_desc(node) \ irq_alloc_descs(-1, 1, 1, node) #define irq_alloc_desc_at(at, node) \ irq_alloc_descs(at, at, 1, node) #define irq_alloc_desc_from(from, node) \ irq_alloc_descs(-1, from, 1, node) #define irq_alloc_descs_from(from, cnt, node) \ irq_alloc_descs(-1, from, cnt, node) #define devm_irq_alloc_descs(dev, irq, from, cnt, node) \ __devm_irq_alloc_descs(dev, irq, from, cnt, node, THIS_MODULE, NULL) #define devm_irq_alloc_desc(dev, node) \ devm_irq_alloc_descs(dev, -1, 1, 1, node) #define devm_irq_alloc_desc_at(dev, at, node) \ devm_irq_alloc_descs(dev, at, at, 1, node) #define devm_irq_alloc_desc_from(dev, from, node) \ devm_irq_alloc_descs(dev, -1, from, 1, node) #define devm_irq_alloc_descs_from(dev, from, cnt, node) \ devm_irq_alloc_descs(dev, -1, from, cnt, node) void irq_free_descs(unsigned int irq, unsigned int cnt); static inline void irq_free_desc(unsigned int irq) { irq_free_descs(irq, 1); } #ifdef CONFIG_GENERIC_IRQ_LEGACY void irq_init_desc(unsigned int irq); #endif /** * struct irq_chip_regs - register offsets for struct irq_gci * @enable: Enable register offset to reg_base * @disable: Disable register offset to reg_base * @mask: Mask register offset to reg_base * @ack: Ack register offset to reg_base * @eoi: Eoi register offset to reg_base * @type: Type configuration register offset to reg_base * @polarity: Polarity configuration register offset to reg_base */ struct irq_chip_regs { unsigned long enable; unsigned long disable; 
unsigned long mask; unsigned long ack; unsigned long eoi; unsigned long type; unsigned long polarity; }; /** * struct irq_chip_type - Generic interrupt chip instance for a flow type * @chip: The real interrupt chip which provides the callbacks * @regs: Register offsets for this chip * @handler: Flow handler associated with this chip * @type: Chip can handle these flow types * @mask_cache_priv: Cached mask register private to the chip type * @mask_cache: Pointer to cached mask register * * An irq_generic_chip can have several instances of irq_chip_type when * it requires different functions and register offsets for different * flow types. */ struct irq_chip_type { struct irq_chip chip; struct irq_chip_regs regs; irq_flow_handler_t handler; u32 type; u32 mask_cache_priv; u32 *mask_cache; }; /** * struct irq_chip_generic - Generic irq chip data structure * @lock: Lock to protect register and cache data access * @reg_base: Register base address (virtual) * @reg_readl: Alternate I/O accessor (defaults to readl if NULL) * @reg_writel: Alternate I/O accessor (defaults to writel if NULL) * @suspend: Function called from core code on suspend once per * chip; can be useful instead of irq_chip::suspend to * handle chip details even when no interrupts are in use * @resume: Function called from core code on resume once per chip; * can be useful instead of irq_chip::resume to handle * chip details even when no interrupts are in use * @irq_base: Interrupt base nr for this chip * @irq_cnt: Number of interrupts handled by this chip * @mask_cache: Cached mask register shared between all chip types * @type_cache: Cached type register * @polarity_cache: Cached polarity register * @wake_enabled: Interrupt can wakeup from suspend * @wake_active: Interrupt is marked as a wakeup from suspend source * @num_ct: Number of available irq_chip_type instances (usually 1) * @private: Private data for non generic chip callbacks * @installed: bitfield to denote installed interrupts * @unused: bitfield to denote unused interrupts * @domain: irq domain pointer * @list: List head for keeping track of instances * @chip_types: Array of interrupt irq_chip_types * * Note that irq_chip_generic can have multiple irq_chip_type * implementations which can be associated to a particular irq line of * an irq_chip_generic instance. That allows sharing and protecting * state in an irq_chip_generic instance when we need to implement * different flow mechanisms (level/edge) for it. */ struct irq_chip_generic { raw_spinlock_t lock; void __iomem *reg_base; u32 (*reg_readl)(void __iomem *addr); void (*reg_writel)(u32 val, void __iomem *addr); void (*suspend)(struct irq_chip_generic *gc); void (*resume)(struct irq_chip_generic *gc); unsigned int irq_base; unsigned int irq_cnt; u32 mask_cache; u32 type_cache; u32 polarity_cache; u32 wake_enabled; u32 wake_active; unsigned int num_ct; void *private; unsigned long installed; unsigned long unused; struct irq_domain *domain; struct list_head list; struct irq_chip_type chip_types[]; }; /** * enum irq_gc_flags - Initialization flags for generic irq chips * @IRQ_GC_INIT_MASK_CACHE: Initialize the mask_cache by reading mask reg * @IRQ_GC_INIT_NESTED_LOCK: Set the lock class of the irqs to nested for * irq chips which need to call irq_set_wake() on * the parent irq.
Usually GPIO implementations * @IRQ_GC_MASK_CACHE_PER_TYPE: Mask cache is chip type private * @IRQ_GC_NO_MASK: Do not calculate irq_data->mask * @IRQ_GC_BE_IO: Use big-endian register accesses (default: LE) */ enum irq_gc_flags { IRQ_GC_INIT_MASK_CACHE = 1 << 0, IRQ_GC_INIT_NESTED_LOCK = 1 << 1, IRQ_GC_MASK_CACHE_PER_TYPE = 1 << 2, IRQ_GC_NO_MASK = 1 << 3, IRQ_GC_BE_IO = 1 << 4, }; /* * struct irq_domain_chip_generic - Generic irq chip data structure for irq domains * @irqs_per_chip: Number of interrupts per chip * @num_chips: Number of chips * @irq_flags_to_set: IRQ* flags to set on irq setup * @irq_flags_to_clear: IRQ* flags to clear on irq setup * @gc_flags: Generic chip specific setup flags * @gc: Array of pointers to generic interrupt chips */ struct irq_domain_chip_generic { unsigned int irqs_per_chip; unsigned int num_chips; unsigned int irq_flags_to_clear; unsigned int irq_flags_to_set; enum irq_gc_flags gc_flags; struct irq_chip_generic *gc[]; }; /* Generic chip callback functions */ void irq_gc_noop(struct irq_data *d); void irq_gc_mask_disable_reg(struct irq_data *d); void irq_gc_mask_set_bit(struct irq_data *d); void irq_gc_mask_clr_bit(struct irq_data *d); void irq_gc_unmask_enable_reg(struct irq_data *d); void irq_gc_ack_set_bit(struct irq_data *d); void irq_gc_ack_clr_bit(struct irq_data *d); void irq_gc_mask_disable_and_ack_set(struct irq_data *d); void irq_gc_eoi(struct irq_data *d); int irq_gc_set_wake(struct irq_data *d, unsigned int on); /* Setup functions for irq_chip_generic */ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw_irq); void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq); struct irq_chip_generic * irq_alloc_generic_chip(const char *name, int nr_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, enum irq_gc_flags flags, unsigned int clr, unsigned int set); int irq_setup_alt_chip(struct irq_data *d, unsigned int type); void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set); struct irq_chip_generic * devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, u32 msk, enum irq_gc_flags flags, unsigned int clr, unsigned int set); struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, int num_ct, const char *name, irq_flow_handler_t handler, unsigned int clr, unsigned int set, enum irq_gc_flags flags); #define irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name, \ handler, clr, set, flags) \ ({ \ MAYBE_BUILD_BUG_ON(irqs_per_chip > 32); \ __irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name,\ handler, clr, set, flags); \ }) static inline void irq_free_generic_chip(struct irq_chip_generic *gc) { kfree(gc); } static inline void irq_destroy_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set) { irq_remove_generic_chip(gc, msk, clr, set); irq_free_generic_chip(gc); } static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) { return container_of(d->chip, struct irq_chip_type, chip); } #define IRQ_MSK(n) (u32)((n) < 32 ? 
((1 << (n)) - 1) : UINT_MAX) #ifdef CONFIG_SMP static inline void irq_gc_lock(struct irq_chip_generic *gc) { raw_spin_lock(&gc->lock); } static inline void irq_gc_unlock(struct irq_chip_generic *gc) { raw_spin_unlock(&gc->lock); } #else static inline void irq_gc_lock(struct irq_chip_generic *gc) { } static inline void irq_gc_unlock(struct irq_chip_generic *gc) { } #endif /* * The irqsave variants are for usage in non interrupt code. Do not use * them in irq_chip callbacks. Use irq_gc_lock() instead. */ #define irq_gc_lock_irqsave(gc, flags) \ raw_spin_lock_irqsave(&(gc)->lock, flags) #define irq_gc_unlock_irqrestore(gc, flags) \ raw_spin_unlock_irqrestore(&(gc)->lock, flags) static inline void irq_reg_writel(struct irq_chip_generic *gc, u32 val, int reg_offset) { if (gc->reg_writel) gc->reg_writel(val, gc->reg_base + reg_offset); else writel(val, gc->reg_base + reg_offset); } static inline u32 irq_reg_readl(struct irq_chip_generic *gc, int reg_offset) { if (gc->reg_readl) return gc->reg_readl(gc->reg_base + reg_offset); else return readl(gc->reg_base + reg_offset); } struct irq_matrix; struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits, unsigned int alloc_start, unsigned int alloc_end); void irq_matrix_online(struct irq_matrix *m); void irq_matrix_offline(struct irq_matrix *m); void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, bool replace); int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk); void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk); int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, unsigned int *mapped_cpu); void irq_matrix_reserve(struct irq_matrix *m); void irq_matrix_remove_reserved(struct irq_matrix *m); int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, bool reserved, unsigned int *mapped_cpu); void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, unsigned int bit, bool managed); void irq_matrix_assign(struct irq_matrix *m, unsigned int bit); unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown); unsigned int irq_matrix_allocated(struct irq_matrix *m); unsigned int irq_matrix_reserved(struct irq_matrix *m); void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind); /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); int __ipi_send_single(struct irq_desc *desc, unsigned int cpu); int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest); int ipi_send_single(unsigned int virq, unsigned int cpu); int ipi_send_mask(unsigned int virq, const struct cpumask *dest); void ipi_mux_process(void); int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu)); #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER /* * Registers a generic IRQ handling function as the top-level IRQ handler in * the system, which is generally the first C code called from an assembly * architecture-specific interrupt handler. * * Returns 0 on success, or -EBUSY if an IRQ handler has already been * registered. */ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)); /* * Allows interrupt handlers to find the irqchip that's been registered as the * top-level IRQ handler. 
*/ extern void (*handle_arch_irq)(struct pt_regs *) __ro_after_init; asmlinkage void generic_handle_arch_irq(struct pt_regs *regs); #else #ifndef set_handle_irq #define set_handle_irq(handle_irq) \ do { \ (void)handle_irq; \ WARN_ON(1); \ } while (0) #endif #endif #endif /* _LINUX_IRQ_H */
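/*
 * Editor's example (not part of include/linux/irq.h): a stripped-down cascaded
 * interrupt controller wiring together the helpers declared above
 * (irq_set_chip_and_handler(), irq_set_chip_data(), handle_level_irq,
 * irq_set_chained_handler_and_data()). The register layout, the "foo_" names
 * and the pre-allocated irq_base are assumptions for illustration only; a real
 * driver would normally also create an irq_domain for hwirq translation.
 */
#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/irq.h>
#include <linux/irqchip/chained_irq.h>

#define FOO_NR_IRQS     8
#define FOO_REG_STATUS  0x00    /* pending sources, one bit per line */
#define FOO_REG_MASK    0x04    /* 1 = masked */

struct foo_intc {
        void __iomem    *base;
        unsigned int    irq_base;       /* first Linux irq, e.g. from irq_alloc_descs() */
};

static void foo_mask_irq(struct irq_data *d)
{
        struct foo_intc *intc = irq_data_get_irq_chip_data(d);
        unsigned int bit = d->irq - intc->irq_base;

        writel(readl(intc->base + FOO_REG_MASK) | BIT(bit),
               intc->base + FOO_REG_MASK);
}

static void foo_unmask_irq(struct irq_data *d)
{
        struct foo_intc *intc = irq_data_get_irq_chip_data(d);
        unsigned int bit = d->irq - intc->irq_base;

        writel(readl(intc->base + FOO_REG_MASK) & ~BIT(bit),
               intc->base + FOO_REG_MASK);
}

static struct irq_chip foo_irq_chip = {
        .name           = "foo-intc",
        .irq_mask       = foo_mask_irq,
        .irq_unmask     = foo_unmask_irq,
};

/* Chained flow handler running off the parent interrupt */
static void foo_irq_handler(struct irq_desc *desc)
{
        struct foo_intc *intc = irq_desc_get_handler_data(desc);
        struct irq_chip *chip = irq_desc_get_chip(desc);
        unsigned long status;
        unsigned int bit;

        chained_irq_enter(chip, desc);
        status = readl(intc->base + FOO_REG_STATUS);
        for_each_set_bit(bit, &status, FOO_NR_IRQS)
                generic_handle_irq(intc->irq_base + bit);
        chained_irq_exit(chip, desc);
}

static void foo_intc_setup(struct foo_intc *intc, unsigned int parent_irq)
{
        unsigned int i;

        for (i = 0; i < FOO_NR_IRQS; i++) {
                unsigned int irq = intc->irq_base + i;

                irq_set_chip_and_handler(irq, &foo_irq_chip, handle_level_irq);
                irq_set_chip_data(irq, intc);
                irq_set_noprobe(irq);
        }
        /* Marks parent_irq IRQ_NOREQUEST/IRQ_NOPROBE/IRQ_NOTHREAD, see above */
        irq_set_chained_handler_and_data(parent_irq, foo_irq_handler, intc);
}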
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM handshake #if !defined(_TRACE_HANDSHAKE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_HANDSHAKE_H #include <linux/net.h> #include <net/tls_prot.h> #include <linux/tracepoint.h> #include <trace/events/net_probe_common.h> #define TLS_RECORD_TYPE_LIST \ record_type(CHANGE_CIPHER_SPEC) \ record_type(ALERT) \ record_type(HANDSHAKE) \ record_type(DATA) \ record_type(HEARTBEAT) \ record_type(TLS12_CID) \ record_type_end(ACK) #undef record_type #undef record_type_end #define record_type(x) TRACE_DEFINE_ENUM(TLS_RECORD_TYPE_##x); #define record_type_end(x) TRACE_DEFINE_ENUM(TLS_RECORD_TYPE_##x); TLS_RECORD_TYPE_LIST #undef record_type #undef record_type_end #define record_type(x) { TLS_RECORD_TYPE_##x, #x }, #define record_type_end(x) { TLS_RECORD_TYPE_##x, #x } #define show_tls_content_type(type) \ __print_symbolic(type, TLS_RECORD_TYPE_LIST) TRACE_DEFINE_ENUM(TLS_ALERT_LEVEL_WARNING); TRACE_DEFINE_ENUM(TLS_ALERT_LEVEL_FATAL); #define show_tls_alert_level(level) \ __print_symbolic(level, \ { TLS_ALERT_LEVEL_WARNING, "Warning" }, \ { TLS_ALERT_LEVEL_FATAL, "Fatal" }) #define TLS_ALERT_DESCRIPTION_LIST \ alert_description(CLOSE_NOTIFY) \ alert_description(UNEXPECTED_MESSAGE) \ alert_description(BAD_RECORD_MAC) \ alert_description(RECORD_OVERFLOW) \ alert_description(HANDSHAKE_FAILURE) \ alert_description(BAD_CERTIFICATE) \ alert_description(UNSUPPORTED_CERTIFICATE) \ alert_description(CERTIFICATE_REVOKED) \ alert_description(CERTIFICATE_EXPIRED) \ alert_description(CERTIFICATE_UNKNOWN) \ alert_description(ILLEGAL_PARAMETER) \ alert_description(UNKNOWN_CA) \ alert_description(ACCESS_DENIED) \ alert_description(DECODE_ERROR) \ alert_description(DECRYPT_ERROR) \ alert_description(TOO_MANY_CIDS_REQUESTED) \ alert_description(PROTOCOL_VERSION) \ alert_description(INSUFFICIENT_SECURITY) \ alert_description(INTERNAL_ERROR) \ alert_description(INAPPROPRIATE_FALLBACK) \ alert_description(USER_CANCELED) \ alert_description(MISSING_EXTENSION) \ alert_description(UNSUPPORTED_EXTENSION) \ alert_description(UNRECOGNIZED_NAME) \ alert_description(BAD_CERTIFICATE_STATUS_RESPONSE) \ alert_description(UNKNOWN_PSK_IDENTITY) \ alert_description(CERTIFICATE_REQUIRED) \ alert_description_end(NO_APPLICATION_PROTOCOL) #undef alert_description #undef alert_description_end #define
alert_description(x) TRACE_DEFINE_ENUM(TLS_ALERT_DESC_##x); #define alert_description_end(x) TRACE_DEFINE_ENUM(TLS_ALERT_DESC_##x); TLS_ALERT_DESCRIPTION_LIST #undef alert_description #undef alert_description_end #define alert_description(x) { TLS_ALERT_DESC_##x, #x }, #define alert_description_end(x) { TLS_ALERT_DESC_##x, #x } #define show_tls_alert_description(desc) \ __print_symbolic(desc, TLS_ALERT_DESCRIPTION_LIST) DECLARE_EVENT_CLASS(handshake_event_class, TP_PROTO( const struct net *net, const struct handshake_req *req, const struct sock *sk ), TP_ARGS(net, req, sk), TP_STRUCT__entry( __field(const void *, req) __field(const void *, sk) __field(unsigned int, netns_ino) ), TP_fast_assign( __entry->req = req; __entry->sk = sk; __entry->netns_ino = net->ns.inum; ), TP_printk("req=%p sk=%p", __entry->req, __entry->sk ) ); #define DEFINE_HANDSHAKE_EVENT(name) \ DEFINE_EVENT(handshake_event_class, name, \ TP_PROTO( \ const struct net *net, \ const struct handshake_req *req, \ const struct sock *sk \ ), \ TP_ARGS(net, req, sk)) DECLARE_EVENT_CLASS(handshake_fd_class, TP_PROTO( const struct net *net, const struct handshake_req *req, const struct sock *sk, int fd ), TP_ARGS(net, req, sk, fd), TP_STRUCT__entry( __field(const void *, req) __field(const void *, sk) __field(int, fd) __field(unsigned int, netns_ino) ), TP_fast_assign( __entry->req = req; __entry->sk = req->hr_sk; __entry->fd = fd; __entry->netns_ino = net->ns.inum; ), TP_printk("req=%p sk=%p fd=%d", __entry->req, __entry->sk, __entry->fd ) ); #define DEFINE_HANDSHAKE_FD_EVENT(name) \ DEFINE_EVENT(handshake_fd_class, name, \ TP_PROTO( \ const struct net *net, \ const struct handshake_req *req, \ const struct sock *sk, \ int fd \ ), \ TP_ARGS(net, req, sk, fd)) DECLARE_EVENT_CLASS(handshake_error_class, TP_PROTO( const struct net *net, const struct handshake_req *req, const struct sock *sk, int err ), TP_ARGS(net, req, sk, err), TP_STRUCT__entry( __field(const void *, req) __field(const void *, sk) __field(int, err) __field(unsigned int, netns_ino) ), TP_fast_assign( __entry->req = req; __entry->sk = sk; __entry->err = err; __entry->netns_ino = net->ns.inum; ), TP_printk("req=%p sk=%p err=%d", __entry->req, __entry->sk, __entry->err ) ); #define DEFINE_HANDSHAKE_ERROR(name) \ DEFINE_EVENT(handshake_error_class, name, \ TP_PROTO( \ const struct net *net, \ const struct handshake_req *req, \ const struct sock *sk, \ int err \ ), \ TP_ARGS(net, req, sk, err)) DECLARE_EVENT_CLASS(handshake_alert_class, TP_PROTO( const struct sock *sk, unsigned char level, unsigned char description ), TP_ARGS(sk, level, description), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(unsigned int, netns_ino) __field(unsigned long, level) __field(unsigned long, description) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); __entry->netns_ino = sock_net(sk)->ns.inum; __entry->level = level; __entry->description = description; ), TP_printk("src=%pISpc dest=%pISpc %s: %s", __entry->saddr, __entry->daddr, show_tls_alert_level(__entry->level), show_tls_alert_description(__entry->description) ) ); #define DEFINE_HANDSHAKE_ALERT(name) \ DEFINE_EVENT(handshake_alert_class, name, \ TP_PROTO( \ const struct sock *sk, \ unsigned char level, \ unsigned char description \ ), \ 
TP_ARGS(sk, level, description)) /* * Request lifetime events */ DEFINE_HANDSHAKE_EVENT(handshake_submit); DEFINE_HANDSHAKE_ERROR(handshake_submit_err); DEFINE_HANDSHAKE_EVENT(handshake_cancel); DEFINE_HANDSHAKE_EVENT(handshake_cancel_none); DEFINE_HANDSHAKE_EVENT(handshake_cancel_busy); DEFINE_HANDSHAKE_EVENT(handshake_destruct); TRACE_EVENT(handshake_complete, TP_PROTO( const struct net *net, const struct handshake_req *req, const struct sock *sk, int status ), TP_ARGS(net, req, sk, status), TP_STRUCT__entry( __field(const void *, req) __field(const void *, sk) __field(int, status) __field(unsigned int, netns_ino) ), TP_fast_assign( __entry->req = req; __entry->sk = sk; __entry->status = status; __entry->netns_ino = net->ns.inum; ), TP_printk("req=%p sk=%p status=%d", __entry->req, __entry->sk, __entry->status ) ); /* * Netlink events */ DEFINE_HANDSHAKE_ERROR(handshake_notify_err); DEFINE_HANDSHAKE_FD_EVENT(handshake_cmd_accept); DEFINE_HANDSHAKE_ERROR(handshake_cmd_accept_err); DEFINE_HANDSHAKE_FD_EVENT(handshake_cmd_done); DEFINE_HANDSHAKE_ERROR(handshake_cmd_done_err); /* * TLS Record events */ TRACE_EVENT(tls_contenttype, TP_PROTO( const struct sock *sk, unsigned char type ), TP_ARGS(sk, type), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(unsigned int, netns_ino) __field(unsigned long, type) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); __entry->netns_ino = sock_net(sk)->ns.inum; __entry->type = type; ), TP_printk("src=%pISpc dest=%pISpc %s", __entry->saddr, __entry->daddr, show_tls_content_type(__entry->type) ) ); /* * TLS Alert events */ DEFINE_HANDSHAKE_ALERT(tls_alert_send); DEFINE_HANDSHAKE_ALERT(tls_alert_recv); #endif /* _TRACE_HANDSHAKE_H */ #include <trace/define_trace.h>
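/*
 * Editor's example (not part of net/handshake): how a caller would emit the
 * request lifetime events defined above. The trace_handshake_submit() and
 * trace_handshake_submit_err() wrappers are generated by
 * DEFINE_HANDSHAKE_EVENT()/DEFINE_HANDSHAKE_ERROR(); the surrounding function,
 * foo_queue_request() and the "handshake.h" include are assumptions for
 * illustration. Exactly one .c file defines CREATE_TRACE_POINTS before
 * including the trace header; at run time the events show up under the
 * "handshake" subsystem in tracefs (e.g. /sys/kernel/tracing/events/handshake/).
 */
#include <net/sock.h>

#include "handshake.h"  /* assumed private header providing struct handshake_req */

#define CREATE_TRACE_POINTS
#include <trace/events/handshake.h>

extern int foo_queue_request(struct handshake_req *req);       /* hypothetical helper */

static int foo_submit_handshake(struct net *net, struct handshake_req *req,
                                struct sock *sk)
{
        int err;

        /* Lifetime event from handshake_event_class above */
        trace_handshake_submit(net, req, sk);

        err = foo_queue_request(req);
        if (err)
                /* Error event from handshake_error_class above */
                trace_handshake_submit_err(net, req, sk, err);
        return err;
}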
// SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/mac.c * * Code extracted from drivers/block/genhd.c * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King */ #include <linux/ctype.h> #include "check.h" #include "mac.h" #ifdef CONFIG_PPC_PMAC #include <asm/machdep.h> extern void note_bootable_part(dev_t dev, int part, int goodness); #endif /* * Code to understand MacOS partition tables. */ static inline void mac_fix_string(char *stg, int len) { int i; for (i = len - 1; i >= 0 && stg[i] == ' '; i--) stg[i] = 0; } int mac_partition(struct parsed_partitions *state) { Sector sect; unsigned char *data; int slot, blocks_in_map; unsigned secsize, datasize, partoffset; #ifdef CONFIG_PPC_PMAC int found_root = 0; int found_root_goodness = 0; #endif struct mac_partition *part; struct mac_driver_desc *md; /* Get 0th block and look at the first partition map entry. */ md = read_part_sector(state, 0, &sect); if (!md) return -1; if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { put_dev_sector(sect); return 0; } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); datasize = round_down(secsize, 512); data = read_part_sector(state, datasize / 512, &sect); if (!data) return -1; partoffset = secsize % 512; if (partoffset + sizeof(*part) > datasize) return -1; part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); return 0; /* not a MacOS disk */ } blocks_in_map = be32_to_cpu(part->map_count); if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { put_dev_sector(sect); return 0; } if (blocks_in_map >= state->limit) blocks_in_map = state->limit - 1; strlcat(state->pp_buf, " [mac]", PAGE_SIZE); for (slot = 1; slot <= blocks_in_map; ++slot) { int pos = slot * secsize; put_dev_sector(sect); data = read_part_sector(state, pos/512, &sect); if (!data) return -1; part = (struct mac_partition *) (data + pos%512); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) break; put_partition(state, slot, be32_to_cpu(part->start_block) * (secsize/512), be32_to_cpu(part->block_count) * (secsize/512)); if (!strncasecmp(part->type, "Linux_RAID", 10)) state->parts[slot].flags = ADDPART_FLAG_RAID; #ifdef CONFIG_PPC_PMAC /* * If this is the first bootable partition, tell the * setup code, in case it wants to make this the root.
*/ if (machine_is(powermac)) { int goodness = 0; mac_fix_string(part->processor, 16); mac_fix_string(part->name, 32); mac_fix_string(part->type, 32); if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE) && strcasecmp(part->processor, "powerpc") == 0) goodness++; if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0 || (strncasecmp(part->type, "Linux", 5) == 0 && strcasecmp(part->type, "Linux_swap") != 0)) { int i, l; goodness++; l = strlen(part->name); if (strcmp(part->name, "/") == 0) goodness++; for (i = 0; i <= l - 4; ++i) { if (strncasecmp(part->name + i, "root", 4) == 0) { goodness += 2; break; } } if (strncasecmp(part->name, "swap", 4) == 0) goodness--; } if (goodness > found_root_goodness) { found_root = slot; found_root_goodness = goodness; } } #endif /* CONFIG_PPC_PMAC */ } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) note_bootable_part(state->disk->part0->bd_dev, found_root, found_root_goodness); #endif put_dev_sector(sect); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; }
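/*
 * Editor's example (illustrative, abridged): parsers such as mac_partition()
 * above are not called directly; the generic partition-scanning code walks a
 * table of parser callbacks until one of them recognises the disk. The table
 * and loop below are a simplified sketch of that hookup (see
 * block/partitions/core.c for the real table and iteration logic); the "foo_"
 * names are hypothetical. A parser returns 1 when it parsed the disk, 0 when
 * the on-disk format is not its own, and a negative value on an I/O error.
 */
#include "check.h"
#include "mac.h"

typedef int (*foo_check_part_fn)(struct parsed_partitions *state);

static const foo_check_part_fn foo_check_part[] = {
#ifdef CONFIG_MAC_PARTITION
        mac_partition,
#endif
        /* ... other partition format parsers ... */
        NULL,
};

static int foo_scan_partitions(struct parsed_partitions *state)
{
        int i, res = 0, err = 0;

        for (i = 0; foo_check_part[i] && !res; i++) {
                res = foo_check_part[i](state);
                if (res < 0) {
                        /* remember the I/O error, let the other parsers try */
                        err = res;
                        res = 0;
                }
        }
        return res ? res : err;
}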
// SPDX-License-Identifier: GPL-2.0-only #include <linux/fs.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/posix_acl_xattr.h> #include <linux/seq_file.h> #include <linux/xattr.h> #include "overlayfs.h" #include "params.h" static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR); module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644); MODULE_PARM_DESC(redirect_dir, "Default to on or off for the redirect_dir feature"); static bool ovl_redirect_always_follow = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW); module_param_named(redirect_always_follow, ovl_redirect_always_follow, bool, 0644); MODULE_PARM_DESC(redirect_always_follow, "Follow redirects even if redirect_dir feature is turned off"); static bool ovl_xino_auto_def = IS_ENABLED(CONFIG_OVERLAY_FS_XINO_AUTO); module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644); MODULE_PARM_DESC(xino_auto, "Auto enable xino feature"); static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX); module_param_named(index, ovl_index_def, bool, 0644); MODULE_PARM_DESC(index, "Default to on or off for the inodes index feature"); static bool ovl_nfs_export_def = IS_ENABLED(CONFIG_OVERLAY_FS_NFS_EXPORT); module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644); MODULE_PARM_DESC(nfs_export, "Default to on or off for the NFS export feature"); static bool ovl_metacopy_def = IS_ENABLED(CONFIG_OVERLAY_FS_METACOPY); module_param_named(metacopy, ovl_metacopy_def, bool, 0644); MODULE_PARM_DESC(metacopy, "Default to on or off for the metadata only copy up feature"); enum ovl_opt { Opt_lowerdir, Opt_lowerdir_add, Opt_datadir_add, Opt_upperdir, Opt_workdir, Opt_default_permissions, Opt_redirect_dir, Opt_index, Opt_uuid, Opt_nfs_export, Opt_userxattr, Opt_xino, Opt_metacopy, Opt_verity, Opt_volatile, }; static const struct constant_table ovl_parameter_bool[] = { { "on", true }, { "off", false }, {} }; static const struct constant_table ovl_parameter_uuid[] = { { "off", OVL_UUID_OFF }, { "null", OVL_UUID_NULL }, { "auto", OVL_UUID_AUTO }, { "on", OVL_UUID_ON }, {} }; static const char *ovl_uuid_mode(struct ovl_config *config) { return ovl_parameter_uuid[config->uuid].name; } static int ovl_uuid_def(void) { return OVL_UUID_AUTO; } static const struct constant_table ovl_parameter_xino[] = { { "off", OVL_XINO_OFF }, { "auto", OVL_XINO_AUTO }, { "on", OVL_XINO_ON }, {} }; const char *ovl_xino_mode(struct ovl_config *config) { return ovl_parameter_xino[config->xino].name; } static int ovl_xino_def(void) { return ovl_xino_auto_def ?
OVL_XINO_AUTO : OVL_XINO_OFF; } const struct constant_table ovl_parameter_redirect_dir[] = { { "off", OVL_REDIRECT_OFF }, { "follow", OVL_REDIRECT_FOLLOW }, { "nofollow", OVL_REDIRECT_NOFOLLOW }, { "on", OVL_REDIRECT_ON }, {} }; static const char *ovl_redirect_mode(struct ovl_config *config) { return ovl_parameter_redirect_dir[config->redirect_mode].name; } static int ovl_redirect_mode_def(void) { return ovl_redirect_dir_def ? OVL_REDIRECT_ON : ovl_redirect_always_follow ? OVL_REDIRECT_FOLLOW : OVL_REDIRECT_NOFOLLOW; } static const struct constant_table ovl_parameter_verity[] = { { "off", OVL_VERITY_OFF }, { "on", OVL_VERITY_ON }, { "require", OVL_VERITY_REQUIRE }, {} }; static const char *ovl_verity_mode(struct ovl_config *config) { return ovl_parameter_verity[config->verity_mode].name; } static int ovl_verity_mode_def(void) { return OVL_VERITY_OFF; } #define fsparam_string_empty(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_string_empty("lowerdir", Opt_lowerdir), fsparam_string("lowerdir+", Opt_lowerdir_add), fsparam_string("datadir+", Opt_datadir_add), fsparam_string("upperdir", Opt_upperdir), fsparam_string("workdir", Opt_workdir), fsparam_flag("default_permissions", Opt_default_permissions), fsparam_enum("redirect_dir", Opt_redirect_dir, ovl_parameter_redirect_dir), fsparam_enum("index", Opt_index, ovl_parameter_bool), fsparam_enum("uuid", Opt_uuid, ovl_parameter_uuid), fsparam_enum("nfs_export", Opt_nfs_export, ovl_parameter_bool), fsparam_flag("userxattr", Opt_userxattr), fsparam_enum("xino", Opt_xino, ovl_parameter_xino), fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool), fsparam_enum("verity", Opt_verity, ovl_parameter_verity), fsparam_flag("volatile", Opt_volatile), {} }; static char *ovl_next_opt(char **s) { char *sbegin = *s; char *p; if (sbegin == NULL) return NULL; for (p = sbegin; *p; p++) { if (*p == '\\') { p++; if (!*p) break; } else if (*p == ',') { *p = '\0'; *s = p + 1; return sbegin; } } *s = NULL; return sbegin; } static int ovl_parse_monolithic(struct fs_context *fc, void *data) { return vfs_parse_monolithic_sep(fc, data, ovl_next_opt); } static ssize_t ovl_parse_param_split_lowerdirs(char *str) { ssize_t nr_layers = 1, nr_colons = 0; char *s, *d; for (s = d = str;; s++, d++) { if (*s == '\\') { /* keep esc chars in split lowerdir */ *d++ = *s++; } else if (*s == ':') { bool next_colon = (*(s + 1) == ':'); nr_colons++; if (nr_colons == 2 && next_colon) { pr_err("only single ':' or double '::' sequences of unescaped colons in lowerdir mount option allowed.\n"); return -EINVAL; } /* count layers, not colons */ if (!next_colon) nr_layers++; *d = '\0'; continue; } *d = *s; if (!*s) { /* trailing colons */ if (nr_colons) { pr_err("unescaped trailing colons in lowerdir mount option.\n"); return -EINVAL; } break; } nr_colons = 0; } return nr_layers; } static int ovl_mount_dir_noesc(const char *name, struct path *path) { int err = -EINVAL; if (!*name) { pr_err("empty lowerdir\n"); goto out; } err = kern_path(name, LOOKUP_FOLLOW, path); if (err) { pr_err("failed to resolve '%s': %i\n", name, err); goto out; } return 0; out: return err; } static void ovl_unescape(char *s) { char *d = s; for (;; s++, d++) { if (*s == '\\') s++; *d = *s; if (!*s) break; } } static int ovl_mount_dir(const char *name, struct path *path) { int err = -ENOMEM; char *tmp = kstrdup(name, GFP_KERNEL); if (tmp) { ovl_unescape(tmp); err = ovl_mount_dir_noesc(tmp, path); kfree(tmp); } return err; 
} static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path, enum ovl_opt layer, const char *name, bool upper) { struct ovl_fs_context *ctx = fc->fs_private; if (ovl_dentry_weird(path->dentry)) return invalfc(fc, "filesystem on %s not supported", name); if (!d_is_dir(path->dentry)) return invalfc(fc, "%s is not a directory", name); /* * Check whether upper path is read-only here to report failures * early. Don't forget to recheck when the superblock is created * as the mount attributes could change. */ if (upper) { if (path->dentry->d_flags & DCACHE_OP_REAL) return invalfc(fc, "filesystem on %s not supported as upperdir", name); if (__mnt_is_readonly(path->mnt)) return invalfc(fc, "filesystem on %s is read-only", name); } else { if (ctx->lowerdir_all && layer != Opt_lowerdir) return invalfc(fc, "lowerdir+ and datadir+ cannot follow lowerdir"); if (ctx->nr_data && layer == Opt_lowerdir_add) return invalfc(fc, "regular lower layers cannot follow data layers"); if (ctx->nr == OVL_MAX_STACK) return invalfc(fc, "too many lower directories, limit is %d", OVL_MAX_STACK); } return 0; } static int ovl_ctx_realloc_lower(struct fs_context *fc) { struct ovl_fs_context *ctx = fc->fs_private; struct ovl_fs_context_layer *l; size_t nr; if (ctx->nr < ctx->capacity) return 0; nr = min_t(size_t, max(4096 / sizeof(*l), ctx->capacity * 2), OVL_MAX_STACK); l = krealloc_array(ctx->lower, nr, sizeof(*l), GFP_KERNEL_ACCOUNT); if (!l) return -ENOMEM; ctx->lower = l; ctx->capacity = nr; return 0; } static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, struct path *path, char **pname) { struct ovl_fs *ofs = fc->s_fs_info; struct ovl_config *config = &ofs->config; struct ovl_fs_context *ctx = fc->fs_private; struct ovl_fs_context_layer *l; switch (layer) { case Opt_workdir: swap(config->workdir, *pname); swap(ctx->work, *path); break; case Opt_upperdir: swap(config->upperdir, *pname); swap(ctx->upper, *path); break; case Opt_datadir_add: ctx->nr_data++; fallthrough; case Opt_lowerdir_add: WARN_ON(ctx->nr >= ctx->capacity); l = &ctx->lower[ctx->nr++]; memset(l, 0, sizeof(*l)); swap(l->name, *pname); swap(l->path, *path); break; default: WARN_ON(1); } } static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, enum ovl_opt layer) { char *name = kstrdup(param->string, GFP_KERNEL); bool upper = (layer == Opt_upperdir || layer == Opt_workdir); struct path path; int err; if (!name) return -ENOMEM; if (upper) err = ovl_mount_dir(name, &path); else err = ovl_mount_dir_noesc(name, &path); if (err) goto out_free; err = ovl_mount_dir_check(fc, &path, layer, name, upper); if (err) goto out_put; if (!upper) { err = ovl_ctx_realloc_lower(fc); if (err) goto out_put; } /* Store the user provided path string in ctx to show in mountinfo */ ovl_add_layer(fc, layer, &path, &name); out_put: path_put(&path); out_free: kfree(name); return err; } static void ovl_reset_lowerdirs(struct ovl_fs_context *ctx) { struct ovl_fs_context_layer *l = ctx->lower; // Reset old user provided lowerdir string kfree(ctx->lowerdir_all); ctx->lowerdir_all = NULL; for (size_t nr = 0; nr < ctx->nr; nr++, l++) { path_put(&l->path); kfree(l->name); l->name = NULL; } ctx->nr = 0; ctx->nr_data = 0; } /* * Parse lowerdir= mount option: * * e.g.: lowerdir=/lower1:/lower2:/lower3::/data1::/data2 * Set "/lower1", "/lower2", and "/lower3" as lower layers and * "/data1" and "/data2" as data lower layers. Any existing lower * layers are replaced. 
*/ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) { int err; struct ovl_fs_context *ctx = fc->fs_private; struct ovl_fs_context_layer *l; char *dup = NULL, *iter; ssize_t nr_lower, nr; bool data_layer = false; /* * Ensure we're backwards compatible with mount(2) * by allowing relative paths. */ /* drop all existing lower layers */ ovl_reset_lowerdirs(ctx); if (!*name) return 0; if (*name == ':') { pr_err("cannot append lower layer"); return -EINVAL; } // Store user provided lowerdir string to show in mount options ctx->lowerdir_all = kstrdup(name, GFP_KERNEL); if (!ctx->lowerdir_all) return -ENOMEM; dup = kstrdup(name, GFP_KERNEL); if (!dup) return -ENOMEM; err = -EINVAL; nr_lower = ovl_parse_param_split_lowerdirs(dup); if (nr_lower < 0) goto out_err; if (nr_lower > OVL_MAX_STACK) { pr_err("too many lower directories, limit is %d\n", OVL_MAX_STACK); goto out_err; } if (nr_lower > ctx->capacity) { err = -ENOMEM; l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower), GFP_KERNEL_ACCOUNT); if (!l) goto out_err; ctx->lower = l; ctx->capacity = nr_lower; } iter = dup; l = ctx->lower; for (nr = 0; nr < nr_lower; nr++, l++) { ctx->nr++; memset(l, 0, sizeof(*l)); err = ovl_mount_dir(iter, &l->path); if (err) goto out_put; err = ovl_mount_dir_check(fc, &l->path, Opt_lowerdir, iter, false); if (err) goto out_put; err = -ENOMEM; l->name = kstrdup(iter, GFP_KERNEL_ACCOUNT); if (!l->name) goto out_put; if (data_layer) ctx->nr_data++; /* Calling strchr() again would overrun. */ if (ctx->nr == nr_lower) break; err = -EINVAL; iter = strchr(iter, '\0') + 1; if (*iter) { /* * This is a regular layer so we require that * there are no data layers. */ if (ctx->nr_data > 0) { pr_err("regular lower layers cannot follow data lower layers"); goto out_put; } data_layer = false; continue; } /* This is a data lower layer. */ data_layer = true; iter++; } kfree(dup); return 0; out_put: ovl_reset_lowerdirs(ctx); out_err: kfree(dup); /* Intentionally don't realloc to a smaller size. */ return err; } static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) { int err = 0; struct fs_parse_result result; struct ovl_fs *ofs = fc->s_fs_info; struct ovl_config *config = &ofs->config; struct ovl_fs_context *ctx = fc->fs_private; int opt; if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { /* * On remount overlayfs has always ignored all mount * options no matter if malformed or not so for * backwards compatibility we do the same here. */ if (fc->oldapi) return 0; /* * Give us the freedom to allow changing mount options * with the new mount api in the future. So instead of * silently ignoring everything we report a proper * error. This is only visible for users of the new * mount api. */ return invalfc(fc, "No changes allowed in reconfigure"); } opt = fs_parse(fc, ovl_parameter_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_lowerdir: err = ovl_parse_param_lowerdir(param->string, fc); break; case Opt_lowerdir_add: case Opt_datadir_add: case Opt_upperdir: case Opt_workdir: err = ovl_parse_layer(fc, param, opt); break; case Opt_default_permissions: config->default_permissions = true; break; case Opt_redirect_dir: config->redirect_mode = result.uint_32; if (config->redirect_mode == OVL_REDIRECT_OFF) { config->redirect_mode = ovl_redirect_always_follow ? 
OVL_REDIRECT_FOLLOW : OVL_REDIRECT_NOFOLLOW; } ctx->set.redirect = true; break; case Opt_index: config->index = result.uint_32; ctx->set.index = true; break; case Opt_uuid: config->uuid = result.uint_32; break; case Opt_nfs_export: config->nfs_export = result.uint_32; ctx->set.nfs_export = true; break; case Opt_xino: config->xino = result.uint_32; break; case Opt_metacopy: config->metacopy = result.uint_32; ctx->set.metacopy = true; break; case Opt_verity: config->verity_mode = result.uint_32; break; case Opt_volatile: config->ovl_volatile = true; break; case Opt_userxattr: config->userxattr = true; break; default: pr_err("unrecognized mount option \"%s\" or missing value\n", param->key); return -EINVAL; } return err; } static int ovl_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, ovl_fill_super); } static inline void ovl_fs_context_free(struct ovl_fs_context *ctx) { ovl_reset_lowerdirs(ctx); path_put(&ctx->upper); path_put(&ctx->work); kfree(ctx->lower); kfree(ctx); } static void ovl_free(struct fs_context *fc) { struct ovl_fs *ofs = fc->s_fs_info; struct ovl_fs_context *ctx = fc->fs_private; /* * ofs is stored in the fs_context when it is initialized. * ofs is transferred to the superblock on a successful mount, * but if an error occurs before the transfer we have to free * it here. */ if (ofs) ovl_free_fs(ofs); if (ctx) ovl_fs_context_free(ctx); } static int ovl_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct ovl_fs *ofs = OVL_FS(sb); struct super_block *upper_sb; int ret = 0; if (!(fc->sb_flags & SB_RDONLY) && ovl_force_readonly(ofs)) return -EROFS; if (fc->sb_flags & SB_RDONLY && !sb_rdonly(sb)) { upper_sb = ovl_upper_mnt(ofs)->mnt_sb; if (ovl_should_sync(ofs)) { down_read(&upper_sb->s_umount); ret = sync_filesystem(upper_sb); up_read(&upper_sb->s_umount); } } return ret; } static const struct fs_context_operations ovl_context_ops = { .parse_monolithic = ovl_parse_monolithic, .parse_param = ovl_parse_param, .get_tree = ovl_get_tree, .reconfigure = ovl_reconfigure, .free = ovl_free, }; /* * This is called during fsopen() and will record the user namespace of * the caller in fc->user_ns since we've raised FS_USERNS_MOUNT. We'll * need it when we actually create the superblock to verify that the * process creating the superblock is in the same user namespace as * process that called fsopen(). */ int ovl_init_fs_context(struct fs_context *fc) { struct ovl_fs_context *ctx; struct ovl_fs *ofs; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); if (!ctx) return -ENOMEM; /* * By default we allocate for three lower layers. It's likely * that it'll cover most users. 
*/ ctx->lower = kmalloc_array(3, sizeof(*ctx->lower), GFP_KERNEL_ACCOUNT); if (!ctx->lower) goto out_err; ctx->capacity = 3; ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); if (!ofs) goto out_err; ofs->config.redirect_mode = ovl_redirect_mode_def(); ofs->config.index = ovl_index_def; ofs->config.uuid = ovl_uuid_def(); ofs->config.nfs_export = ovl_nfs_export_def; ofs->config.xino = ovl_xino_def(); ofs->config.metacopy = ovl_metacopy_def; fc->s_fs_info = ofs; fc->fs_private = ctx; fc->ops = &ovl_context_ops; return 0; out_err: ovl_fs_context_free(ctx); return -ENOMEM; } void ovl_free_fs(struct ovl_fs *ofs) { struct vfsmount **mounts; unsigned i; iput(ofs->workbasedir_trap); iput(ofs->workdir_trap); dput(ofs->whiteout); dput(ofs->workdir); if (ofs->workdir_locked) ovl_inuse_unlock(ofs->workbasedir); dput(ofs->workbasedir); if (ofs->upperdir_locked) ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root); /* Reuse ofs->config.lowerdirs as a vfsmount array before freeing it */ mounts = (struct vfsmount **) ofs->config.lowerdirs; for (i = 0; i < ofs->numlayer; i++) { iput(ofs->layers[i].trap); kfree(ofs->config.lowerdirs[i]); mounts[i] = ofs->layers[i].mnt; } kern_unmount_array(mounts, ofs->numlayer); kfree(ofs->layers); for (i = 0; i < ofs->numfs; i++) free_anon_bdev(ofs->fs[i].pseudo_dev); kfree(ofs->fs); kfree(ofs->config.lowerdirs); kfree(ofs->config.upperdir); kfree(ofs->config.workdir); if (ofs->creator_cred) put_cred(ofs->creator_cred); kfree(ofs); } int ovl_fs_params_verify(const struct ovl_fs_context *ctx, struct ovl_config *config) { struct ovl_opt_set set = ctx->set; if (ctx->nr_data > 0 && !config->metacopy) { pr_err("lower data-only dirs require metacopy support.\n"); return -EINVAL; } /* Workdir/index are useless in non-upper mount */ if (!config->upperdir) { if (config->workdir) { pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n", config->workdir); kfree(config->workdir); config->workdir = NULL; } if (config->index && set.index) { pr_info("option \"index=on\" is useless in a non-upper mount, ignore\n"); set.index = false; } config->index = false; } if (!config->upperdir && config->ovl_volatile) { pr_info("option \"volatile\" is meaningless in a non-upper mount, ignoring it.\n"); config->ovl_volatile = false; } if (!config->upperdir && config->uuid == OVL_UUID_ON) { pr_info("option \"uuid=on\" requires an upper fs, falling back to uuid=null.\n"); config->uuid = OVL_UUID_NULL; } /* Resolve verity -> metacopy dependency */ if (config->verity_mode && !config->metacopy) { /* Don't allow explicit specified conflicting combinations */ if (set.metacopy) { pr_err("conflicting options: metacopy=off,verity=%s\n", ovl_verity_mode(config)); return -EINVAL; } /* Otherwise automatically enable metacopy. */ config->metacopy = true; } /* * This is to make the logic below simpler. It doesn't make any other * difference, since redirect_dir=on is only used for upper. */ if (!config->upperdir && config->redirect_mode == OVL_REDIRECT_FOLLOW) config->redirect_mode = OVL_REDIRECT_ON; /* Resolve verity -> metacopy -> redirect_dir dependency */ if (config->metacopy && config->redirect_mode != OVL_REDIRECT_ON) { if (set.metacopy && set.redirect) { pr_err("conflicting options: metacopy=on,redirect_dir=%s\n", ovl_redirect_mode(config)); return -EINVAL; } if (config->verity_mode && set.redirect) { pr_err("conflicting options: verity=%s,redirect_dir=%s\n", ovl_verity_mode(config), ovl_redirect_mode(config)); return -EINVAL; } if (set.redirect) { /* * There was an explicit redirect_dir=... 
that resulted * in this conflict. */ pr_info("disabling metacopy due to redirect_dir=%s\n", ovl_redirect_mode(config)); config->metacopy = false; } else { /* Automatically enable redirect otherwise. */ config->redirect_mode = OVL_REDIRECT_ON; } } /* Resolve nfs_export -> index dependency */ if (config->nfs_export && !config->index) { if (!config->upperdir && config->redirect_mode != OVL_REDIRECT_NOFOLLOW) { pr_info("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); config->nfs_export = false; } else if (set.nfs_export && set.index) { pr_err("conflicting options: nfs_export=on,index=off\n"); return -EINVAL; } else if (set.index) { /* * There was an explicit index=off that resulted * in this conflict. */ pr_info("disabling nfs_export due to index=off\n"); config->nfs_export = false; } else { /* Automatically enable index otherwise. */ config->index = true; } } /* Resolve nfs_export -> !metacopy && !verity dependency */ if (config->nfs_export && config->metacopy) { if (set.nfs_export && set.metacopy) { pr_err("conflicting options: nfs_export=on,metacopy=on\n"); return -EINVAL; } if (set.metacopy) { /* * There was an explicit metacopy=on that resulted * in this conflict. */ pr_info("disabling nfs_export due to metacopy=on\n"); config->nfs_export = false; } else if (config->verity_mode) { /* * There was an explicit verity=.. that resulted * in this conflict. */ pr_info("disabling nfs_export due to verity=%s\n", ovl_verity_mode(config)); config->nfs_export = false; } else { /* * There was an explicit nfs_export=on that resulted * in this conflict. */ pr_info("disabling metacopy due to nfs_export=on\n"); config->metacopy = false; } } /* Resolve userxattr -> !redirect && !metacopy && !verity dependency */ if (config->userxattr) { if (set.redirect && config->redirect_mode != OVL_REDIRECT_NOFOLLOW) { pr_err("conflicting options: userxattr,redirect_dir=%s\n", ovl_redirect_mode(config)); return -EINVAL; } if (config->metacopy && set.metacopy) { pr_err("conflicting options: userxattr,metacopy=on\n"); return -EINVAL; } if (config->verity_mode) { pr_err("conflicting options: userxattr,verity=%s\n", ovl_verity_mode(config)); return -EINVAL; } /* * Silently disable default setting of redirect and metacopy. * This shall be the default in the future as well: these * options must be explicitly enabled if used together with * userxattr. */ config->redirect_mode = OVL_REDIRECT_NOFOLLOW; config->metacopy = false; } return 0; } /** * ovl_show_options * @m: the seq_file handle * @dentry: The dentry to query * * Prints the mount options for a given superblock. * Returns zero; does not fail. */ int ovl_show_options(struct seq_file *m, struct dentry *dentry) { struct super_block *sb = dentry->d_sb; struct ovl_fs *ofs = OVL_FS(sb); size_t nr, nr_merged_lower, nr_lower = 0; char **lowerdirs = ofs->config.lowerdirs; /* * lowerdirs[0] holds the colon separated list that user provided * with lowerdir mount option. * lowerdirs[1..numlayer] hold the lowerdir paths that were added * using the lowerdir+ and datadir+ mount options. * For now, we do not allow mixing the legacy lowerdir mount option * with the new lowerdir+ and datadir+ mount options. 
*/ if (lowerdirs[0]) { seq_show_option(m, "lowerdir", lowerdirs[0]); } else { nr_lower = ofs->numlayer; nr_merged_lower = nr_lower - ofs->numdatalayer; } for (nr = 1; nr < nr_lower; nr++) { if (nr < nr_merged_lower) seq_show_option(m, "lowerdir+", lowerdirs[nr]); else seq_show_option(m, "datadir+", lowerdirs[nr]); } if (ofs->config.upperdir) { seq_show_option(m, "upperdir", ofs->config.upperdir); seq_show_option(m, "workdir", ofs->config.workdir); } if (ofs->config.default_permissions) seq_puts(m, ",default_permissions"); if (ofs->config.redirect_mode != ovl_redirect_mode_def()) seq_printf(m, ",redirect_dir=%s", ovl_redirect_mode(&ofs->config)); if (ofs->config.index != ovl_index_def) seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); if (ofs->config.uuid != ovl_uuid_def()) seq_printf(m, ",uuid=%s", ovl_uuid_mode(&ofs->config)); if (ofs->config.nfs_export != ovl_nfs_export_def) seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? "on" : "off"); if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(ofs)) seq_printf(m, ",xino=%s", ovl_xino_mode(&ofs->config)); if (ofs->config.metacopy != ovl_metacopy_def) seq_printf(m, ",metacopy=%s", ofs->config.metacopy ? "on" : "off"); if (ofs->config.ovl_volatile) seq_puts(m, ",volatile"); if (ofs->config.userxattr) seq_puts(m, ",userxattr"); if (ofs->config.verity_mode != ovl_verity_mode_def()) seq_printf(m, ",verity=%s", ovl_verity_mode(&ofs->config)); return 0; }
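/*
 * Illustrative sketch: driving the option parsing above from userspace
 * through the new mount API (fsopen()/fsconfig()/fsmount()/move_mount(),
 * available since Linux 5.2).  The layer paths and the "/mnt/merged"
 * target are placeholders, and error handling is kept to the essentials.
 */
#include <fcntl.h>
#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

static int ovl_mount_example(void)
{
	int fsfd, mntfd;

	/* Opens an fs_context for overlayfs; lands in ovl_init_fs_context(). */
	fsfd = syscall(__NR_fsopen, "overlay", FSOPEN_CLOEXEC);
	if (fsfd < 0)
		return -1;

	/* Each key/value pair below is routed through ovl_parse_param(). */
	syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "lowerdir+", "/lower1", 0);
	syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "lowerdir+", "/lower2", 0);
	syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "metacopy", "on", 0);
	syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "datadir+", "/data1", 0);
	syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "upperdir", "/upper", 0);
	syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "workdir", "/work", 0);

	/* FSCONFIG_CMD_CREATE triggers ovl_fs_params_verify() and ovl_get_tree(). */
	if (syscall(__NR_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
		close(fsfd);
		return -1;
	}

	mntfd = syscall(__NR_fsmount, fsfd, FSMOUNT_CLOEXEC, 0);
	if (mntfd >= 0)
		syscall(__NR_move_mount, mntfd, "", AT_FDCWD, "/mnt/merged",
			MOVE_MOUNT_F_EMPTY_PATH);
	close(fsfd);
	return mntfd;
}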
// SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include "netlink.h" #include "common.h" #include "bitset.h" struct module_req_info { struct ethnl_req_info base; }; struct module_reply_data { struct ethnl_reply_data base; struct ethtool_module_power_mode_params power; }; #define MODULE_REPDATA(__reply_base) \ container_of(__reply_base, struct module_reply_data, base) /* MODULE_GET */ const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1] = { [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int module_get_power_mode(struct net_device *dev, struct module_reply_data *data, struct netlink_ext_ack *extack) { const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->get_module_power_mode) return 0; return ops->get_module_power_mode(dev, &data->power, extack); } static int module_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct module_reply_data *data = MODULE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = module_get_power_mode(dev, data, info->extack); if (ret < 0) goto out_complete; out_complete: ethnl_ops_complete(dev); return ret; } static int module_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { struct module_reply_data *data = MODULE_REPDATA(reply_base); int len = 0; if (data->power.policy) len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE_POLICY */ if (data->power.mode) len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE */ return len; } static int module_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct module_reply_data *data = MODULE_REPDATA(reply_base); if (data->power.policy && nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE_POLICY, data->power.policy)) return -EMSGSIZE; if (data->power.mode && nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE, data->power.mode)) return -EMSGSIZE; return 0; } /* MODULE_SET */ const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1] = { [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_MODULE_POWER_MODE_POLICY] = NLA_POLICY_RANGE(NLA_U8, ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH, ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO), }; static int ethnl_set_module_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; if (!tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]) return 0; if (!ops->get_module_power_mode || !ops->set_module_power_mode) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY], "Setting power mode policy is not supported by this device"); return -EOPNOTSUPP; } return 1; } static int ethnl_set_module(struct ethnl_req_info *req_info, struct genl_info *info) { struct
ethtool_module_power_mode_params power = {}; struct ethtool_module_power_mode_params power_new; const struct ethtool_ops *ops; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; int ret; ops = dev->ethtool_ops; power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]); ret = ops->get_module_power_mode(dev, &power, info->extack); if (ret < 0) return ret; if (power_new.policy == power.policy) return 0; ret = ops->set_module_power_mode(dev, &power_new, info->extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_module_request_ops = { .request_cmd = ETHTOOL_MSG_MODULE_GET, .reply_cmd = ETHTOOL_MSG_MODULE_GET_REPLY, .hdr_attr = ETHTOOL_A_MODULE_HEADER, .req_info_size = sizeof(struct module_req_info), .reply_data_size = sizeof(struct module_reply_data), .prepare_data = module_prepare_data, .reply_size = module_reply_size, .fill_reply = module_fill_reply, .set_validate = ethnl_set_module_validate, .set = ethnl_set_module, .set_ntf_cmd = ETHTOOL_MSG_MODULE_NTF, };
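/*
 * Illustrative sketch of the driver side that the MODULE_GET/MODULE_SET code
 * above calls into: a hypothetical "foo" driver wiring up the two ethtool_ops
 * hooks.  The foo_priv layout is a placeholder; a real driver would query or
 * program the transceiver module rather than cache the values.  From
 * userspace this path is exercised by recent ethtool(8), e.g.
 * "ethtool --set-module eth0 power-mode-policy auto".
 */
#include <linux/ethtool.h>
#include <linux/netdevice.h>

struct foo_priv {
	enum ethtool_module_power_mode_policy pwr_policy;
	enum ethtool_module_power_mode pwr_mode;
};

static int foo_get_module_power_mode(struct net_device *dev,
				     struct ethtool_module_power_mode_params *params,
				     struct netlink_ext_ack *extack)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* Reported as ETHTOOL_A_MODULE_POWER_MODE_POLICY / _POWER_MODE. */
	params->policy = priv->pwr_policy;
	params->mode = priv->pwr_mode;
	return 0;
}

static int foo_set_module_power_mode(struct net_device *dev,
				     const struct ethtool_module_power_mode_params *params,
				     struct netlink_ext_ack *extack)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* Only the policy is configurable; the mode is read back from hardware. */
	priv->pwr_policy = params->policy;
	return 0;
}

static const struct ethtool_ops foo_ethtool_ops = {
	.get_module_power_mode	= foo_get_module_power_mode,
	.set_module_power_mode	= foo_set_module_power_mode,
};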
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2001 Vojtech Pavlik * Copyright (c) 2006-2007 Jiri Kosina */ /* * * Should you need to contact me, the author, you can do so either by * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail: * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic */ #ifndef __HID_H #define __HID_H #include <linux/bitops.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/mod_devicetable.h> /* hid_device_id */ #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/input.h> #include <linux/semaphore.h> #include <linux/mutex.h> #include <linux/power_supply.h> #include <uapi/linux/hid.h> #include <linux/hid_bpf.h> /* * We parse each description item into this structure. Short items data * values are expanded to 32-bit signed int, long items contain a pointer * into the data area.
*/ struct hid_item { unsigned format; __u8 size; __u8 type; __u8 tag; union { __u8 u8; __s8 s8; __u16 u16; __s16 s16; __u32 u32; __s32 s32; __u8 *longdata; } data; }; /* * HID report item format */ #define HID_ITEM_FORMAT_SHORT 0 #define HID_ITEM_FORMAT_LONG 1 /* * Special tag indicating long items */ #define HID_ITEM_TAG_LONG 15 /* * HID report descriptor item type (prefix bit 2,3) */ #define HID_ITEM_TYPE_MAIN 0 #define HID_ITEM_TYPE_GLOBAL 1 #define HID_ITEM_TYPE_LOCAL 2 #define HID_ITEM_TYPE_RESERVED 3 /* * HID report descriptor main item tags */ #define HID_MAIN_ITEM_TAG_INPUT 8 #define HID_MAIN_ITEM_TAG_OUTPUT 9 #define HID_MAIN_ITEM_TAG_FEATURE 11 #define HID_MAIN_ITEM_TAG_BEGIN_COLLECTION 10 #define HID_MAIN_ITEM_TAG_END_COLLECTION 12 /* * HID report descriptor main item contents */ #define HID_MAIN_ITEM_CONSTANT 0x001 #define HID_MAIN_ITEM_VARIABLE 0x002 #define HID_MAIN_ITEM_RELATIVE 0x004 #define HID_MAIN_ITEM_WRAP 0x008 #define HID_MAIN_ITEM_NONLINEAR 0x010 #define HID_MAIN_ITEM_NO_PREFERRED 0x020 #define HID_MAIN_ITEM_NULL_STATE 0x040 #define HID_MAIN_ITEM_VOLATILE 0x080 #define HID_MAIN_ITEM_BUFFERED_BYTE 0x100 /* * HID report descriptor collection item types */ #define HID_COLLECTION_PHYSICAL 0 #define HID_COLLECTION_APPLICATION 1 #define HID_COLLECTION_LOGICAL 2 #define HID_COLLECTION_NAMED_ARRAY 4 /* * HID report descriptor global item tags */ #define HID_GLOBAL_ITEM_TAG_USAGE_PAGE 0 #define HID_GLOBAL_ITEM_TAG_LOGICAL_MINIMUM 1 #define HID_GLOBAL_ITEM_TAG_LOGICAL_MAXIMUM 2 #define HID_GLOBAL_ITEM_TAG_PHYSICAL_MINIMUM 3 #define HID_GLOBAL_ITEM_TAG_PHYSICAL_MAXIMUM 4 #define HID_GLOBAL_ITEM_TAG_UNIT_EXPONENT 5 #define HID_GLOBAL_ITEM_TAG_UNIT 6 #define HID_GLOBAL_ITEM_TAG_REPORT_SIZE 7 #define HID_GLOBAL_ITEM_TAG_REPORT_ID 8 #define HID_GLOBAL_ITEM_TAG_REPORT_COUNT 9 #define HID_GLOBAL_ITEM_TAG_PUSH 10 #define HID_GLOBAL_ITEM_TAG_POP 11 /* * HID report descriptor local item tags */ #define HID_LOCAL_ITEM_TAG_USAGE 0 #define HID_LOCAL_ITEM_TAG_USAGE_MINIMUM 1 #define HID_LOCAL_ITEM_TAG_USAGE_MAXIMUM 2 #define HID_LOCAL_ITEM_TAG_DESIGNATOR_INDEX 3 #define HID_LOCAL_ITEM_TAG_DESIGNATOR_MINIMUM 4 #define HID_LOCAL_ITEM_TAG_DESIGNATOR_MAXIMUM 5 #define HID_LOCAL_ITEM_TAG_STRING_INDEX 7 #define HID_LOCAL_ITEM_TAG_STRING_MINIMUM 8 #define HID_LOCAL_ITEM_TAG_STRING_MAXIMUM 9 #define HID_LOCAL_ITEM_TAG_DELIMITER 10 /* * HID usage tables */ #define HID_USAGE_PAGE 0xffff0000 #define HID_UP_UNDEFINED 0x00000000 #define HID_UP_GENDESK 0x00010000 #define HID_UP_SIMULATION 0x00020000 #define HID_UP_GENDEVCTRLS 0x00060000 #define HID_UP_KEYBOARD 0x00070000 #define HID_UP_LED 0x00080000 #define HID_UP_BUTTON 0x00090000 #define HID_UP_ORDINAL 0x000a0000 #define HID_UP_TELEPHONY 0x000b0000 #define HID_UP_CONSUMER 0x000c0000 #define HID_UP_DIGITIZER 0x000d0000 #define HID_UP_PID 0x000f0000 #define HID_UP_BATTERY 0x00850000 #define HID_UP_CAMERA 0x00900000 #define HID_UP_HPVENDOR 0xff7f0000 #define HID_UP_HPVENDOR2 0xff010000 #define HID_UP_MSVENDOR 0xff000000 #define HID_UP_CUSTOM 0x00ff0000 #define HID_UP_LOGIVENDOR 0xffbc0000 #define HID_UP_LOGIVENDOR2 0xff090000 #define HID_UP_LOGIVENDOR3 0xff430000 #define HID_UP_LNVENDOR 0xffa00000 #define HID_UP_SENSOR 0x00200000 #define HID_UP_ASUSVENDOR 0xff310000 #define HID_UP_GOOGLEVENDOR 0xffd10000 #define HID_USAGE 0x0000ffff #define HID_GD_POINTER 0x00010001 #define HID_GD_MOUSE 0x00010002 #define HID_GD_JOYSTICK 0x00010004 #define HID_GD_GAMEPAD 0x00010005 #define HID_GD_KEYBOARD 0x00010006 #define HID_GD_KEYPAD 0x00010007 #define 
HID_GD_MULTIAXIS 0x00010008 /* * Microsoft Win8 Wireless Radio Controls extensions CA, see: * http://www.usb.org/developers/hidpage/HUTRR40RadioHIDUsagesFinal.pdf */ #define HID_GD_WIRELESS_RADIO_CTLS 0x0001000c /* * System Multi-Axis, see: * http://www.usb.org/developers/hidpage/HUTRR62_-_Generic_Desktop_CA_for_System_Multi-Axis_Controllers.txt */ #define HID_GD_SYSTEM_MULTIAXIS 0x0001000e #define HID_GD_X 0x00010030 #define HID_GD_Y 0x00010031 #define HID_GD_Z 0x00010032 #define HID_GD_RX 0x00010033 #define HID_GD_RY 0x00010034 #define HID_GD_RZ 0x00010035 #define HID_GD_SLIDER 0x00010036 #define HID_GD_DIAL 0x00010037 #define HID_GD_WHEEL 0x00010038 #define HID_GD_HATSWITCH 0x00010039 #define HID_GD_BUFFER 0x0001003a #define HID_GD_BYTECOUNT 0x0001003b #define HID_GD_MOTION 0x0001003c #define HID_GD_START 0x0001003d #define HID_GD_SELECT 0x0001003e #define HID_GD_VX 0x00010040 #define HID_GD_VY 0x00010041 #define HID_GD_VZ 0x00010042 #define HID_GD_VBRX 0x00010043 #define HID_GD_VBRY 0x00010044 #define HID_GD_VBRZ 0x00010045 #define HID_GD_VNO 0x00010046 #define HID_GD_FEATURE 0x00010047 #define HID_GD_RESOLUTION_MULTIPLIER 0x00010048 #define HID_GD_SYSTEM_CONTROL 0x00010080 #define HID_GD_UP 0x00010090 #define HID_GD_DOWN 0x00010091 #define HID_GD_RIGHT 0x00010092 #define HID_GD_LEFT 0x00010093 /* Microsoft Win8 Wireless Radio Controls CA usage codes */ #define HID_GD_RFKILL_BTN 0x000100c6 #define HID_GD_RFKILL_LED 0x000100c7 #define HID_GD_RFKILL_SWITCH 0x000100c8 #define HID_DC_BATTERYSTRENGTH 0x00060020 #define HID_CP_CONSUMER_CONTROL 0x000c0001 #define HID_CP_AC_PAN 0x000c0238 #define HID_DG_DIGITIZER 0x000d0001 #define HID_DG_PEN 0x000d0002 #define HID_DG_LIGHTPEN 0x000d0003 #define HID_DG_TOUCHSCREEN 0x000d0004 #define HID_DG_TOUCHPAD 0x000d0005 #define HID_DG_WHITEBOARD 0x000d0006 #define HID_DG_STYLUS 0x000d0020 #define HID_DG_PUCK 0x000d0021 #define HID_DG_FINGER 0x000d0022 #define HID_DG_TIPPRESSURE 0x000d0030 #define HID_DG_BARRELPRESSURE 0x000d0031 #define HID_DG_INRANGE 0x000d0032 #define HID_DG_TOUCH 0x000d0033 #define HID_DG_UNTOUCH 0x000d0034 #define HID_DG_TAP 0x000d0035 #define HID_DG_TRANSDUCER_INDEX 0x000d0038 #define HID_DG_TABLETFUNCTIONKEY 0x000d0039 #define HID_DG_PROGRAMCHANGEKEY 0x000d003a #define HID_DG_BATTERYSTRENGTH 0x000d003b #define HID_DG_INVERT 0x000d003c #define HID_DG_TILT_X 0x000d003d #define HID_DG_TILT_Y 0x000d003e #define HID_DG_TWIST 0x000d0041 #define HID_DG_TIPSWITCH 0x000d0042 #define HID_DG_TIPSWITCH2 0x000d0043 #define HID_DG_BARRELSWITCH 0x000d0044 #define HID_DG_ERASER 0x000d0045 #define HID_DG_TABLETPICK 0x000d0046 #define HID_DG_PEN_COLOR 0x000d005c #define HID_DG_PEN_LINE_WIDTH 0x000d005e #define HID_DG_PEN_LINE_STYLE 0x000d0070 #define HID_DG_PEN_LINE_STYLE_INK 0x000d0072 #define HID_DG_PEN_LINE_STYLE_PENCIL 0x000d0073 #define HID_DG_PEN_LINE_STYLE_HIGHLIGHTER 0x000d0074 #define HID_DG_PEN_LINE_STYLE_CHISEL_MARKER 0x000d0075 #define HID_DG_PEN_LINE_STYLE_BRUSH 0x000d0076 #define HID_DG_PEN_LINE_STYLE_NO_PREFERENCE 0x000d0077 #define HID_CP_CONSUMERCONTROL 0x000c0001 #define HID_CP_NUMERICKEYPAD 0x000c0002 #define HID_CP_PROGRAMMABLEBUTTONS 0x000c0003 #define HID_CP_MICROPHONE 0x000c0004 #define HID_CP_HEADPHONE 0x000c0005 #define HID_CP_GRAPHICEQUALIZER 0x000c0006 #define HID_CP_FUNCTIONBUTTONS 0x000c0036 #define HID_CP_SELECTION 0x000c0080 #define HID_CP_MEDIASELECTION 0x000c0087 #define HID_CP_SELECTDISC 0x000c00ba #define HID_CP_VOLUMEUP 0x000c00e9 #define HID_CP_VOLUMEDOWN 0x000c00ea #define HID_CP_PLAYBACKSPEED 0x000c00f1 #define 
HID_CP_PROXIMITY 0x000c0109 #define HID_CP_SPEAKERSYSTEM 0x000c0160 #define HID_CP_CHANNELLEFT 0x000c0161 #define HID_CP_CHANNELRIGHT 0x000c0162 #define HID_CP_CHANNELCENTER 0x000c0163 #define HID_CP_CHANNELFRONT 0x000c0164 #define HID_CP_CHANNELCENTERFRONT 0x000c0165 #define HID_CP_CHANNELSIDE 0x000c0166 #define HID_CP_CHANNELSURROUND 0x000c0167 #define HID_CP_CHANNELLOWFREQUENCYENHANCEMENT 0x000c0168 #define HID_CP_CHANNELTOP 0x000c0169 #define HID_CP_CHANNELUNKNOWN 0x000c016a #define HID_CP_APPLICATIONLAUNCHBUTTONS 0x000c0180 #define HID_CP_GENERICGUIAPPLICATIONCONTROLS 0x000c0200 #define HID_DG_DEVICECONFIG 0x000d000e #define HID_DG_DEVICESETTINGS 0x000d0023 #define HID_DG_AZIMUTH 0x000d003f #define HID_DG_CONFIDENCE 0x000d0047 #define HID_DG_WIDTH 0x000d0048 #define HID_DG_HEIGHT 0x000d0049 #define HID_DG_CONTACTID 0x000d0051 #define HID_DG_INPUTMODE 0x000d0052 #define HID_DG_DEVICEINDEX 0x000d0053 #define HID_DG_CONTACTCOUNT 0x000d0054 #define HID_DG_CONTACTMAX 0x000d0055 #define HID_DG_SCANTIME 0x000d0056 #define HID_DG_SURFACESWITCH 0x000d0057 #define HID_DG_BUTTONSWITCH 0x000d0058 #define HID_DG_BUTTONTYPE 0x000d0059 #define HID_DG_BARRELSWITCH2 0x000d005a #define HID_DG_TOOLSERIALNUMBER 0x000d005b #define HID_DG_LATENCYMODE 0x000d0060 #define HID_BAT_ABSOLUTESTATEOFCHARGE 0x00850065 #define HID_BAT_CHARGING 0x00850044 #define HID_VD_ASUS_CUSTOM_MEDIA_KEYS 0xff310076 /* * HID connect requests */ #define HID_CONNECT_HIDINPUT BIT(0) #define HID_CONNECT_HIDINPUT_FORCE BIT(1) #define HID_CONNECT_HIDRAW BIT(2) #define HID_CONNECT_HIDDEV BIT(3) #define HID_CONNECT_HIDDEV_FORCE BIT(4) #define HID_CONNECT_FF BIT(5) #define HID_CONNECT_DRIVER BIT(6) #define HID_CONNECT_DEFAULT (HID_CONNECT_HIDINPUT|HID_CONNECT_HIDRAW| \ HID_CONNECT_HIDDEV|HID_CONNECT_FF) /* * HID device quirks. 
*/ /* * Increase this if you need to configure more HID quirks at module load time */ #define MAX_USBHID_BOOT_QUIRKS 4 /** * DOC: HID quirks * | @HID_QUIRK_NOTOUCH: * | @HID_QUIRK_IGNORE: ignore this device * | @HID_QUIRK_NOGET: * | @HID_QUIRK_HIDDEV_FORCE: * | @HID_QUIRK_BADPAD: * | @HID_QUIRK_MULTI_INPUT: * | @HID_QUIRK_HIDINPUT_FORCE: * | @HID_QUIRK_ALWAYS_POLL: * | @HID_QUIRK_INPUT_PER_APP: * | @HID_QUIRK_X_INVERT: * | @HID_QUIRK_Y_INVERT: * | @HID_QUIRK_SKIP_OUTPUT_REPORTS: * | @HID_QUIRK_SKIP_OUTPUT_REPORT_ID: * | @HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP: * | @HID_QUIRK_HAVE_SPECIAL_DRIVER: * | @HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE: * | @HID_QUIRK_FULLSPEED_INTERVAL: * | @HID_QUIRK_NO_INIT_REPORTS: * | @HID_QUIRK_NO_IGNORE: * | @HID_QUIRK_NO_INPUT_SYNC: */ /* BIT(0) reserved for backward compatibility, was HID_QUIRK_INVERT */ #define HID_QUIRK_NOTOUCH BIT(1) #define HID_QUIRK_IGNORE BIT(2) #define HID_QUIRK_NOGET BIT(3) #define HID_QUIRK_HIDDEV_FORCE BIT(4) #define HID_QUIRK_BADPAD BIT(5) #define HID_QUIRK_MULTI_INPUT BIT(6) #define HID_QUIRK_HIDINPUT_FORCE BIT(7) /* BIT(8) reserved for backward compatibility, was HID_QUIRK_NO_EMPTY_INPUT */ /* BIT(9) reserved for backward compatibility, was NO_INIT_INPUT_REPORTS */ #define HID_QUIRK_ALWAYS_POLL BIT(10) #define HID_QUIRK_INPUT_PER_APP BIT(11) #define HID_QUIRK_X_INVERT BIT(12) #define HID_QUIRK_Y_INVERT BIT(13) #define HID_QUIRK_SKIP_OUTPUT_REPORTS BIT(16) #define HID_QUIRK_SKIP_OUTPUT_REPORT_ID BIT(17) #define HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP BIT(18) #define HID_QUIRK_HAVE_SPECIAL_DRIVER BIT(19) #define HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE BIT(20) #define HID_QUIRK_NOINVERT BIT(21) #define HID_QUIRK_FULLSPEED_INTERVAL BIT(28) #define HID_QUIRK_NO_INIT_REPORTS BIT(29) #define HID_QUIRK_NO_IGNORE BIT(30) #define HID_QUIRK_NO_INPUT_SYNC BIT(31) /* * HID device groups * * Note: HID_GROUP_ANY is declared in linux/mod_devicetable.h * and has a value of 0x0000 */ #define HID_GROUP_GENERIC 0x0001 #define HID_GROUP_MULTITOUCH 0x0002 #define HID_GROUP_SENSOR_HUB 0x0003 #define HID_GROUP_MULTITOUCH_WIN_8 0x0004 /* * Vendor specific HID device groups */ #define HID_GROUP_RMI 0x0100 #define HID_GROUP_WACOM 0x0101 #define HID_GROUP_LOGITECH_DJ_DEVICE 0x0102 #define HID_GROUP_STEAM 0x0103 #define HID_GROUP_LOGITECH_27MHZ_DEVICE 0x0104 #define HID_GROUP_VIVALDI 0x0105 /* * HID protocol status */ #define HID_REPORT_PROTOCOL 1 #define HID_BOOT_PROTOCOL 0 /* * This is the global environment of the parser. This information is * persistent for main-items. The global environment can be saved and * restored with PUSH/POP statements. */ struct hid_global { unsigned usage_page; __s32 logical_minimum; __s32 logical_maximum; __s32 physical_minimum; __s32 physical_maximum; __s32 unit_exponent; unsigned unit; unsigned report_id; unsigned report_size; unsigned report_count; }; /* * This is the local environment. It is persistent up the next main-item. */ #define HID_MAX_USAGES 12288 #define HID_DEFAULT_NUM_COLLECTIONS 16 struct hid_local { unsigned usage[HID_MAX_USAGES]; /* usage array */ u8 usage_size[HID_MAX_USAGES]; /* usage size array */ unsigned collection_index[HID_MAX_USAGES]; /* collection index array */ unsigned usage_index; unsigned usage_minimum; unsigned delimiter_depth; unsigned delimiter_branch; }; /* * This is the collection stack. We climb up the stack to determine * application and function of each field. 
*/ struct hid_collection { int parent_idx; /* device->collection */ unsigned type; unsigned usage; unsigned level; }; struct hid_usage { unsigned hid; /* hid usage code */ unsigned collection_index; /* index into collection array */ unsigned usage_index; /* index into usage array */ __s8 resolution_multiplier;/* Effective Resolution Multiplier (HUT v1.12, 4.3.1), default: 1 */ /* hidinput data */ __s8 wheel_factor; /* 120/resolution_multiplier */ __u16 code; /* input driver code */ __u8 type; /* input driver type */ __s8 hat_min; /* hat switch fun */ __s8 hat_max; /* ditto */ __s8 hat_dir; /* ditto */ __s16 wheel_accumulated; /* hi-res wheel */ }; struct hid_input; struct hid_field { unsigned physical; /* physical usage for this field */ unsigned logical; /* logical usage for this field */ unsigned application; /* application usage for this field */ struct hid_usage *usage; /* usage table for this function */ unsigned maxusage; /* maximum usage index */ unsigned flags; /* main-item flags (i.e. volatile,array,constant) */ unsigned report_offset; /* bit offset in the report */ unsigned report_size; /* size of this field in the report */ unsigned report_count; /* number of this field in the report */ unsigned report_type; /* (input,output,feature) */ __s32 *value; /* last known value(s) */ __s32 *new_value; /* newly read value(s) */ __s32 *usages_priorities; /* priority of each usage when reading the report * bits 8-16 are reserved for hid-input usage */ __s32 logical_minimum; __s32 logical_maximum; __s32 physical_minimum; __s32 physical_maximum; __s32 unit_exponent; unsigned unit; bool ignored; /* this field is ignored in this event */ struct hid_report *report; /* associated report */ unsigned index; /* index into report->field[] */ /* hidinput data */ struct hid_input *hidinput; /* associated input structure */ __u16 dpad; /* dpad input code */ unsigned int slot_idx; /* slot index in a report */ }; #define HID_MAX_FIELDS 256 struct hid_field_entry { struct list_head list; struct hid_field *field; unsigned int index; __s32 priority; }; struct hid_report { struct list_head list; struct list_head hidinput_list; struct list_head field_entry_list; /* ordered list of input fields */ unsigned int id; /* id of this report */ enum hid_report_type type; /* report type */ unsigned int application; /* application usage for this report */ struct hid_field *field[HID_MAX_FIELDS]; /* fields of the report */ struct hid_field_entry *field_entries; /* allocated memory of input field_entry */ unsigned maxfield; /* maximum valid field index */ unsigned size; /* size of the report (bits) */ struct hid_device *device; /* associated device */ /* tool related state */ bool tool_active; /* whether the current tool is active */ unsigned int tool; /* BTN_TOOL_* */ }; #define HID_MAX_IDS 256 struct hid_report_enum { unsigned numbered; struct list_head report_list; struct hid_report *report_id_hash[HID_MAX_IDS]; }; #define HID_MIN_BUFFER_SIZE 64 /* make sure there is at least a packet size of space */ #define HID_MAX_BUFFER_SIZE 16384 /* 16kb */ #define HID_CONTROL_FIFO_SIZE 256 /* to init devices with >100 reports */ #define HID_OUTPUT_FIFO_SIZE 64 struct hid_control_fifo { unsigned char dir; struct hid_report *report; char *raw_report; }; struct hid_output_fifo { struct hid_report *report; char *raw_report; }; #define HID_CLAIMED_INPUT BIT(0) #define HID_CLAIMED_HIDDEV BIT(1) #define HID_CLAIMED_HIDRAW BIT(2) #define HID_CLAIMED_DRIVER BIT(3) #define HID_STAT_ADDED BIT(0) #define HID_STAT_PARSED BIT(1) #define 
HID_STAT_DUP_DETECTED BIT(2) #define HID_STAT_REPROBED BIT(3) struct hid_input { struct list_head list; struct hid_report *report; struct input_dev *input; const char *name; struct list_head reports; /* the list of reports */ unsigned int application; /* application usage for this input */ bool registered; }; enum hid_type { HID_TYPE_OTHER = 0, HID_TYPE_USBMOUSE, HID_TYPE_USBNONE }; enum hid_battery_status { HID_BATTERY_UNKNOWN = 0, HID_BATTERY_QUERIED, /* Kernel explicitly queried battery strength */ HID_BATTERY_REPORTED, /* Device sent unsolicited battery strength report */ }; struct hid_driver; struct hid_ll_driver; struct hid_device { /* device report descriptor */ __u8 *dev_rdesc; unsigned dev_rsize; __u8 *rdesc; unsigned rsize; struct hid_collection *collection; /* List of HID collections */ unsigned collection_size; /* Number of allocated hid_collections */ unsigned maxcollection; /* Number of parsed collections */ unsigned maxapplication; /* Number of applications */ __u16 bus; /* BUS ID */ __u16 group; /* Report group */ __u32 vendor; /* Vendor ID */ __u32 product; /* Product ID */ __u32 version; /* HID version */ enum hid_type type; /* device type (mouse, kbd, ...) */ unsigned country; /* HID country */ struct hid_report_enum report_enum[HID_REPORT_TYPES]; struct work_struct led_work; /* delayed LED worker */ struct semaphore driver_input_lock; /* protects the current driver */ struct device dev; /* device */ struct hid_driver *driver; void *devres_group_id; /* ID of probe devres group */ const struct hid_ll_driver *ll_driver; struct mutex ll_open_lock; unsigned int ll_open_count; #ifdef CONFIG_HID_BATTERY_STRENGTH /* * Power supply information for HID devices which report * battery strength. power_supply was successfully registered if * battery is non-NULL. */ struct power_supply *battery; __s32 battery_capacity; __s32 battery_min; __s32 battery_max; __s32 battery_report_type; __s32 battery_report_id; __s32 battery_charge_status; enum hid_battery_status battery_status; bool battery_avoid_query; ktime_t battery_ratelimit_time; #endif unsigned long status; /* see STAT flags above */ unsigned claimed; /* Claimed by hidinput, hiddev? 
*/ unsigned quirks; /* Various quirks the device can pull on us */ unsigned initial_quirks; /* Initial set of quirks supplied when creating device */ bool io_started; /* If IO has started */ struct list_head inputs; /* The list of inputs */ void *hiddev; /* The hiddev structure */ void *hidraw; char name[128]; /* Device name */ char phys[64]; /* Device physical location */ char uniq[64]; /* Device unique identifier (serial #) */ void *driver_data; /* temporary hid_ff handling (until moved to the drivers) */ int (*ff_init)(struct hid_device *); /* hiddev event handler */ int (*hiddev_connect)(struct hid_device *, unsigned int); void (*hiddev_disconnect)(struct hid_device *); void (*hiddev_hid_event) (struct hid_device *, struct hid_field *field, struct hid_usage *, __s32); void (*hiddev_report_event) (struct hid_device *, struct hid_report *); /* debugging support via debugfs */ unsigned short debug; struct dentry *debug_dir; struct dentry *debug_rdesc; struct dentry *debug_events; struct list_head debug_list; spinlock_t debug_list_lock; wait_queue_head_t debug_wait; struct kref ref; unsigned int id; /* system unique id */ #ifdef CONFIG_BPF struct hid_bpf bpf; /* hid-bpf data */ #endif /* CONFIG_BPF */ }; void hiddev_free(struct kref *ref); #define to_hid_device(pdev) \ container_of(pdev, struct hid_device, dev) static inline void *hid_get_drvdata(struct hid_device *hdev) { return dev_get_drvdata(&hdev->dev); } static inline void hid_set_drvdata(struct hid_device *hdev, void *data) { dev_set_drvdata(&hdev->dev, data); } #define HID_GLOBAL_STACK_SIZE 4 #define HID_COLLECTION_STACK_SIZE 4 #define HID_SCAN_FLAG_MT_WIN_8 BIT(0) #define HID_SCAN_FLAG_VENDOR_SPECIFIC BIT(1) #define HID_SCAN_FLAG_GD_POINTER BIT(2) struct hid_parser { struct hid_global global; struct hid_global global_stack[HID_GLOBAL_STACK_SIZE]; unsigned int global_stack_ptr; struct hid_local local; unsigned int *collection_stack; unsigned int collection_stack_ptr; unsigned int collection_stack_size; struct hid_device *device; unsigned int scan_flags; }; struct hid_class_descriptor { __u8 bDescriptorType; __le16 wDescriptorLength; } __attribute__ ((packed)); struct hid_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 bcdHID; __u8 bCountryCode; __u8 bNumDescriptors; struct hid_class_descriptor desc[1]; } __attribute__ ((packed)); #define HID_DEVICE(b, g, ven, prod) \ .bus = (b), .group = (g), .vendor = (ven), .product = (prod) #define HID_USB_DEVICE(ven, prod) \ .bus = BUS_USB, .vendor = (ven), .product = (prod) #define HID_BLUETOOTH_DEVICE(ven, prod) \ .bus = BUS_BLUETOOTH, .vendor = (ven), .product = (prod) #define HID_I2C_DEVICE(ven, prod) \ .bus = BUS_I2C, .vendor = (ven), .product = (prod) #define HID_REPORT_ID(rep) \ .report_type = (rep) #define HID_USAGE_ID(uhid, utype, ucode) \ .usage_hid = (uhid), .usage_type = (utype), .usage_code = (ucode) /* we don't want to catch types and codes equal to 0 */ #define HID_TERMINATOR (HID_ANY_ID - 1) struct hid_report_id { __u32 report_type; }; struct hid_usage_id { __u32 usage_hid; __u32 usage_type; __u32 usage_code; }; /** * struct hid_driver * @name: driver name (e.g. 
"Footech_bar-wheel") * @id_table: which devices is this driver for (must be non-NULL for probe * to be called) * @dyn_list: list of dynamically added device ids * @dyn_lock: lock protecting @dyn_list * @match: check if the given device is handled by this driver * @probe: new device inserted * @remove: device removed (NULL if not a hot-plug capable driver) * @report_table: on which reports to call raw_event (NULL means all) * @raw_event: if report in report_table, this hook is called (NULL means nop) * @usage_table: on which events to call event (NULL means all) * @event: if usage in usage_table, this hook is called (NULL means nop) * @report: this hook is called after parsing a report (NULL means nop) * @report_fixup: called before report descriptor parsing (NULL means nop) * @input_mapping: invoked on input registering before mapping an usage * @input_mapped: invoked on input registering after mapping an usage * @input_configured: invoked just before the device is registered * @feature_mapping: invoked on feature registering * @suspend: invoked on suspend (NULL means nop) * @resume: invoked on resume if device was not reset (NULL means nop) * @reset_resume: invoked on resume if device was reset (NULL means nop) * * probe should return -errno on error, or 0 on success. During probe, * input will not be passed to raw_event unless hid_device_io_start is * called. * * raw_event and event should return negative on error, any other value will * pass the event on to .event() typically return 0 for success. * * input_mapping shall return a negative value to completely ignore this usage * (e.g. doubled or invalid usage), zero to continue with parsing of this * usage by generic code (no special handling needed) or positive to skip * generic parsing (needed special handling which was done in the hook already) * input_mapped shall return negative to inform the layer that this usage * should not be considered for further processing or zero to notify that * no processing was performed and should be done in a generic manner * Both these functions may be NULL which means the same behavior as returning * zero from them. 
*/ struct hid_driver { char *name; const struct hid_device_id *id_table; struct list_head dyn_list; spinlock_t dyn_lock; bool (*match)(struct hid_device *dev, bool ignore_special_driver); int (*probe)(struct hid_device *dev, const struct hid_device_id *id); void (*remove)(struct hid_device *dev); const struct hid_report_id *report_table; int (*raw_event)(struct hid_device *hdev, struct hid_report *report, u8 *data, int size); const struct hid_usage_id *usage_table; int (*event)(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value); void (*report)(struct hid_device *hdev, struct hid_report *report); __u8 *(*report_fixup)(struct hid_device *hdev, __u8 *buf, unsigned int *size); int (*input_mapping)(struct hid_device *hdev, struct hid_input *hidinput, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max); int (*input_mapped)(struct hid_device *hdev, struct hid_input *hidinput, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max); int (*input_configured)(struct hid_device *hdev, struct hid_input *hidinput); void (*feature_mapping)(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage); int (*suspend)(struct hid_device *hdev, pm_message_t message); int (*resume)(struct hid_device *hdev); int (*reset_resume)(struct hid_device *hdev); /* private: */ struct device_driver driver; }; #define to_hid_driver(pdrv) \ container_of(pdrv, struct hid_driver, driver) /** * struct hid_ll_driver - low level driver callbacks * @start: called on probe to start the device * @stop: called on remove * @open: called by input layer on open * @close: called by input layer on close * @power: request underlying hardware to enter requested power mode * @parse: this method is called only once to parse the device data, * shouldn't allocate anything to not leak memory * @request: send report request to device (e.g. feature report) * @wait: wait for buffered io to complete (send/recv reports) * @raw_request: send raw report request to device (e.g. 
feature report) * @output_report: send output report to device * @idle: send idle request to device * @may_wakeup: return if device may act as a wakeup source during system-suspend * @max_buffer_size: over-ride maximum data buffer size (default: HID_MAX_BUFFER_SIZE) */ struct hid_ll_driver { int (*start)(struct hid_device *hdev); void (*stop)(struct hid_device *hdev); int (*open)(struct hid_device *hdev); void (*close)(struct hid_device *hdev); int (*power)(struct hid_device *hdev, int level); int (*parse)(struct hid_device *hdev); void (*request)(struct hid_device *hdev, struct hid_report *report, int reqtype); int (*wait)(struct hid_device *hdev); int (*raw_request) (struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, unsigned char rtype, int reqtype); int (*output_report) (struct hid_device *hdev, __u8 *buf, size_t len); int (*idle)(struct hid_device *hdev, int report, int idle, int reqtype); bool (*may_wakeup)(struct hid_device *hdev); unsigned int max_buffer_size; }; extern bool hid_is_usb(const struct hid_device *hdev); #define PM_HINT_FULLON 1<<5 #define PM_HINT_NORMAL 1<<1 /* Applications from HID Usage Tables 4/8/99 Version 1.1 */ /* We ignore a few input applications that are not widely used */ #define IS_INPUT_APPLICATION(a) \ (((a >= HID_UP_GENDESK) && (a <= HID_GD_MULTIAXIS)) \ || ((a >= HID_DG_DIGITIZER) && (a <= HID_DG_WHITEBOARD)) \ || (a == HID_GD_SYSTEM_CONTROL) || (a == HID_CP_CONSUMER_CONTROL) \ || (a == HID_GD_WIRELESS_RADIO_CTLS)) /* HID core API */ extern bool hid_ignore(struct hid_device *); extern int hid_add_device(struct hid_device *); extern void hid_destroy_device(struct hid_device *); extern const struct bus_type hid_bus_type; extern int __must_check __hid_register_driver(struct hid_driver *, struct module *, const char *mod_name); /* use a define to avoid include chaining to get THIS_MODULE & friends */ #define hid_register_driver(driver) \ __hid_register_driver(driver, THIS_MODULE, KBUILD_MODNAME) extern void hid_unregister_driver(struct hid_driver *); /** * module_hid_driver() - Helper macro for registering a HID driver * @__hid_driver: hid_driver struct * * Helper macro for HID drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. 
Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_hid_driver(__hid_driver) \ module_driver(__hid_driver, hid_register_driver, \ hid_unregister_driver) extern void hidinput_hid_event(struct hid_device *, struct hid_field *, struct hid_usage *, __s32); extern void hidinput_report_event(struct hid_device *hid, struct hid_report *report); extern int hidinput_connect(struct hid_device *hid, unsigned int force); extern void hidinput_disconnect(struct hid_device *); int hid_set_field(struct hid_field *, unsigned, __s32); int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); void hid_output_report(struct hid_report *report, __u8 *data); int __hid_request(struct hid_device *hid, struct hid_report *rep, enum hid_class_request reqtype); u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags); struct hid_device *hid_allocate_device(void); struct hid_report *hid_register_report(struct hid_device *device, enum hid_report_type type, unsigned int id, unsigned int application); int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size); struct hid_report *hid_validate_values(struct hid_device *hid, enum hid_report_type type, unsigned int id, unsigned int field_index, unsigned int report_counts); void hid_setup_resolution_multiplier(struct hid_device *hid); int hid_open_report(struct hid_device *device); int hid_check_keys_pressed(struct hid_device *hid); int hid_connect(struct hid_device *hid, unsigned int connect_mask); void hid_disconnect(struct hid_device *hid); bool hid_match_one_id(const struct hid_device *hdev, const struct hid_device_id *id); const struct hid_device_id *hid_match_id(const struct hid_device *hdev, const struct hid_device_id *id); const struct hid_device_id *hid_match_device(struct hid_device *hdev, struct hid_driver *hdrv); bool hid_compare_device_paths(struct hid_device *hdev_a, struct hid_device *hdev_b, char separator); s32 hid_snto32(__u32 value, unsigned n); __u32 hid_field_extract(const struct hid_device *hid, __u8 *report, unsigned offset, unsigned n); #ifdef CONFIG_PM int hid_driver_suspend(struct hid_device *hdev, pm_message_t state); int hid_driver_reset_resume(struct hid_device *hdev); int hid_driver_resume(struct hid_device *hdev); #else static inline int hid_driver_suspend(struct hid_device *hdev, pm_message_t state) { return 0; } static inline int hid_driver_reset_resume(struct hid_device *hdev) { return 0; } static inline int hid_driver_resume(struct hid_device *hdev) { return 0; } #endif /** * hid_device_io_start - enable HID input during probe, remove * * @hid: the device * * This should only be called during probe or remove and only be * called by the thread calling probe or remove. It will allow * incoming packets to be delivered to the driver. */ static inline void hid_device_io_start(struct hid_device *hid) { if (hid->io_started) { dev_warn(&hid->dev, "io already started\n"); return; } hid->io_started = true; up(&hid->driver_input_lock); } /** * hid_device_io_stop - disable HID input during probe, remove * * @hid: the device * * Should only be called after hid_device_io_start. It will prevent * incoming packets from going to the driver for the duration of * probe, remove. 
If called during probe, packets will still go to the * driver after probe is complete. This function should only be called * by the thread calling probe or remove. */ static inline void hid_device_io_stop(struct hid_device *hid) { if (!hid->io_started) { dev_warn(&hid->dev, "io already stopped\n"); return; } hid->io_started = false; down(&hid->driver_input_lock); } /** * hid_map_usage - map usage input bits * * @hidinput: hidinput which we are interested in * @usage: usage to fill in * @bit: pointer to input->{}bit (out parameter) * @max: maximal valid usage->code to consider later (out parameter) * @type: input event type (EV_KEY, EV_REL, ...) * @c: code which corresponds to this usage and type * * The value pointed to by @bit will be set to NULL if either @type is * an unhandled event type, or if @c is out of range for @type. This * can be used as an error condition. */ static inline void hid_map_usage(struct hid_input *hidinput, struct hid_usage *usage, unsigned long **bit, int *max, __u8 type, unsigned int c) { struct input_dev *input = hidinput->input; unsigned long *bmap = NULL; unsigned int limit = 0; switch (type) { case EV_ABS: bmap = input->absbit; limit = ABS_MAX; break; case EV_REL: bmap = input->relbit; limit = REL_MAX; break; case EV_KEY: bmap = input->keybit; limit = KEY_MAX; break; case EV_LED: bmap = input->ledbit; limit = LED_MAX; break; case EV_MSC: bmap = input->mscbit; limit = MSC_MAX; break; } if (unlikely(c > limit || !bmap)) { pr_warn_ratelimited("%s: Invalid code %d type %d\n", input->name, c, type); *bit = NULL; return; } usage->type = type; usage->code = c; *max = limit; *bit = bmap; } /** * hid_map_usage_clear - map usage input bits and clear the input bit * * @hidinput: hidinput which we are interested in * @usage: usage to fill in * @bit: pointer to input->{}bit (out parameter) * @max: maximal valid usage->code to consider later (out parameter) * @type: input event type (EV_KEY, EV_REL, ...) * @c: code which corresponds to this usage and type * * The same as hid_map_usage, except the @c bit is also cleared in supported * bits (@bit). */ static inline void hid_map_usage_clear(struct hid_input *hidinput, struct hid_usage *usage, unsigned long **bit, int *max, __u8 type, __u16 c) { hid_map_usage(hidinput, usage, bit, max, type, c); if (*bit) clear_bit(usage->code, *bit); } /** * hid_parse - parse HW reports * * @hdev: hid device * * Call this from probe after you set up the device (if needed). Your * report_fixup will be called (if non-NULL) after reading raw report from * device before passing it to hid layer for real parsing. */ static inline int __must_check hid_parse(struct hid_device *hdev) { return hid_open_report(hdev); } int __must_check hid_hw_start(struct hid_device *hdev, unsigned int connect_mask); void hid_hw_stop(struct hid_device *hdev); int __must_check hid_hw_open(struct hid_device *hdev); void hid_hw_close(struct hid_device *hdev); void hid_hw_request(struct hid_device *hdev, struct hid_report *report, enum hid_class_request reqtype); int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype); int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len); /** * hid_hw_power - requests underlying HW to go into given power mode * * @hdev: hid device * @level: requested power level (one of %PM_HINT_* defines) * * This function requests underlying hardware to enter requested power * mode. 
*/ static inline int hid_hw_power(struct hid_device *hdev, int level) { return hdev->ll_driver->power ? hdev->ll_driver->power(hdev, level) : 0; } /** * hid_hw_idle - send idle request to device * * @hdev: hid device * @report: report to control * @idle: idle state * @reqtype: hid request type */ static inline int hid_hw_idle(struct hid_device *hdev, int report, int idle, enum hid_class_request reqtype) { if (hdev->ll_driver->idle) return hdev->ll_driver->idle(hdev, report, idle, reqtype); return 0; } /** * hid_hw_may_wakeup - return if the hid device may act as a wakeup source during system-suspend * * @hdev: hid device */ static inline bool hid_hw_may_wakeup(struct hid_device *hdev) { if (hdev->ll_driver->may_wakeup) return hdev->ll_driver->may_wakeup(hdev); if (hdev->dev.parent) return device_may_wakeup(hdev->dev.parent); return false; } /** * hid_hw_wait - wait for buffered io to complete * * @hdev: hid device */ static inline void hid_hw_wait(struct hid_device *hdev) { if (hdev->ll_driver->wait) hdev->ll_driver->wait(hdev); } /** * hid_report_len - calculate the report length * * @report: the report we want to know the length */ static inline u32 hid_report_len(struct hid_report *report) { return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); /* HID quirks API */ unsigned long hid_lookup_quirk(const struct hid_device *hdev); int hid_quirks_init(char **quirks_param, __u16 bus, int count); void hid_quirks_exit(__u16 bus); #ifdef CONFIG_HID_PID int hid_pidff_init(struct hid_device *hid); #else #define hid_pidff_init NULL #endif #define dbg_hid(fmt, ...) pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__) #define hid_err(hid, fmt, ...) \ dev_err(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_notice(hid, fmt, ...) \ dev_notice(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_warn(hid, fmt, ...) \ dev_warn(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_info(hid, fmt, ...) \ dev_info(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_dbg(hid, fmt, ...) \ dev_dbg(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_err_once(hid, fmt, ...) \ dev_err_once(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_notice_once(hid, fmt, ...) \ dev_notice_once(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_warn_once(hid, fmt, ...) \ dev_warn_once(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_info_once(hid, fmt, ...) \ dev_info_once(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_dbg_once(hid, fmt, ...) \ dev_dbg_once(&(hid)->dev, fmt, ##__VA_ARGS__) #endif
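/*
 * Editor-added sketch, not part of hid.h: a minimal HID driver wired up to
 * the API declared above. The vendor/product IDs and the "example" names are
 * hypothetical; real drivers follow the same shape - match via @id_table,
 * call hid_parse() and hid_hw_start() from probe, and register with
 * module_hid_driver().
 */
static const struct hid_device_id example_hid_ids[] = {
	{ HID_USB_DEVICE(0x1234, 0x5678) },	/* hypothetical VID/PID */
	{ }
};
MODULE_DEVICE_TABLE(hid, example_hid_ids);

static int example_hid_probe(struct hid_device *hdev,
			     const struct hid_device_id *id)
{
	int ret;

	ret = hid_parse(hdev);		/* parse the report descriptor */
	if (ret)
		return ret;

	/* connect hidinput/hidraw and start the underlying transport */
	return hid_hw_start(hdev, HID_CONNECT_DEFAULT);
}

static void example_hid_remove(struct hid_device *hdev)
{
	hid_hw_stop(hdev);
}

static struct hid_driver example_hid_driver = {
	.name		= "example-hid",
	.id_table	= example_hid_ids,
	.probe		= example_hid_probe,
	.remove		= example_hid_remove,
};
module_hid_driver(example_hid_driver);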
58 59 59 58 58 59 59 59 59 58 59 59 58 59 10 10 10 1 1 1 59 59 10 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 // SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "mmu_internal.h" #include "tdp_iter.h" #include "spte.h" /* * Recalculates the pointer to the SPTE for the current GFN and level and * reread the SPTE. */ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level); iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } /* * Return the TDP iterator to the root PT and allow it to continue its * traversal over the paging structure from there. */ void tdp_iter_restart(struct tdp_iter *iter) { iter->yielded = false; iter->yielded_gfn = iter->next_last_level_gfn; iter->level = iter->root_level; iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); iter->valid = true; } /* * Sets a TDP iterator to walk a pre-order traversal of the paging structure * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn) { if (WARN_ON_ONCE(!root || (root->role.level < 1) || (root->role.level > PT64_ROOT_MAX_LEVEL))) { iter->valid = false; return; } iter->next_last_level_gfn = next_last_level_gfn; iter->root_level = root->role.level; iter->min_level = min_level; iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt; iter->as_id = kvm_mmu_page_as_id(root); tdp_iter_restart(iter); } /* * Given an SPTE and its level, returns a pointer containing the host virtual * address of the child page table referenced by the SPTE. Returns null if * there is no such entry. */ tdp_ptep_t spte_to_child_pt(u64 spte, int level) { /* * There's no child entry if this entry isn't present or is a * last-level entry. */ if (!is_shadow_present_pte(spte) || is_last_spte(spte, level)) return NULL; return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT); } /* * Steps down one level in the paging structure towards the goal GFN. Returns * true if the iterator was able to step down a level, false otherwise. */ static bool try_step_down(struct tdp_iter *iter) { tdp_ptep_t child_pt; if (iter->level == iter->min_level) return false; /* * Reread the SPTE before stepping down to avoid traversing into page * tables that are no longer linked from this entry. */ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); child_pt = spte_to_child_pt(iter->old_spte, iter->level); if (!child_pt) return false; iter->level--; iter->pt_path[iter->level - 1] = child_pt; iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); return true; } /* * Steps to the next entry in the current page table, at the current page table * level. The next entry could point to a page backing guest memory or another * page table, or it could be non-present. 
Returns true if the iterator was * able to step to the next entry in the page table, false if the iterator was * already at the end of the current page table. */ static bool try_step_side(struct tdp_iter *iter) { /* * Check if the iterator is already at the end of the current page * table. */ if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == (SPTE_ENT_PER_PAGE - 1)) return false; iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); iter->next_last_level_gfn = iter->gfn; iter->sptep++; iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); return true; } /* * Tries to traverse back up a level in the paging structure so that the walk * can continue from the next entry in the parent page table. Returns true on a * successful step up, false if already in the root page. */ static bool try_step_up(struct tdp_iter *iter) { if (iter->level == iter->root_level) return false; iter->level++; iter->gfn = gfn_round_for_level(iter->gfn, iter->level); tdp_iter_refresh_sptep(iter); return true; } /* * Step to the next SPTE in a pre-order traversal of the paging structure. * To get to the next SPTE, the iterator either steps down towards the goal * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a * higher GFN. * * The basic algorithm is as follows: * 1. If the current SPTE is a non-last-level SPTE, step down into the page * table it points to. * 2. If the iterator cannot step down, it will try to step to the next SPTE * in the current page of the paging structure. * 3. If the iterator cannot step to the next entry in the current page, it will * try to step up to the parent paging structure page. In this case, that * SPTE will have already been visited, and so the iterator must also step * to the side again. */ void tdp_iter_next(struct tdp_iter *iter) { if (iter->yielded) { tdp_iter_restart(iter); return; } if (try_step_down(iter)) return; do { if (try_step_side(iter)) return; } while (try_step_up(iter)); iter->valid = false; }
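/*
 * Editor-added sketch, not part of tdp_iter.c: how a caller might drive the
 * iterator defined above. KVM itself wraps this pattern in the
 * for_each_tdp_pte*() macros (not shown here); the function name below is
 * hypothetical.
 */
static void tdp_walk_example(struct kvm_mmu_page *root, gfn_t start, gfn_t end)
{
	struct tdp_iter iter;

	tdp_iter_start(&iter, root, PG_LEVEL_4K, start);
	while (iter.valid && iter.gfn < end) {
		/* iter.sptep and iter.old_spte describe the entry at iter.level */
		tdp_iter_next(&iter);
	}
}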
2 1 2 1 1 1 6 8 5 5 1 5 5 3 3 1 3 3 3 2 3 3 2 1 2 2 1 1 1 7 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2022 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* Fair Capacity and Weighted Fair Queueing handling * RFC 8260 section 3.5 and 3.6 */ static void sctp_sched_fc_unsched_all(struct sctp_stream *stream); static int sctp_sched_wfq_set(struct sctp_stream *stream, __u16 sid, __u16 weight, gfp_t gfp) { struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext; if (!weight) return -EINVAL; soute->fc_weight = weight; return 0; } static int sctp_sched_wfq_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext; *value = soute->fc_weight; return 0; } static int sctp_sched_fc_set(struct sctp_stream *stream, __u16 sid, __u16 weight, gfp_t gfp) { return 0; } static int sctp_sched_fc_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { return 0; } static int sctp_sched_fc_init(struct sctp_stream *stream) { INIT_LIST_HEAD(&stream->fc_list); return 0; } static int sctp_sched_fc_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&soute->fc_list); soute->fc_length = 0; soute->fc_weight = 1; return 0; } static void sctp_sched_fc_free_sid(struct sctp_stream *stream, __u16 sid) { } static void sctp_sched_fc_sched(struct sctp_stream *stream, struct sctp_stream_out_ext *soute) { struct sctp_stream_out_ext *pos; if (!list_empty(&soute->fc_list)) return; list_for_each_entry(pos, &stream->fc_list, fc_list) if ((__u64)pos->fc_length * soute->fc_weight >= (__u64)soute->fc_length * pos->fc_weight) break; list_add_tail(&soute->fc_list, &pos->fc_list); } static void sctp_sched_fc_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { struct sctp_stream *stream; struct sctp_chunk *ch; __u16 sid; ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); sid = sctp_chunk_stream_no(ch); stream = &q->asoc->stream; sctp_sched_fc_sched(stream, SCTP_SO(stream, sid)->ext); } static struct sctp_chunk *sctp_sched_fc_dequeue(struct sctp_outq *q) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_stream_out_ext *soute; struct sctp_chunk *ch; /* Bail out quickly if queue is empty */ if (list_empty(&q->out_chunk_list)) return NULL; /* Find 
which chunk is next */ if (stream->out_curr) soute = stream->out_curr->ext; else soute = list_entry(stream->fc_list.next, struct sctp_stream_out_ext, fc_list); ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); sctp_sched_dequeue_common(q, ch); return ch; } static void sctp_sched_fc_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_stream_out_ext *soute, *pos; __u16 sid, i; sid = sctp_chunk_stream_no(ch); soute = SCTP_SO(stream, sid)->ext; /* reduce all fc_lengths by U32_MAX / 4 if the current fc_length overflows. */ if (soute->fc_length > U32_MAX - ch->skb->len) { for (i = 0; i < stream->outcnt; i++) { pos = SCTP_SO(stream, i)->ext; if (!pos) continue; if (pos->fc_length <= (U32_MAX >> 2)) { pos->fc_length = 0; continue; } pos->fc_length -= (U32_MAX >> 2); } } soute->fc_length += ch->skb->len; if (list_empty(&soute->outq)) { list_del_init(&soute->fc_list); return; } pos = soute; list_for_each_entry_continue(pos, &stream->fc_list, fc_list) if ((__u64)pos->fc_length * soute->fc_weight >= (__u64)soute->fc_length * pos->fc_weight) break; list_move_tail(&soute->fc_list, &pos->fc_list); } static void sctp_sched_fc_sched_all(struct sctp_stream *stream) { struct sctp_association *asoc; struct sctp_chunk *ch; asoc = container_of(stream, struct sctp_association, stream); list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { __u16 sid = sctp_chunk_stream_no(ch); if (SCTP_SO(stream, sid)->ext) sctp_sched_fc_sched(stream, SCTP_SO(stream, sid)->ext); } } static void sctp_sched_fc_unsched_all(struct sctp_stream *stream) { struct sctp_stream_out_ext *soute, *tmp; list_for_each_entry_safe(soute, tmp, &stream->fc_list, fc_list) list_del_init(&soute->fc_list); } static struct sctp_sched_ops sctp_sched_fc = { .set = sctp_sched_fc_set, .get = sctp_sched_fc_get, .init = sctp_sched_fc_init, .init_sid = sctp_sched_fc_init_sid, .free_sid = sctp_sched_fc_free_sid, .enqueue = sctp_sched_fc_enqueue, .dequeue = sctp_sched_fc_dequeue, .dequeue_done = sctp_sched_fc_dequeue_done, .sched_all = sctp_sched_fc_sched_all, .unsched_all = sctp_sched_fc_unsched_all, }; void sctp_sched_ops_fc_init(void) { sctp_sched_ops_register(SCTP_SS_FC, &sctp_sched_fc); } static struct sctp_sched_ops sctp_sched_wfq = { .set = sctp_sched_wfq_set, .get = sctp_sched_wfq_get, .init = sctp_sched_fc_init, .init_sid = sctp_sched_fc_init_sid, .free_sid = sctp_sched_fc_free_sid, .enqueue = sctp_sched_fc_enqueue, .dequeue = sctp_sched_fc_dequeue, .dequeue_done = sctp_sched_fc_dequeue_done, .sched_all = sctp_sched_fc_sched_all, .unsched_all = sctp_sched_fc_unsched_all, }; void sctp_sched_ops_wfq_init(void) { sctp_sched_ops_register(SCTP_SS_WFQ, &sctp_sched_wfq); }
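/*
 * Editor-added illustration, not part of this file: the list ordering used in
 * sctp_sched_fc_sched() and sctp_sched_fc_dequeue_done() above compares the
 * streams' fc_length / fc_weight ratios without dividing, by cross-multiplying
 * in 64 bits so the products cannot overflow. The helper name is hypothetical.
 */
static inline bool sctp_sched_fc_ratio_after(const struct sctp_stream_out_ext *a,
					     const struct sctp_stream_out_ext *b)
{
	/* true when a's length/weight ratio is >= b's */
	return (__u64)a->fc_length * b->fc_weight >=
	       (__u64)b->fc_length * a->fc_weight;
}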
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PSI_H #define _LINUX_PSI_H #include <linux/jump_label.h> #include <linux/psi_types.h> #include <linux/sched.h> #include <linux/poll.h> #include <linux/cgroup-defs.h> #include <linux/cgroup.h> struct seq_file; struct css_set; #ifdef CONFIG_PSI extern struct static_key_false psi_disabled; extern struct psi_group psi_system; void psi_init(void); void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, enum psi_res res, struct file *file, struct kernfs_open_file *of); void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); #ifdef CONFIG_CGROUPS static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) { return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; } int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); void psi_cgroup_restart(struct psi_group *group); #endif #else /* CONFIG_PSI */ static inline void psi_init(void) {} static inline void psi_memstall_enter(unsigned long *flags) {} static inline void psi_memstall_leave(unsigned long *flags) {} #ifdef CONFIG_CGROUPS static inline int psi_cgroup_alloc(struct cgroup *cgrp) { return 0; } static inline void psi_cgroup_free(struct cgroup *cgrp) { } static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) { rcu_assign_pointer(p->cgroups, to); } static inline void psi_cgroup_restart(struct psi_group *group) {} #endif #endif /* CONFIG_PSI */ #endif /* _LINUX_PSI_H */
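/*
 * Editor-added sketch, not part of psi.h: the usual calling pattern for the
 * memstall annotations declared above. A task brackets a section where it
 * blocks on memory (direct reclaim, swap-in, ...) so PSI can account the
 * stall time; the function name is hypothetical.
 */
static void example_memstall_section(void)
{
	unsigned long pflags;

	psi_memstall_enter(&pflags);
	/* ... work that may block on memory ... */
	psi_memstall_leave(&pflags);
}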
4 4 3 3 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> * * Based on ipt_random and ipt_nth by Fabrice MARIE <fabrice@netfilter.org>. */ #include <linux/init.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/slab.h> #include <linux/netfilter/xt_statistic.h> #include <linux/netfilter/x_tables.h> #include <linux/module.h> struct xt_statistic_priv { atomic_t count; } ____cacheline_aligned_in_smp; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)"); MODULE_ALIAS("ipt_statistic"); MODULE_ALIAS("ip6t_statistic"); static bool statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_statistic_info *info = par->matchinfo; bool ret = info->flags & XT_STATISTIC_INVERT; int nval, oval; switch (info->mode) { case XT_STATISTIC_MODE_RANDOM: if ((get_random_u32() & 0x7FFFFFFF) < info->u.random.probability) ret = !ret; break; case XT_STATISTIC_MODE_NTH: do { oval = atomic_read(&info->master->count); nval = (oval == info->u.nth.every) ? 0 : oval + 1; } while (atomic_cmpxchg(&info->master->count, oval, nval) != oval); if (nval == 0) ret = !ret; break; } return ret; } static int statistic_mt_check(const struct xt_mtchk_param *par) { struct xt_statistic_info *info = par->matchinfo; if (info->mode > XT_STATISTIC_MODE_MAX || info->flags & ~XT_STATISTIC_MASK) return -EINVAL; info->master = kzalloc(sizeof(*info->master), GFP_KERNEL); if (info->master == NULL) return -ENOMEM; atomic_set(&info->master->count, info->u.nth.count); return 0; } static void statistic_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_statistic_info *info = par->matchinfo; kfree(info->master); } static struct xt_match xt_statistic_mt_reg __read_mostly = { .name = "statistic", .revision = 0, .family = NFPROTO_UNSPEC, .match = statistic_mt, .checkentry = statistic_mt_check, .destroy = statistic_mt_destroy, .matchsize = sizeof(struct xt_statistic_info), .usersize = offsetof(struct xt_statistic_info, master), .me = THIS_MODULE, }; static int __init statistic_mt_init(void) { return xt_register_match(&xt_statistic_mt_reg); } static void __exit statistic_mt_exit(void) { xt_unregister_match(&xt_statistic_mt_reg); } module_init(statistic_mt_init); module_exit(statistic_mt_exit);
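/*
 * Editor-added illustration, not part of this module: how the two modes above
 * are parameterized. Random mode compares a 31-bit random value against
 * u.random.probability, so full scale (always match) is 2^31; nth mode's
 * shared counter wraps after (every + 1) increments and matches once per
 * wrap. The constants below are hypothetical and only illustrate the random
 * mode encoding.
 */
#define STATISTIC_PROB_ALWAYS	0x80000000UL		/* probability 1.0 */
#define STATISTIC_PROB_QUARTER	(0x80000000UL / 4)	/* 0x20000000, ~25% of packets */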
858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/gfp.h> #include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/fixmap.h> #include <asm/mtrr.h> #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); #endif #ifdef CONFIG_HIGHPTE #define PGTABLE_HIGHMEM __GFP_HIGHMEM #else #define PGTABLE_HIGHMEM 0 #endif #ifndef CONFIG_PARAVIRT static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) { tlb_remove_page(tlb, table); } #endif gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; pgtable_t pte_alloc_one(struct mm_struct *mm) { return __pte_alloc_one(mm, __userpte_alloc_gfp); } static int __init setup_userpte(char *arg) { if (!arg) return -EINVAL; /* * "userpte=nohigh" disables allocation of user pagetables in * high memory. */ if (strcmp(arg, "nohigh") == 0) __userpte_alloc_gfp &= ~__GFP_HIGHMEM; else return -EINVAL; return 0; } early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pagetable_pte_dtor(page_ptdesc(pte)); paravirt_release_pte(page_to_pfn(pte)); paravirt_tlb_remove_table(tlb, pte); } #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { struct ptdesc *ptdesc = virt_to_ptdesc(pmd); paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table * entries need a full cr3 reload to flush. */ #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif pagetable_pmd_dtor(ptdesc); paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { struct ptdesc *ptdesc = virt_to_ptdesc(pud); pagetable_pud_dtor(ptdesc); paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); paravirt_tlb_remove_table(tlb, virt_to_page(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ static inline void pgd_list_add(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_add(&ptdesc->pt_list, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_del(&ptdesc->pt_list); } #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) #define MAX_UNSHARED_PTRS_PER_PGD \ max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { virt_to_ptdesc(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { return page_ptdesc(page)->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the references from swapper_pg_dir. 
*/ if (CONFIG_PGTABLE_LEVELS == 2 || (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || CONFIG_PGTABLE_LEVELS >= 4) { clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); } /* list required to sync kernel mapping updates */ if (!SHARED_KERNEL_PMD) { pgd_set_mm(pgd, mm); pgd_list_add(pgd); } } static void pgd_dtor(pgd_t *pgd) { if (SHARED_KERNEL_PMD) return; spin_lock(&pgd_lock); pgd_list_del(pgd); spin_unlock(&pgd_lock); } /* * List of all pgd's needed for non-PAE so it can invalidate entries * in both cached and uncached pgd's; not needed for PAE since the * kernel pmd is shared. If PAE were not to share the pmd a similar * tactic would be needed. This is essentially codepath-based locking * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. * -- nyc */ #ifdef CONFIG_X86_PAE /* * In PAE mode, we need to do a cr3 reload (=tlb flush) when * updating the top-level pagetable entries to guarantee the * processor notices the update. Since this is expensive, and * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. * * Also, if we're in a paravirt environment where the kernel pmd is * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate * and initialize the kernel pmds here. */ #define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD #define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD /* * We allocate separate PMDs for the kernel part of the user page-table * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. */ #define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \ KERNEL_PGD_PTRS : 0) #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); /* Note: almost everything apart from _PAGE_PRESENT is reserved at the pmd (PDPT) level. */ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); /* * According to Intel App note "TLBs, Paging-Structure Caches, * and Their Invalidation", April 2007, document 317080-001, * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... */ flush_tlb_mm(mm); } #else /* !CONFIG_X86_PAE */ /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 #define MAX_PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 #define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; struct ptdesc *ptdesc; for (i = 0; i < count; i++) if (pmds[i]) { ptdesc = virt_to_ptdesc(pmds[i]); pagetable_pmd_dtor(ptdesc); pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } } static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; gfp &= ~__GFP_HIGHMEM; for (i = 0; i < count; i++) { pmd_t *pmd = NULL; struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) failed = true; if (ptdesc && !pagetable_pmd_ctor(ptdesc)) { pagetable_free(ptdesc); ptdesc = NULL; failed = true; } if (ptdesc) { mm_inc_nr_pmds(mm); pmd = ptdesc_address(ptdesc); } pmds[i] = pmd; } if (failed) { free_pmds(mm, pmds, count); return -ENOMEM; } return 0; } /* * Mop up any pmd pages which may still be attached to the pgd. 
* Normally they will be freed by munmap/exit_mmap, but any pmd we * preallocate which never got a corresponding vma will need to be * freed manually. */ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) { pgd_t pgd = *pgdp; if (pgd_val(pgd) != 0) { pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); pgd_clear(pgdp); paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); } } static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; for (i = 0; i < PREALLOCATED_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i]); #ifdef CONFIG_PAGE_TABLE_ISOLATION if (!boot_cpu_has(X86_FEATURE_PTI)) return; pgdp = kernel_to_user_pgdp(pgdp); for (i = 0; i < PREALLOCATED_USER_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); #endif } static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { p4d_t *p4d; pud_t *pud; int i; p4d = p4d_offset(pgd, 0); pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, pud, pmd); } } #ifdef CONFIG_PAGE_TABLE_ISOLATION static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); p4d_t *u_p4d; pud_t *u_pud; int i; u_p4d = p4d_offset(u_pgd, 0); u_pud = pud_offset(u_p4d, 0); s_pgd += KERNEL_PGD_BOUNDARY; u_pud += KERNEL_PGD_BOUNDARY; for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { pmd_t *pmd = pmds[i]; memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, u_pud, pmd); } } #else static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { } #endif /* * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also * assumes that pgd should be in one page. * * But kernel with PAE paging that is not running as a Xen domain * only needs to allocate 32 bytes for pgd instead of one page. */ #ifdef CONFIG_X86_PAE #include <linux/slab.h> #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #define PGD_ALIGN 32 static struct kmem_cache *pgd_cache; void __init pgtable_cache_init(void) { /* * When PAE kernel is running as a Xen domain, it does not use * shared kernel pmd. And this requires a whole page for pgd. */ if (!SHARED_KERNEL_PMD) return; /* * when PAE kernel is not running as a Xen domain, it uses * shared kernel pmd. Shared kernel pmd does not require a whole * page for pgd. We are able to just allocate a 32-byte for pgd. * During boot time, we create a 32-byte slab for pgd table allocation. */ pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, SLAB_PANIC, NULL); } static inline pgd_t *_pgd_alloc(void) { /* * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, PGD_ALLOCATION_ORDER); /* * Now PAE kernel is not running as a Xen domain. We can allocate * a 32-byte slab for pgd to save memory space. 
*/ return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER); } static inline void _pgd_free(pgd_t *pgd) { if (!SHARED_KERNEL_PMD) free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); else kmem_cache_free(pgd_cache, pgd); } #else static inline pgd_t *_pgd_alloc(void) { return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) { free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); } #endif /* CONFIG_X86_PAE */ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; pmd_t *pmds[MAX_PREALLOCATED_PMDS]; pgd = _pgd_alloc(); if (pgd == NULL) goto out; mm->pgd = pgd; if (sizeof(pmds) != 0 && preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; if (sizeof(u_pmds) != 0 && preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; if (paravirt_pgd_alloc(mm) != 0) goto out_free_user_pmds; /* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */ spin_lock(&pgd_lock); pgd_ctor(mm, pgd); if (sizeof(pmds) != 0) pgd_prepopulate_pmd(mm, pgd, pmds); if (sizeof(u_pmds) != 0) pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; out_free_user_pmds: if (sizeof(u_pmds) != 0) free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: if (sizeof(pmds) != 0) free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(pgd); out: return NULL; } void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); _pgd_free(pgd); } /* * Used to set accessed or dirty bits in the page table entries * on other architectures. On x86, the accessed and dirty bits * are tracked by hardware. However, do_wp_page calls this function * to also make the pte writeable at the same time the dirty bit is * set. In that case we do actually need to write the PTE. */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(*ptep, entry); if (changed && dirty) set_pte(ptep, entry); return changed; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed && dirty) { set_pmd(pmdp, entry); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { int changed = !pud_same(*pudp, entry); VM_BUG_ON(address & ~HPAGE_PUD_MASK); if (changed && dirty) { set_pud(pudp, entry); /* * We had a write-protection fault here and changed the pud * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. 
*/ } return changed; } #endif int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { int ret = 0; if (pte_young(*ptep)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *) &ptep->pte); return ret; } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { int ret = 0; if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); return ret; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { int ret = 0; if (pud_young(*pudp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pudp); return ret; } #endif int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* * On x86 CPUs, clearing the accessed bit without a TLB flush * doesn't cause data corruption. [ It could cause incorrect * page aging and the (mistaken) reclaim of hot pages, but the * chance of that should be relatively low. ] * * So as a performance optimization don't flush the TLB when * clearing the accessed bit, it will eventually be flushed by * a context switch or a VM operation anyway. [ In the rare * event of it not getting flushed for a long time the delay * shouldn't really matter because there's no real memory * pressure for swapout to react to. ] */ return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { int young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { /* * No flush is necessary. Once an invalid PTE is established, the PTE's * access and dirty bits cannot be updated. */ return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); } #endif /** * reserve_top_address - reserves a hole in the top of kernel address space * @reserve - size of hole to reserve * * Can be used to relocate the fixmap area and poke a hole in the top * of kernel address space to make room for a hypervisor. */ void __init reserve_top_address(unsigned long reserve) { #ifdef CONFIG_X86_32 BUG_ON(fixmaps_set > 0); __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", -reserve, __FIXADDR_TOP + PAGE_SIZE); #endif } int fixmaps_set; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); #ifdef CONFIG_X86_64 /* * Ensure that the static initial page tables are covering the * fixmap completely. 
*/ BUILD_BUG_ON(__end_of_permanent_fixed_addresses > (FIXMAP_PMD_NUM * PTRS_PER_PTE)); #endif if (idx >= __end_of_fixed_addresses) { BUG(); return; } set_pte_vaddr(address, pte); fixmaps_set++; } void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { /* Sanitize 'prot' against any unsupported bits: */ pgprot_val(flags) &= __default_kernel_pte_mask; __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifdef CONFIG_X86_5LEVEL /** * p4d_set_huge - setup kernel P4D mapping * * No 512GB pages yet -- always return 0 */ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } /** * p4d_clear_huge - clear kernel P4D mapping when it is set * * No 512GB pages yet -- always return 0 */ void p4d_clear_huge(p4d_t *p4d) { } #endif /** * pud_set_huge - setup kernel PUD mapping * * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this * function sets up a huge page only if the complete range has the same MTRR * caching mode. * * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger * page mapping attempt fails. * * Returns 1 on success and 0 on failure. */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); if (!uniform) return 0; /* Bail out if we are we on a populated non-leaf entry: */ if (pud_present(*pud) && !pud_huge(*pud)) return 0; set_pte((pte_t *)pud, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pmd_set_huge - setup kernel PMD mapping * * See text over pud_set_huge() above. * * Returns 1 on success and 0 on failure. */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); if (!uniform) { pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", __func__, addr, addr + PMD_SIZE); return 0; } /* Bail out if we are we on a populated non-leaf entry: */ if (pmd_present(*pmd) && !pmd_huge(*pmd)) return 0; set_pte((pte_t *)pmd, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pud_clear_huge - clear kernel PUD mapping when it is set * * Returns 1 on success and 0 on failure (no PUD map is found). */ int pud_clear_huge(pud_t *pud) { if (pud_large(*pud)) { pud_clear(pud); return 1; } return 0; } /** * pmd_clear_huge - clear kernel PMD mapping when it is set * * Returns 1 on success and 0 on failure (no PMD map is found). */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_large(*pmd)) { pmd_clear(pmd); return 1; } return 0; } #ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear pud entry and free pmd page. * @pud: Pointer to a PUD. * @addr: Virtual address associated with pud. * * Context: The pud range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. * * NOTE: Callers must allow a single page allocation. 
*/ int pud_free_pmd_page(pud_t *pud, unsigned long addr) { pmd_t *pmd, *pmd_sv; pte_t *pte; int i; pmd = pud_pgtable(*pud); pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); if (!pmd_sv) return 0; for (i = 0; i < PTRS_PER_PMD; i++) { pmd_sv[i] = pmd[i]; if (!pmd_none(pmd[i])) pmd_clear(&pmd[i]); } pud_clear(pud); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(pmd_sv[i])) { pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); free_page((unsigned long)pte); } } free_page((unsigned long)pmd_sv); pagetable_pmd_dtor(virt_to_ptdesc(pmd)); free_page((unsigned long)pmd); return 1; } /** * pmd_free_pte_page - Clear pmd entry and free pte page. * @pmd: Pointer to a PMD. * @addr: Virtual address associated with pmd. * * Context: The pmd range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { pte_t *pte; pte = (pte_t *)pmd_page_vaddr(*pmd); pmd_clear(pmd); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); free_page((unsigned long)pte); return 1; } #else /* !CONFIG_X86_64 */ /* * Disable free page handling on x86-PAE. This assures that ioremap() * does not update sync'd pmd entries. See vmalloc_sync_one(). */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return pmd_none(*pmd); } #endif /* CONFIG_X86_64 */ #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pte_mkwrite_shstk(pte); pte = pte_mkwrite_novma(pte); return pte_clear_saveddirty(pte); } pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pmd_mkwrite_shstk(pmd); pmd = pmd_mkwrite_novma(pmd); return pmd_clear_saveddirty(pmd); } void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { /* * Hardware before shadow stack can (rarely) set Dirty=1 * on a Write=0 PTE. So the below condition * only indicates a software bug when shadow stack is * supported by the HW. This checking is covered in * pte_shstk(). */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pte_shstk(pte)); } void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pmd_shstk(pmd)); }
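/*
 * Editor-added sketch, not part of this file: the "decrease page size on
 * failure" order that the pud_set_huge()/pmd_set_huge() comments above ask
 * callers to follow. The real loop lives in the generic ioremap/vmap
 * huge-page mapping code; the function and parameter names here are
 * hypothetical.
 */
static void map_one_region_example(pud_t *pud, pmd_t *pmd, pte_t *pte,
				   phys_addr_t phys, pgprot_t prot)
{
	if (pud_set_huge(pud, phys, prot))	/* 1 GiB mapping succeeded */
		return;
	if (pmd_set_huge(pmd, phys, prot))	/* fall back to 2 MiB */
		return;
	/* smallest granularity: map a single 4 KiB page */
	set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, prot));
}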
1 1 1 1 1 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. #include <linux/debugfs.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <net/udp_tunnel.h> #include "netdevsim.h" static int nsim_udp_tunnel_set_port(struct net_device *dev, unsigned int table, unsigned int entry, struct udp_tunnel_info *ti) { struct netdevsim *ns = netdev_priv(dev); int ret; ret = -ns->udp_ports.inject_error; ns->udp_ports.inject_error = 0; if (ns->udp_ports.sleep) msleep(ns->udp_ports.sleep); if (!ret) { if (ns->udp_ports.ports[table][entry]) { WARN(1, "entry already in use\n"); ret = -EBUSY; } else { ns->udp_ports.ports[table][entry] = be16_to_cpu(ti->port) << 16 | ti->type; } } netdev_info(dev, "set [%d, %d] type %d family %d port %d - %d\n", table, entry, ti->type, ti->sa_family, ntohs(ti->port), ret); return ret; } static int nsim_udp_tunnel_unset_port(struct net_device *dev, unsigned int table, unsigned int entry, struct udp_tunnel_info *ti) { struct netdevsim *ns = netdev_priv(dev); int ret; ret = -ns->udp_ports.inject_error; ns->udp_ports.inject_error = 0; if (ns->udp_ports.sleep) msleep(ns->udp_ports.sleep); if (!ret) { u32 val = be16_to_cpu(ti->port) << 16 | ti->type; if (val == ns->udp_ports.ports[table][entry]) { ns->udp_ports.ports[table][entry] = 0; } else { WARN(1, "entry not installed %x vs %x\n", val, ns->udp_ports.ports[table][entry]); ret = -ENOENT; } } netdev_info(dev, "unset [%d, %d] type %d family %d port %d - %d\n", table, entry, ti->type, ti->sa_family, ntohs(ti->port), ret); return ret; } static int nsim_udp_tunnel_sync_table(struct net_device *dev, unsigned int table) { struct netdevsim *ns = netdev_priv(dev); struct udp_tunnel_info ti; unsigned int i; int ret; ret = -ns->udp_ports.inject_error; ns->udp_ports.inject_error = 0; for (i = 0; i < NSIM_UDP_TUNNEL_N_PORTS; i++) { udp_tunnel_nic_get_port(dev, table, i, &ti); ns->udp_ports.ports[table][i] = be16_to_cpu(ti.port) << 16 | ti.type; } return ret; } static const struct udp_tunnel_nic_info nsim_udp_tunnel_info = { .set_port = nsim_udp_tunnel_set_port, .unset_port = nsim_udp_tunnel_unset_port, .sync_table = nsim_udp_tunnel_sync_table, .tables = { { .n_entries = NSIM_UDP_TUNNEL_N_PORTS, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, { .n_entries = NSIM_UDP_TUNNEL_N_PORTS, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE | UDP_TUNNEL_TYPE_VXLAN_GPE, }, }, }; static ssize_t nsim_udp_tunnels_info_reset_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct net_device *dev = file->private_data; struct netdevsim *ns = netdev_priv(dev); memset(ns->udp_ports.ports, 0, sizeof(ns->udp_ports.__ports)); rtnl_lock(); udp_tunnel_nic_reset_ntf(dev); rtnl_unlock(); return count; } static const struct file_operations 
nsim_udp_tunnels_info_reset_fops = { .open = simple_open, .write = nsim_udp_tunnels_info_reset_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; int nsim_udp_tunnels_info_create(struct nsim_dev *nsim_dev, struct net_device *dev) { struct netdevsim *ns = netdev_priv(dev); struct udp_tunnel_nic_info *info; if (nsim_dev->udp_ports.shared && nsim_dev->udp_ports.open_only) { dev_err(&nsim_dev->nsim_bus_dev->dev, "shared can't be used in conjunction with open_only\n"); return -EINVAL; } if (!nsim_dev->udp_ports.shared) ns->udp_ports.ports = ns->udp_ports.__ports; else ns->udp_ports.ports = nsim_dev->udp_ports.__ports; debugfs_create_u32("udp_ports_inject_error", 0600, ns->nsim_dev_port->ddir, &ns->udp_ports.inject_error); ns->udp_ports.dfs_ports[0].array = ns->udp_ports.ports[0]; ns->udp_ports.dfs_ports[0].n_elements = NSIM_UDP_TUNNEL_N_PORTS; debugfs_create_u32_array("udp_ports_table0", 0400, ns->nsim_dev_port->ddir, &ns->udp_ports.dfs_ports[0]); ns->udp_ports.dfs_ports[1].array = ns->udp_ports.ports[1]; ns->udp_ports.dfs_ports[1].n_elements = NSIM_UDP_TUNNEL_N_PORTS; debugfs_create_u32_array("udp_ports_table1", 0400, ns->nsim_dev_port->ddir, &ns->udp_ports.dfs_ports[1]); debugfs_create_file("udp_ports_reset", 0200, ns->nsim_dev_port->ddir, dev, &nsim_udp_tunnels_info_reset_fops); /* Note: it's not normal to allocate the info struct like this! * Drivers are expected to use a static const one, here we're testing. */ info = kmemdup(&nsim_udp_tunnel_info, sizeof(nsim_udp_tunnel_info), GFP_KERNEL); if (!info) return -ENOMEM; ns->udp_ports.sleep = nsim_dev->udp_ports.sleep; if (nsim_dev->udp_ports.sync_all) { info->set_port = NULL; info->unset_port = NULL; } else { info->sync_table = NULL; } if (ns->udp_ports.sleep) info->flags |= UDP_TUNNEL_NIC_INFO_MAY_SLEEP; if (nsim_dev->udp_ports.open_only) info->flags |= UDP_TUNNEL_NIC_INFO_OPEN_ONLY; if (nsim_dev->udp_ports.ipv4_only) info->flags |= UDP_TUNNEL_NIC_INFO_IPV4_ONLY; if (nsim_dev->udp_ports.shared) info->shared = &nsim_dev->udp_ports.utn_shared; if (nsim_dev->udp_ports.static_iana_vxlan) info->flags |= UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN; dev->udp_tunnel_nic_info = info; return 0; } void nsim_udp_tunnels_info_destroy(struct net_device *dev) { kfree(dev->udp_tunnel_nic_info); dev->udp_tunnel_nic_info = NULL; } void nsim_udp_tunnels_debugfs_create(struct nsim_dev *nsim_dev) { debugfs_create_bool("udp_ports_sync_all", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.sync_all); debugfs_create_bool("udp_ports_open_only", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.open_only); debugfs_create_bool("udp_ports_ipv4_only", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.ipv4_only); debugfs_create_bool("udp_ports_shared", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.shared); debugfs_create_bool("udp_ports_static_iana_vxlan", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.static_iana_vxlan); debugfs_create_u32("udp_ports_sleep", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.sleep); }
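/*
 * Illustrative sketch (not part of netdevsim): the comment above notes that
 * a real driver is expected to point dev->udp_tunnel_nic_info at a static
 * const table instead of a kmemdup()'d copy. A minimal driver-side table
 * might look like the following; the foo_* callbacks are hypothetical and
 * would have the same prototypes as the nsim_* handlers above.
 */
static int foo_udp_tunnel_set_port(struct net_device *dev, unsigned int table,
				   unsigned int entry, struct udp_tunnel_info *ti);
static int foo_udp_tunnel_unset_port(struct net_device *dev, unsigned int table,
				     unsigned int entry, struct udp_tunnel_info *ti);

static const struct udp_tunnel_nic_info foo_udp_tunnel_info = {
	.set_port	= foo_udp_tunnel_set_port,
	.unset_port	= foo_udp_tunnel_unset_port,
	.tables		= {
		{
			.n_entries	= 4,
			.tunnel_types	= UDP_TUNNEL_TYPE_VXLAN,
		},
	},
};

/* In the driver's probe path: dev->udp_tunnel_nic_info = &foo_udp_tunnel_info; */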
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *	Declarations of AX.25 type objects.
 *
 *	Alan Cox (GW4PTS)	10/11/93
 */
#ifndef _AX25_H
#define _AX25_H

#include <linux/ax25.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/refcount.h>
#include <net/neighbour.h>
#include <net/sock.h>
#include <linux/seq_file.h>

#define	AX25_T1CLAMPLO			1
#define	AX25_T1CLAMPHI			(30 * HZ)

#define	AX25_BPQ_HEADER_LEN		16
#define	AX25_KISS_HEADER_LEN		1

#define	AX25_HEADER_LEN			17
#define	AX25_ADDR_LEN			7
#define	AX25_DIGI_HEADER_LEN		(AX25_MAX_DIGIS * AX25_ADDR_LEN)
#define	AX25_MAX_HEADER_LEN		(AX25_HEADER_LEN + AX25_DIGI_HEADER_LEN)

/* AX.25 Protocol IDs */
#define AX25_P_ROSE			0x01
#define AX25_P_VJCOMP			0x06	/* Compressed TCP/IP packet   */
						/* Van Jacobsen (RFC 1144)    */
#define	AX25_P_VJUNCOMP			0x07	/* Uncompressed TCP/IP packet */
						/* Van Jacobsen (RFC 1144)    */
#define	AX25_P_SEGMENT			0x08	/* Segmentation fragment      */
#define AX25_P_TEXNET			0xc3	/* TEXTNET datagram protocol  */
#define	AX25_P_LQ			0xc4	/* Link Quality Protocol      */
#define	AX25_P_ATALK			0xca	/* Appletalk                  */
#define	AX25_P_ATALK_ARP		0xcb	/* Appletalk ARP              */
#define	AX25_P_IP			0xcc	/* ARPA Internet Protocol     */
#define	AX25_P_ARP			0xcd	/* ARPA Address Resolution    */
#define	AX25_P_FLEXNET			0xce	/* FlexNet                    */
#define	AX25_P_NETROM			0xcf	/* NET/ROM                    */
#define	AX25_P_TEXT			0xF0	/* No layer 3 protocol impl.
*/ /* AX.25 Segment control values */ #define AX25_SEG_REM 0x7F #define AX25_SEG_FIRST 0x80 #define AX25_CBIT 0x80 /* Command/Response bit */ #define AX25_EBIT 0x01 /* HDLC Address Extension bit */ #define AX25_HBIT 0x80 /* Has been repeated bit */ #define AX25_SSSID_SPARE 0x60 /* Unused bits in SSID for standard AX.25 */ #define AX25_ESSID_SPARE 0x20 /* Unused bits in SSID for extended AX.25 */ #define AX25_DAMA_FLAG 0x20 /* Well, it is *NOT* unused! (dl1bke 951121 */ #define AX25_COND_ACK_PENDING 0x01 #define AX25_COND_REJECT 0x02 #define AX25_COND_PEER_RX_BUSY 0x04 #define AX25_COND_OWN_RX_BUSY 0x08 #define AX25_COND_DAMA_MODE 0x10 #ifndef _LINUX_NETDEVICE_H #include <linux/netdevice.h> #endif /* Upper sub-layer (LAPB) definitions */ /* Control field templates */ #define AX25_I 0x00 /* Information frames */ #define AX25_S 0x01 /* Supervisory frames */ #define AX25_RR 0x01 /* Receiver ready */ #define AX25_RNR 0x05 /* Receiver not ready */ #define AX25_REJ 0x09 /* Reject */ #define AX25_U 0x03 /* Unnumbered frames */ #define AX25_SABM 0x2f /* Set Asynchronous Balanced Mode */ #define AX25_SABME 0x6f /* Set Asynchronous Balanced Mode Extended */ #define AX25_DISC 0x43 /* Disconnect */ #define AX25_DM 0x0f /* Disconnected mode */ #define AX25_UA 0x63 /* Unnumbered acknowledge */ #define AX25_FRMR 0x87 /* Frame reject */ #define AX25_UI 0x03 /* Unnumbered information */ #define AX25_XID 0xaf /* Exchange information */ #define AX25_TEST 0xe3 /* Test */ #define AX25_PF 0x10 /* Poll/final bit for standard AX.25 */ #define AX25_EPF 0x01 /* Poll/final bit for extended AX.25 */ #define AX25_ILLEGAL 0x100 /* Impossible to be a real frame type */ #define AX25_POLLOFF 0 #define AX25_POLLON 1 /* AX25 L2 C-bit */ #define AX25_COMMAND 1 #define AX25_RESPONSE 2 /* Define Link State constants. 
*/ enum { AX25_STATE_0, /* Listening */ AX25_STATE_1, /* SABM sent */ AX25_STATE_2, /* DISC sent */ AX25_STATE_3, /* Established */ AX25_STATE_4 /* Recovery */ }; #define AX25_MODULUS 8 /* Standard AX.25 modulus */ #define AX25_EMODULUS 128 /* Extended AX.25 modulus */ enum { AX25_PROTO_STD_SIMPLEX, AX25_PROTO_STD_DUPLEX, #ifdef CONFIG_AX25_DAMA_SLAVE AX25_PROTO_DAMA_SLAVE, #ifdef CONFIG_AX25_DAMA_MASTER AX25_PROTO_DAMA_MASTER, #define AX25_PROTO_MAX AX25_PROTO_DAMA_MASTER #endif #endif __AX25_PROTO_MAX, AX25_PROTO_MAX = __AX25_PROTO_MAX -1 }; enum { AX25_VALUES_IPDEFMODE, /* 0=DG 1=VC */ AX25_VALUES_AXDEFMODE, /* 0=Normal 1=Extended Seq Nos */ AX25_VALUES_BACKOFF, /* 0=None 1=Linear 2=Exponential */ AX25_VALUES_CONMODE, /* Allow connected modes - 0=No 1=no "PID text" 2=all PIDs */ AX25_VALUES_WINDOW, /* Default window size for standard AX.25 */ AX25_VALUES_EWINDOW, /* Default window size for extended AX.25 */ AX25_VALUES_T1, /* Default T1 timeout value */ AX25_VALUES_T2, /* Default T2 timeout value */ AX25_VALUES_T3, /* Default T3 timeout value */ AX25_VALUES_IDLE, /* Connected mode idle timer */ AX25_VALUES_N2, /* Default N2 value */ AX25_VALUES_PACLEN, /* AX.25 MTU */ AX25_VALUES_PROTOCOL, /* Std AX.25, DAMA Slave, DAMA Master */ AX25_VALUES_DS_TIMEOUT, /* DAMA Slave timeout */ AX25_MAX_VALUES /* THIS MUST REMAIN THE LAST ENTRY OF THIS LIST */ }; #define AX25_DEF_IPDEFMODE 0 /* Datagram */ #define AX25_DEF_AXDEFMODE 0 /* Normal */ #define AX25_DEF_BACKOFF 1 /* Linear backoff */ #define AX25_DEF_CONMODE 2 /* Connected mode allowed */ #define AX25_DEF_WINDOW 2 /* Window=2 */ #define AX25_DEF_EWINDOW 32 /* Module-128 Window=32 */ #define AX25_DEF_T1 10000 /* T1=10s */ #define AX25_DEF_T2 3000 /* T2=3s */ #define AX25_DEF_T3 300000 /* T3=300s */ #define AX25_DEF_N2 10 /* N2=10 */ #define AX25_DEF_IDLE 0 /* Idle=None */ #define AX25_DEF_PACLEN 256 /* Paclen=256 */ #define AX25_DEF_PROTOCOL AX25_PROTO_STD_SIMPLEX /* Standard AX.25 */ #define AX25_DEF_DS_TIMEOUT 180000 /* DAMA timeout 3 minutes */ typedef struct ax25_uid_assoc { struct hlist_node uid_node; refcount_t refcount; kuid_t uid; ax25_address call; } ax25_uid_assoc; #define ax25_uid_for_each(__ax25, list) \ hlist_for_each_entry(__ax25, list, uid_node) #define ax25_uid_hold(ax25) \ refcount_inc(&((ax25)->refcount)) static inline void ax25_uid_put(ax25_uid_assoc *assoc) { if (refcount_dec_and_test(&assoc->refcount)) { kfree(assoc); } } typedef struct { ax25_address calls[AX25_MAX_DIGIS]; unsigned char repeated[AX25_MAX_DIGIS]; unsigned char ndigi; signed char lastrepeat; } ax25_digi; typedef struct ax25_route { struct ax25_route *next; ax25_address callsign; struct net_device *dev; ax25_digi *digipeat; char ip_mode; } ax25_route; void __ax25_put_route(ax25_route *ax25_rt); extern rwlock_t ax25_route_lock; static inline void ax25_route_lock_use(void) { read_lock(&ax25_route_lock); } static inline void ax25_route_lock_unuse(void) { read_unlock(&ax25_route_lock); } typedef struct { char slave; /* slave_mode? */ struct timer_list slave_timer; /* timeout timer */ unsigned short slave_timeout; /* when? 
*/ } ax25_dama_info; struct ctl_table; typedef struct ax25_dev { struct ax25_dev *next; struct net_device *dev; netdevice_tracker dev_tracker; struct net_device *forward; struct ctl_table_header *sysheader; int values[AX25_MAX_VALUES]; #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_dama_info dama; #endif refcount_t refcount; bool device_up; } ax25_dev; typedef struct ax25_cb { struct hlist_node ax25_node; ax25_address source_addr, dest_addr; ax25_digi *digipeat; ax25_dev *ax25_dev; netdevice_tracker dev_tracker; unsigned char iamdigi; unsigned char state, modulus, pidincl; unsigned short vs, vr, va; unsigned char condition, backoff; unsigned char n2, n2count; struct timer_list t1timer, t2timer, t3timer, idletimer; unsigned long t1, t2, t3, idle, rtt; unsigned short paclen, fragno, fraglen; struct sk_buff_head write_queue; struct sk_buff_head reseq_queue; struct sk_buff_head ack_queue; struct sk_buff_head frag_queue; unsigned char window; struct timer_list timer, dtimer; struct sock *sk; /* Backlink to socket */ refcount_t refcount; } ax25_cb; struct ax25_sock { struct sock sk; struct ax25_cb *cb; }; #define ax25_sk(ptr) container_of_const(ptr, struct ax25_sock, sk) static inline struct ax25_cb *sk_to_ax25(const struct sock *sk) { return ax25_sk(sk)->cb; } #define ax25_for_each(__ax25, list) \ hlist_for_each_entry(__ax25, list, ax25_node) #define ax25_cb_hold(__ax25) \ refcount_inc(&((__ax25)->refcount)) static __inline__ void ax25_cb_put(ax25_cb *ax25) { if (refcount_dec_and_test(&ax25->refcount)) { kfree(ax25->digipeat); kfree(ax25); } } static inline void ax25_dev_hold(ax25_dev *ax25_dev) { refcount_inc(&ax25_dev->refcount); } static inline void ax25_dev_put(ax25_dev *ax25_dev) { if (refcount_dec_and_test(&ax25_dev->refcount)) { kfree(ax25_dev); } } static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->dev = dev; skb_reset_mac_header(skb); skb->pkt_type = PACKET_HOST; return htons(ETH_P_AX25); } /* af_ax25.c */ extern struct hlist_head ax25_list; extern spinlock_t ax25_list_lock; void ax25_cb_add(ax25_cb *); struct sock *ax25_find_listener(ax25_address *, int, struct net_device *, int); struct sock *ax25_get_socket(ax25_address *, ax25_address *, int); ax25_cb *ax25_find_cb(const ax25_address *, ax25_address *, ax25_digi *, struct net_device *); void ax25_send_to_raw(ax25_address *, struct sk_buff *, int); void ax25_destroy_socket(ax25_cb *); ax25_cb * __must_check ax25_create_cb(void); void ax25_fillin_cb(ax25_cb *, ax25_dev *); struct sock *ax25_make_new(struct sock *, struct ax25_dev *); /* ax25_addr.c */ extern const ax25_address ax25_bcast; extern const ax25_address ax25_defaddr; extern const ax25_address null_ax25_address; char *ax2asc(char *buf, const ax25_address *); void asc2ax(ax25_address *addr, const char *callsign); int ax25cmp(const ax25_address *, const ax25_address *); int ax25digicmp(const ax25_digi *, const ax25_digi *); const unsigned char *ax25_addr_parse(const unsigned char *, int, ax25_address *, ax25_address *, ax25_digi *, int *, int *); int ax25_addr_build(unsigned char *, const ax25_address *, const ax25_address *, const ax25_digi *, int, int); int ax25_addr_size(const ax25_digi *); void ax25_digi_invert(const ax25_digi *, ax25_digi *); /* ax25_dev.c */ extern ax25_dev *ax25_dev_list; extern spinlock_t ax25_dev_lock; #if IS_ENABLED(CONFIG_AX25) static inline ax25_dev *ax25_dev_ax25dev(struct net_device *dev) { return dev->ax25_ptr; } #endif ax25_dev *ax25_addr_ax25dev(ax25_address *); void 
ax25_dev_device_up(struct net_device *); void ax25_dev_device_down(struct net_device *); int ax25_fwd_ioctl(unsigned int, struct ax25_fwd_struct *); struct net_device *ax25_fwd_dev(struct net_device *); void ax25_dev_free(void); /* ax25_ds_in.c */ int ax25_ds_frame_in(ax25_cb *, struct sk_buff *, int); /* ax25_ds_subr.c */ void ax25_ds_nr_error_recovery(ax25_cb *); void ax25_ds_enquiry_response(ax25_cb *); void ax25_ds_establish_data_link(ax25_cb *); void ax25_dev_dama_off(ax25_dev *); void ax25_dama_on(ax25_cb *); void ax25_dama_off(ax25_cb *); /* ax25_ds_timer.c */ void ax25_ds_setup_timer(ax25_dev *); void ax25_ds_set_timer(ax25_dev *); void ax25_ds_del_timer(ax25_dev *); void ax25_ds_timer(ax25_cb *); void ax25_ds_t1_timeout(ax25_cb *); void ax25_ds_heartbeat_expiry(ax25_cb *); void ax25_ds_t3timer_expiry(ax25_cb *); void ax25_ds_idletimer_expiry(ax25_cb *); /* ax25_iface.c */ struct ax25_protocol { struct ax25_protocol *next; unsigned int pid; int (*func)(struct sk_buff *, ax25_cb *); }; void ax25_register_pid(struct ax25_protocol *ap); void ax25_protocol_release(unsigned int); struct ax25_linkfail { struct hlist_node lf_node; void (*func)(ax25_cb *, int); }; void ax25_linkfail_register(struct ax25_linkfail *lf); void ax25_linkfail_release(struct ax25_linkfail *lf); int __must_check ax25_listen_register(const ax25_address *, struct net_device *); void ax25_listen_release(const ax25_address *, struct net_device *); int(*ax25_protocol_function(unsigned int))(struct sk_buff *, ax25_cb *); int ax25_listen_mine(const ax25_address *, struct net_device *); void ax25_link_failed(ax25_cb *, int); int ax25_protocol_is_registered(unsigned int); /* ax25_in.c */ int ax25_rx_iframe(ax25_cb *, struct sk_buff *); int ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); /* ax25_ip.c */ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb); extern const struct header_ops ax25_header_ops; /* ax25_out.c */ ax25_cb *ax25_send_frame(struct sk_buff *, int, const ax25_address *, ax25_address *, ax25_digi *, struct net_device *); void ax25_output(ax25_cb *, int, struct sk_buff *); void ax25_kick(ax25_cb *); void ax25_transmit_buffer(ax25_cb *, struct sk_buff *, int); void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev); int ax25_check_iframes_acked(ax25_cb *, unsigned short); /* ax25_route.c */ void ax25_rt_device_down(struct net_device *); int ax25_rt_ioctl(unsigned int, void __user *); extern const struct seq_operations ax25_rt_seqops; ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev); int ax25_rt_autobind(ax25_cb *, ax25_address *); struct sk_buff *ax25_rt_build_path(struct sk_buff *, ax25_address *, ax25_address *, ax25_digi *); void ax25_rt_free(void); /* ax25_std_in.c */ int ax25_std_frame_in(ax25_cb *, struct sk_buff *, int); /* ax25_std_subr.c */ void ax25_std_nr_error_recovery(ax25_cb *); void ax25_std_establish_data_link(ax25_cb *); void ax25_std_transmit_enquiry(ax25_cb *); void ax25_std_enquiry_response(ax25_cb *); void ax25_std_timeout_response(ax25_cb *); /* ax25_std_timer.c */ void ax25_std_heartbeat_expiry(ax25_cb *); void ax25_std_t1timer_expiry(ax25_cb *); void ax25_std_t2timer_expiry(ax25_cb *); void ax25_std_t3timer_expiry(ax25_cb *); void ax25_std_idletimer_expiry(ax25_cb *); /* ax25_subr.c */ void ax25_clear_queues(ax25_cb *); void ax25_frames_acked(ax25_cb *, unsigned short); void ax25_requeue_frames(ax25_cb *); int ax25_validate_nr(ax25_cb *, unsigned short); int ax25_decode(ax25_cb *, struct sk_buff *, int *, int *, 
int *); void ax25_send_control(ax25_cb *, int, int, int); void ax25_return_dm(struct net_device *, ax25_address *, ax25_address *, ax25_digi *); void ax25_calculate_t1(ax25_cb *); void ax25_calculate_rtt(ax25_cb *); void ax25_disconnect(ax25_cb *, int); /* ax25_timer.c */ void ax25_setup_timers(ax25_cb *); void ax25_start_heartbeat(ax25_cb *); void ax25_start_t1timer(ax25_cb *); void ax25_start_t2timer(ax25_cb *); void ax25_start_t3timer(ax25_cb *); void ax25_start_idletimer(ax25_cb *); void ax25_stop_heartbeat(ax25_cb *); void ax25_stop_t1timer(ax25_cb *); void ax25_stop_t2timer(ax25_cb *); void ax25_stop_t3timer(ax25_cb *); void ax25_stop_idletimer(ax25_cb *); int ax25_t1timer_running(ax25_cb *); unsigned long ax25_display_timer(struct timer_list *); /* ax25_uid.c */ extern int ax25_uid_policy; ax25_uid_assoc *ax25_findbyuid(kuid_t); int __must_check ax25_uid_ioctl(int, struct sockaddr_ax25 *); extern const struct seq_operations ax25_uid_seqops; void ax25_uid_free(void); /* sysctl_net_ax25.c */ #ifdef CONFIG_SYSCTL int ax25_register_dev_sysctl(ax25_dev *ax25_dev); void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev); #else static inline int ax25_register_dev_sysctl(ax25_dev *ax25_dev) { return 0; } static inline void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev) {} #endif /* CONFIG_SYSCTL */ #endif
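/*
 * Illustrative sketch (not part of this header): the hold/put helpers above
 * are used in pairs around any dereference that may outlive the lock that
 * found the object. example_use_dev() is hypothetical.
 */
static inline void example_use_dev(ax25_dev *ax25_dev)
{
	ax25_dev_hold(ax25_dev);	/* take a reference before use */
	/* ... safely use ax25_dev->dev, ax25_dev->values[], ... */
	ax25_dev_put(ax25_dev);		/* drop it; the last put frees */
}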
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "../blk.h"

/*
 * add_gd_partition adds a partition's details to the device's partition
 * description.
 */
struct parsed_partitions {
	struct gendisk *disk;
	char name[BDEVNAME_SIZE];
	struct {
		sector_t from;
		sector_t size;
		int flags;
		bool has_info;
		struct partition_meta_info info;
	} *parts;
	int next;
	int limit;
	bool access_beyond_eod;
	char *pp_buf;
};

typedef struct {
	struct folio *v;
} Sector;

void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p);
static inline void put_dev_sector(Sector p)
{
	folio_put(p.v);
}

static inline void
put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
{
	if (n < p->limit) {
		char tmp[1 + BDEVNAME_SIZE + 10 + 1];

		p->parts[n].from = from;
		p->parts[n].size = size;
		snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
		strlcat(p->pp_buf, tmp, PAGE_SIZE);
	}
}

/* detection routines go here in alphabetical order: */
int adfspart_check_ADFS(struct parsed_partitions *state);
int adfspart_check_CUMANA(struct parsed_partitions *state);
int adfspart_check_EESOX(struct parsed_partitions *state);
int adfspart_check_ICS(struct parsed_partitions *state);
int adfspart_check_POWERTEC(struct parsed_partitions *state);
int aix_partition(struct parsed_partitions *state);
int amiga_partition(struct parsed_partitions *state);
int atari_partition(struct parsed_partitions *state);
int cmdline_partition(struct parsed_partitions *state);
int efi_partition(struct parsed_partitions *state);
int ibm_partition(struct parsed_partitions *);
int karma_partition(struct parsed_partitions *state);
int ldm_partition(struct parsed_partitions *state);
int mac_partition(struct parsed_partitions *state);
int msdos_partition(struct parsed_partitions *state);
int osf_partition(struct parsed_partitions *state);
int sgi_partition(struct parsed_partitions *state);
int sun_partition(struct parsed_partitions *state);
int sysv68_partition(struct parsed_partitions *state);
int ultrix_partition(struct parsed_partitions *state);
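/*
 * Illustrative sketch (not one of the real parsers): the general shape of a
 * detection routine built on the helpers above. example_partition() and its
 * magic check are hypothetical; by convention a parser returns 1 when it
 * recognizes the format, 0 when it does not, and a negative value on error.
 */
int example_partition(struct parsed_partitions *state)
{
	Sector sect;
	unsigned char *data;

	data = read_part_sector(state, 0, &sect);
	if (!data)
		return -1;

	if (data[510] != 0x55 || data[511] != 0xaa) {	/* hypothetical magic */
		put_dev_sector(sect);
		return 0;
	}

	/* Record one partition in slot 1: 1024 sectors starting at LBA 2048. */
	put_partition(state, 1, 2048, 1024);
	put_dev_sector(sect);
	strlcat(state->pp_buf, "\n", PAGE_SIZE);
	return 1;
}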
894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ceph/ceph_debug.h> #include <linux/backing-dev.h> #include <linux/ctype.h> #include <linux/fs.h> #include <linux/inet.h> #include <linux/in6.h> #include <linux/key.h> #include <keys/ceph-type.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/nsproxy.h> #include <linux/fs_parser.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/statfs.h> #include <linux/string.h> #include <linux/vmalloc.h> #include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> #include <linux/ceph/debugfs.h> #include <linux/ceph/decode.h> #include <linux/ceph/mon_client.h> #include <linux/ceph/auth.h> #include "crypto.h" /* * Module compatibility interface. For now it doesn't do anything, * but its existence signals a certain level of functionality. * * The data buffer is used to pass information both to and from * libceph. The return value indicates whether libceph determines * it is compatible with the caller (from another kernel module), * given the provided data. * * The data pointer can be null. */ bool libceph_compatible(void *data) { return true; } EXPORT_SYMBOL(libceph_compatible); static int param_get_supported_features(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "0x%llx", CEPH_FEATURES_SUPPORTED_DEFAULT); } static const struct kernel_param_ops param_ops_supported_features = { .get = param_get_supported_features, }; module_param_cb(supported_features, &param_ops_supported_features, NULL, 0444); const char *ceph_msg_type_name(int type) { switch (type) { case CEPH_MSG_SHUTDOWN: return "shutdown"; case CEPH_MSG_PING: return "ping"; case CEPH_MSG_AUTH: return "auth"; case CEPH_MSG_AUTH_REPLY: return "auth_reply"; case CEPH_MSG_MON_MAP: return "mon_map"; case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; case CEPH_MSG_STATFS: return "statfs"; case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; case CEPH_MSG_MON_GET_VERSION: return "mon_get_version"; case CEPH_MSG_MON_GET_VERSION_REPLY: return "mon_get_version_reply"; case CEPH_MSG_MDS_MAP: return "mds_map"; case CEPH_MSG_FS_MAP_USER: return "fs_map_user"; case CEPH_MSG_CLIENT_SESSION: return "client_session"; case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; case CEPH_MSG_CLIENT_REQUEST: return "client_request"; case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; case CEPH_MSG_CLIENT_REPLY: return "client_reply"; case CEPH_MSG_CLIENT_CAPS: return "client_caps"; case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; case CEPH_MSG_CLIENT_QUOTA: return "client_quota"; case CEPH_MSG_CLIENT_SNAP: return "client_snap"; case CEPH_MSG_CLIENT_LEASE: return "client_lease"; case CEPH_MSG_POOLOP_REPLY: return "poolop_reply"; case CEPH_MSG_POOLOP: return "poolop"; case CEPH_MSG_MON_COMMAND: return "mon_command"; case CEPH_MSG_MON_COMMAND_ACK: return "mon_command_ack"; case CEPH_MSG_OSD_MAP: return "osd_map"; case CEPH_MSG_OSD_OP: return "osd_op"; case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; case CEPH_MSG_OSD_BACKOFF: return "osd_backoff"; default: return "unknown"; } } EXPORT_SYMBOL(ceph_msg_type_name); /* * Initially learn our fsid, or verify an fsid matches. 
*/ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) { if (client->have_fsid) { if (ceph_fsid_compare(&client->fsid, fsid)) { pr_err("bad fsid, had %pU got %pU", &client->fsid, fsid); return -1; } } else { memcpy(&client->fsid, fsid, sizeof(*fsid)); } return 0; } EXPORT_SYMBOL(ceph_check_fsid); static int strcmp_null(const char *s1, const char *s2) { if (!s1 && !s2) return 0; if (s1 && !s2) return -1; if (!s1 && s2) return 1; return strcmp(s1, s2); } int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client) { struct ceph_options *opt1 = new_opt; struct ceph_options *opt2 = client->options; int ofs = offsetof(struct ceph_options, mon_addr); int i; int ret; /* * Don't bother comparing options if network namespaces don't * match. */ if (!net_eq(current->nsproxy->net_ns, read_pnet(&client->msgr.net))) return -1; ret = memcmp(opt1, opt2, ofs); if (ret) return ret; ret = strcmp_null(opt1->name, opt2->name); if (ret) return ret; if (opt1->key && !opt2->key) return -1; if (!opt1->key && opt2->key) return 1; if (opt1->key && opt2->key) { if (opt1->key->type != opt2->key->type) return -1; if (opt1->key->created.tv_sec != opt2->key->created.tv_sec) return -1; if (opt1->key->created.tv_nsec != opt2->key->created.tv_nsec) return -1; if (opt1->key->len != opt2->key->len) return -1; if (opt1->key->key && !opt2->key->key) return -1; if (!opt1->key->key && opt2->key->key) return 1; if (opt1->key->key && opt2->key->key) { ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len); if (ret) return ret; } } ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs); if (ret) return ret; /* any matching mon ip implies a match */ for (i = 0; i < opt1->num_mon; i++) { if (ceph_monmap_contains(client->monc.monmap, &opt1->mon_addr[i])) return 0; } return -1; } EXPORT_SYMBOL(ceph_compare_options); int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; char tmp[3]; int err = -EINVAL; int d; dout("%s '%s'\n", __func__, str); tmp[2] = 0; while (*str && i < 16) { if (ispunct(*str)) { str++; continue; } if (!isxdigit(str[0]) || !isxdigit(str[1])) break; tmp[0] = str[0]; tmp[1] = str[1]; if (sscanf(tmp, "%x", &d) < 1) break; fsid->fsid[i] = d & 0xff; i++; str += 2; } if (i == 16) err = 0; dout("%s ret %d got fsid %pU\n", __func__, err, fsid); return err; } EXPORT_SYMBOL(ceph_parse_fsid); /* * ceph options */ enum { Opt_osdkeepalivetimeout, Opt_mount_timeout, Opt_osd_idle_ttl, Opt_osd_request_timeout, /* int args above */ Opt_fsid, Opt_name, Opt_secret, Opt_key, Opt_ip, Opt_crush_location, Opt_read_from_replica, Opt_ms_mode, /* string args above */ Opt_share, Opt_crc, Opt_cephx_require_signatures, Opt_cephx_sign_messages, Opt_tcp_nodelay, Opt_abort_on_full, Opt_rxbounce, }; enum { Opt_read_from_replica_no, Opt_read_from_replica_balance, Opt_read_from_replica_localize, }; static const struct constant_table ceph_param_read_from_replica[] = { {"no", Opt_read_from_replica_no}, {"balance", Opt_read_from_replica_balance}, {"localize", Opt_read_from_replica_localize}, {} }; enum ceph_ms_mode { Opt_ms_mode_legacy, Opt_ms_mode_crc, Opt_ms_mode_secure, Opt_ms_mode_prefer_crc, Opt_ms_mode_prefer_secure }; static const struct constant_table ceph_param_ms_mode[] = { {"legacy", Opt_ms_mode_legacy}, {"crc", Opt_ms_mode_crc}, {"secure", Opt_ms_mode_secure}, {"prefer-crc", Opt_ms_mode_prefer_crc}, {"prefer-secure", Opt_ms_mode_prefer_secure}, {} }; static const struct fs_parameter_spec ceph_parameters[] = { fsparam_flag ("abort_on_full", Opt_abort_on_full), 
__fsparam (NULL, "cephx_require_signatures", Opt_cephx_require_signatures, fs_param_neg_with_no|fs_param_deprecated, NULL), fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages), fsparam_flag_no ("crc", Opt_crc), fsparam_string ("crush_location", Opt_crush_location), fsparam_string ("fsid", Opt_fsid), fsparam_string ("ip", Opt_ip), fsparam_string ("key", Opt_key), fsparam_u32 ("mount_timeout", Opt_mount_timeout), fsparam_string ("name", Opt_name), fsparam_u32 ("osd_idle_ttl", Opt_osd_idle_ttl), fsparam_u32 ("osd_request_timeout", Opt_osd_request_timeout), fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout), fsparam_enum ("read_from_replica", Opt_read_from_replica, ceph_param_read_from_replica), fsparam_flag ("rxbounce", Opt_rxbounce), fsparam_enum ("ms_mode", Opt_ms_mode, ceph_param_ms_mode), fsparam_string ("secret", Opt_secret), fsparam_flag_no ("share", Opt_share), fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay), {} }; struct ceph_options *ceph_alloc_options(void) { struct ceph_options *opt; opt = kzalloc(sizeof(*opt), GFP_KERNEL); if (!opt) return NULL; opt->crush_locs = RB_ROOT; opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), GFP_KERNEL); if (!opt->mon_addr) { kfree(opt); return NULL; } opt->flags = CEPH_OPT_DEFAULT; opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT; opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; return opt; } EXPORT_SYMBOL(ceph_alloc_options); void ceph_destroy_options(struct ceph_options *opt) { dout("destroy_options %p\n", opt); if (!opt) return; ceph_clear_crush_locs(&opt->crush_locs); kfree(opt->name); if (opt->key) { ceph_crypto_key_destroy(opt->key); kfree(opt->key); } kfree(opt->mon_addr); kfree(opt); } EXPORT_SYMBOL(ceph_destroy_options); /* get secret from key store */ static int get_secret(struct ceph_crypto_key *dst, const char *name, struct p_log *log) { struct key *ukey; int key_err; int err = 0; struct ceph_crypto_key *ckey; ukey = request_key(&key_type_ceph, name, NULL); if (IS_ERR(ukey)) { /* request_key errors don't map nicely to mount(2) errors; don't even try, but still printk */ key_err = PTR_ERR(ukey); switch (key_err) { case -ENOKEY: error_plog(log, "Failed due to key not found: %s", name); break; case -EKEYEXPIRED: error_plog(log, "Failed due to expired key: %s", name); break; case -EKEYREVOKED: error_plog(log, "Failed due to revoked key: %s", name); break; default: error_plog(log, "Failed due to key error %d: %s", key_err, name); } err = -EPERM; goto out; } ckey = ukey->payload.data[0]; err = ceph_crypto_key_clone(dst, ckey); if (err) goto out_key; /* pass through, err is 0 */ out_key: key_put(ukey); out: return err; } int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt, struct fc_log *l, char delim) { struct p_log log = {.prefix = "libceph", .log = l}; int ret; /* ip1[:port1][<delim>ip2[:port2]...] 
*/ ret = ceph_parse_ips(buf, buf + len, opt->mon_addr, CEPH_MAX_MON, &opt->num_mon, delim); if (ret) { error_plog(&log, "Failed to parse monitor IPs: %d", ret); return ret; } return 0; } EXPORT_SYMBOL(ceph_parse_mon_ips); int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, struct fc_log *l) { struct fs_parse_result result; int token, err; struct p_log log = {.prefix = "libceph", .log = l}; token = __fs_parse(&log, ceph_parameters, param, &result); dout("%s fs_parse '%s' token %d\n", __func__, param->key, token); if (token < 0) return token; switch (token) { case Opt_ip: err = ceph_parse_ips(param->string, param->string + param->size, &opt->my_addr, 1, NULL, ','); if (err) { error_plog(&log, "Failed to parse ip: %d", err); return err; } opt->flags |= CEPH_OPT_MYIP; break; case Opt_fsid: err = ceph_parse_fsid(param->string, &opt->fsid); if (err) { error_plog(&log, "Failed to parse fsid: %d", err); return err; } opt->flags |= CEPH_OPT_FSID; break; case Opt_name: kfree(opt->name); opt->name = param->string; param->string = NULL; break; case Opt_secret: ceph_crypto_key_destroy(opt->key); kfree(opt->key); opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); if (!opt->key) return -ENOMEM; err = ceph_crypto_key_unarmor(opt->key, param->string); if (err) { error_plog(&log, "Failed to parse secret: %d", err); return err; } break; case Opt_key: ceph_crypto_key_destroy(opt->key); kfree(opt->key); opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); if (!opt->key) return -ENOMEM; return get_secret(opt->key, param->string, &log); case Opt_crush_location: ceph_clear_crush_locs(&opt->crush_locs); err = ceph_parse_crush_location(param->string, &opt->crush_locs); if (err) { error_plog(&log, "Failed to parse CRUSH location: %d", err); return err; } break; case Opt_read_from_replica: switch (result.uint_32) { case Opt_read_from_replica_no: opt->read_from_replica = 0; break; case Opt_read_from_replica_balance: opt->read_from_replica = CEPH_OSD_FLAG_BALANCE_READS; break; case Opt_read_from_replica_localize: opt->read_from_replica = CEPH_OSD_FLAG_LOCALIZE_READS; break; default: BUG(); } break; case Opt_ms_mode: switch (result.uint_32) { case Opt_ms_mode_legacy: opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; break; case Opt_ms_mode_crc: opt->con_modes[0] = CEPH_CON_MODE_CRC; opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; break; case Opt_ms_mode_secure: opt->con_modes[0] = CEPH_CON_MODE_SECURE; opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; break; case Opt_ms_mode_prefer_crc: opt->con_modes[0] = CEPH_CON_MODE_CRC; opt->con_modes[1] = CEPH_CON_MODE_SECURE; break; case Opt_ms_mode_prefer_secure: opt->con_modes[0] = CEPH_CON_MODE_SECURE; opt->con_modes[1] = CEPH_CON_MODE_CRC; break; default: BUG(); } break; case Opt_osdkeepalivetimeout: /* 0 isn't well defined right now, reject it */ if (result.uint_32 < 1 || result.uint_32 > INT_MAX / 1000) goto out_of_range; opt->osd_keepalive_timeout = msecs_to_jiffies(result.uint_32 * 1000); break; case Opt_osd_idle_ttl: /* 0 isn't well defined right now, reject it */ if (result.uint_32 < 1 || result.uint_32 > INT_MAX / 1000) goto out_of_range; opt->osd_idle_ttl = msecs_to_jiffies(result.uint_32 * 1000); break; case Opt_mount_timeout: /* 0 is "wait forever" (i.e. infinite timeout) */ if (result.uint_32 > INT_MAX / 1000) goto out_of_range; opt->mount_timeout = msecs_to_jiffies(result.uint_32 * 1000); break; case Opt_osd_request_timeout: /* 0 is "wait forever" (i.e. 
infinite timeout) */ if (result.uint_32 > INT_MAX / 1000) goto out_of_range; opt->osd_request_timeout = msecs_to_jiffies(result.uint_32 * 1000); break; case Opt_share: if (!result.negated) opt->flags &= ~CEPH_OPT_NOSHARE; else opt->flags |= CEPH_OPT_NOSHARE; break; case Opt_crc: if (!result.negated) opt->flags &= ~CEPH_OPT_NOCRC; else opt->flags |= CEPH_OPT_NOCRC; break; case Opt_cephx_require_signatures: if (!result.negated) warn_plog(&log, "Ignoring cephx_require_signatures"); else warn_plog(&log, "Ignoring nocephx_require_signatures, use nocephx_sign_messages"); break; case Opt_cephx_sign_messages: if (!result.negated) opt->flags &= ~CEPH_OPT_NOMSGSIGN; else opt->flags |= CEPH_OPT_NOMSGSIGN; break; case Opt_tcp_nodelay: if (!result.negated) opt->flags |= CEPH_OPT_TCP_NODELAY; else opt->flags &= ~CEPH_OPT_TCP_NODELAY; break; case Opt_abort_on_full: opt->flags |= CEPH_OPT_ABORT_ON_FULL; break; case Opt_rxbounce: opt->flags |= CEPH_OPT_RXBOUNCE; break; default: BUG(); } return 0; out_of_range: return inval_plog(&log, "%s out of range", param->key); } EXPORT_SYMBOL(ceph_parse_param); int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, bool show_all) { struct ceph_options *opt = client->options; size_t pos = m->count; struct rb_node *n; if (opt->name) { seq_puts(m, "name="); seq_escape(m, opt->name, ", \t\n\\"); seq_putc(m, ','); } if (opt->key) seq_puts(m, "secret=<hidden>,"); if (!RB_EMPTY_ROOT(&opt->crush_locs)) { seq_puts(m, "crush_location="); for (n = rb_first(&opt->crush_locs); ; ) { struct crush_loc_node *loc = rb_entry(n, struct crush_loc_node, cl_node); seq_printf(m, "%s:%s", loc->cl_loc.cl_type_name, loc->cl_loc.cl_name); n = rb_next(n); if (!n) break; seq_putc(m, '|'); } seq_putc(m, ','); } if (opt->read_from_replica == CEPH_OSD_FLAG_BALANCE_READS) { seq_puts(m, "read_from_replica=balance,"); } else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) { seq_puts(m, "read_from_replica=localize,"); } if (opt->con_modes[0] != CEPH_CON_MODE_UNKNOWN) { if (opt->con_modes[0] == CEPH_CON_MODE_CRC && opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { seq_puts(m, "ms_mode=crc,"); } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { seq_puts(m, "ms_mode=secure,"); } else if (opt->con_modes[0] == CEPH_CON_MODE_CRC && opt->con_modes[1] == CEPH_CON_MODE_SECURE) { seq_puts(m, "ms_mode=prefer-crc,"); } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && opt->con_modes[1] == CEPH_CON_MODE_CRC) { seq_puts(m, "ms_mode=prefer-secure,"); } } if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); if (opt->flags & CEPH_OPT_NOSHARE) seq_puts(m, "noshare,"); if (opt->flags & CEPH_OPT_NOCRC) seq_puts(m, "nocrc,"); if (opt->flags & CEPH_OPT_NOMSGSIGN) seq_puts(m, "nocephx_sign_messages,"); if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) seq_puts(m, "notcp_nodelay,"); if (show_all && (opt->flags & CEPH_OPT_ABORT_ON_FULL)) seq_puts(m, "abort_on_full,"); if (opt->flags & CEPH_OPT_RXBOUNCE) seq_puts(m, "rxbounce,"); if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) seq_printf(m, "mount_timeout=%d,", jiffies_to_msecs(opt->mount_timeout) / 1000); if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) seq_printf(m, "osd_idle_ttl=%d,", jiffies_to_msecs(opt->osd_idle_ttl) / 1000); if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) seq_printf(m, "osdkeepalivetimeout=%d,", jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000); if (opt->osd_request_timeout != CEPH_OSD_REQUEST_TIMEOUT_DEFAULT) seq_printf(m, 
"osd_request_timeout=%d,", jiffies_to_msecs(opt->osd_request_timeout) / 1000); /* drop redundant comma */ if (m->count != pos) m->count--; return 0; } EXPORT_SYMBOL(ceph_print_client_options); struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client) { return &client->msgr.inst.addr; } EXPORT_SYMBOL(ceph_client_addr); u64 ceph_client_gid(struct ceph_client *client) { return client->monc.auth->global_id; } EXPORT_SYMBOL(ceph_client_gid); /* * create a fresh client instance */ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) { struct ceph_client *client; struct ceph_entity_addr *myaddr = NULL; int err; err = wait_for_random_bytes(); if (err < 0) return ERR_PTR(err); client = kzalloc(sizeof(*client), GFP_KERNEL); if (client == NULL) return ERR_PTR(-ENOMEM); client->private = private; client->options = opt; mutex_init(&client->mount_mutex); init_waitqueue_head(&client->auth_wq); client->auth_err = 0; client->extra_mon_dispatch = NULL; client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT; client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT; if (!ceph_test_opt(client, NOMSGSIGN)) client->required_features |= CEPH_FEATURE_MSG_AUTH; /* msgr */ if (ceph_test_opt(client, MYIP)) myaddr = &client->options->my_addr; ceph_messenger_init(&client->msgr, myaddr); /* subsystems */ err = ceph_monc_init(&client->monc, client); if (err < 0) goto fail; err = ceph_osdc_init(&client->osdc, client); if (err < 0) goto fail_monc; return client; fail_monc: ceph_monc_stop(&client->monc); fail: ceph_messenger_fini(&client->msgr); kfree(client); return ERR_PTR(err); } EXPORT_SYMBOL(ceph_create_client); void ceph_destroy_client(struct ceph_client *client) { dout("destroy_client %p\n", client); atomic_set(&client->msgr.stopping, 1); /* unmount */ ceph_osdc_stop(&client->osdc); ceph_monc_stop(&client->monc); ceph_messenger_fini(&client->msgr); ceph_debugfs_client_cleanup(client); ceph_destroy_options(client->options); kfree(client); dout("destroy_client %p done\n", client); } EXPORT_SYMBOL(ceph_destroy_client); void ceph_reset_client_addr(struct ceph_client *client) { ceph_messenger_reset_nonce(&client->msgr); ceph_monc_reopen_session(&client->monc); ceph_osdc_reopen_osds(&client->osdc); } EXPORT_SYMBOL(ceph_reset_client_addr); /* * true if we have the mon map (and have thus joined the cluster) */ static bool have_mon_and_osd_map(struct ceph_client *client) { return client->monc.monmap && client->monc.monmap->epoch && client->osdc.osdmap && client->osdc.osdmap->epoch; } /* * mount: join the ceph cluster, and open root directory. 
*/ int __ceph_open_session(struct ceph_client *client, unsigned long started) { unsigned long timeout = client->options->mount_timeout; long err; /* open session, and wait for mon and osd maps */ err = ceph_monc_open_session(&client->monc); if (err < 0) return err; while (!have_mon_and_osd_map(client)) { if (timeout && time_after_eq(jiffies, started + timeout)) return -ETIMEDOUT; /* wait */ dout("mount waiting for mon_map\n"); err = wait_event_interruptible_timeout(client->auth_wq, have_mon_and_osd_map(client) || (client->auth_err < 0), ceph_timeout_jiffies(timeout)); if (err < 0) return err; if (client->auth_err < 0) return client->auth_err; } pr_info("client%llu fsid %pU\n", ceph_client_gid(client), &client->fsid); ceph_debugfs_client_init(client); return 0; } EXPORT_SYMBOL(__ceph_open_session); int ceph_open_session(struct ceph_client *client) { int ret; unsigned long started = jiffies; /* note the start time */ dout("open_session start\n"); mutex_lock(&client->mount_mutex); ret = __ceph_open_session(client, started); mutex_unlock(&client->mount_mutex); return ret; } EXPORT_SYMBOL(ceph_open_session); int ceph_wait_for_latest_osdmap(struct ceph_client *client, unsigned long timeout) { u64 newest_epoch; int ret; ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch); if (ret) return ret; if (client->osdc.osdmap->epoch >= newest_epoch) return 0; ceph_osdc_maybe_request_map(&client->osdc); return ceph_monc_wait_osdmap(&client->monc, newest_epoch, timeout); } EXPORT_SYMBOL(ceph_wait_for_latest_osdmap); static int __init init_ceph_lib(void) { int ret = 0; ceph_debugfs_init(); ret = ceph_crypto_init(); if (ret < 0) goto out_debugfs; ret = ceph_msgr_init(); if (ret < 0) goto out_crypto; ret = ceph_osdc_setup(); if (ret < 0) goto out_msgr; pr_info("loaded (mon/osd proto %d/%d)\n", CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL); return 0; out_msgr: ceph_msgr_exit(); out_crypto: ceph_crypto_shutdown(); out_debugfs: ceph_debugfs_cleanup(); return ret; } static void __exit exit_ceph_lib(void) { dout("exit_ceph_lib\n"); WARN_ON(!ceph_strings_empty()); ceph_osdc_cleanup(); ceph_msgr_exit(); ceph_crypto_shutdown(); ceph_debugfs_cleanup(); } module_init(init_ceph_lib); module_exit(exit_ceph_lib); MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); MODULE_DESCRIPTION("Ceph core library"); MODULE_LICENSE("GPL");
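/*
 * Illustrative sketch (not part of libceph): the rough sequence a consumer
 * module follows using the helpers exported above. example_connect() and the
 * hard-coded monitor addresses are hypothetical and error handling is
 * abbreviated; on success ceph_create_client() takes ownership of the
 * options, and ceph_destroy_client() releases them again.
 */
static struct ceph_client *example_connect(void)
{
	static const char mons[] = "192.168.0.1:6789,192.168.0.2:6789";
	struct ceph_options *opt;
	struct ceph_client *client;
	int err;

	opt = ceph_alloc_options();
	if (!opt)
		return ERR_PTR(-ENOMEM);

	err = ceph_parse_mon_ips(mons, strlen(mons), opt, NULL, ',');
	if (err) {
		ceph_destroy_options(opt);
		return ERR_PTR(err);
	}

	client = ceph_create_client(opt, NULL);
	if (IS_ERR(client)) {
		ceph_destroy_options(opt);	/* not yet owned by the client */
		return client;
	}

	err = ceph_open_session(client);	/* join the cluster */
	if (err) {
		ceph_destroy_client(client);	/* also frees the options */
		return ERR_PTR(err);
	}
	return client;
}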
// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_qfq.c         Quick Fair Queueing Plus Scheduler.
 *
 * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
 * Copyright (c) 2012 Paolo Valente.
*/ #include <linux/module.h> #include <linux/init.h> #include <linux/bitops.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> /* Quick Fair Queueing Plus ======================== Sources: [1] Paolo Valente, "Reducing the Execution Time of Fair-Queueing Schedulers." http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf Sources for QFQ: [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient Packet Scheduling with Tight Bandwidth Distribution Guarantees." See also: http://retis.sssup.it/~fabio/linux/qfq/ */ /* QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES classes. Each aggregate is timestamped with a virtual start time S and a virtual finish time F, and scheduled according to its timestamps. S and F are computed as a function of a system virtual time function V. The classes within each aggregate are instead scheduled with DRR. To speed up operations, QFQ+ divides also aggregates into a limited number of groups. Which group a class belongs to depends on the ratio between the maximum packet length for the class and the weight of the class. Groups have their own S and F. In the end, QFQ+ schedules groups, then aggregates within groups, then classes within aggregates. See [1] and [2] for a full description. Virtual time computations. S, F and V are all computed in fixed point arithmetic with FRAC_BITS decimal bits. QFQ_MAX_INDEX is the maximum index allowed for a group. We need one bit per index. QFQ_MAX_WSHIFT is the maximum power of two supported as a weight. The layout of the bits is as below: [ MTU_SHIFT ][ FRAC_BITS ] [ MAX_INDEX ][ MIN_SLOT_SHIFT ] ^.__grp->index = 0 *.__grp->slot_shift where MIN_SLOT_SHIFT is derived by difference from the others. The max group index corresponds to Lmax/w_min, where Lmax=1<<MTU_SHIFT, w_min = 1 . From this, and knowing how many groups (MAX_INDEX) we want, we can derive the shift corresponding to each group. Because we often need to compute F = S + len/w_i and V = V + len/wsum instead of storing w_i store the value inv_w = (1<<FRAC_BITS)/w_i so we can do F = S + len * inv_w * wsum. We use W_TOT in the formulas so we can easily move between static and adaptive weight sum. The per-scheduler-instance data contain all the data structures for the scheduler: bitmaps and bucket lists. */ /* * Maximum number of consecutive slots occupied by backlogged classes * inside a group. */ #define QFQ_MAX_SLOTS 32 /* * Shifts used for aggregate<->group mapping. We allow class weights that are * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the * group with the smallest index that can support the L_i / r_i configured * for the classes in the aggregate. * * grp->index is the index of the group; and grp->slot_shift * is the shift for the corresponding (scaled) sigma_i. */ #define QFQ_MAX_INDEX 24 #define QFQ_MAX_WSHIFT 10 #define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */ #define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT) #define FRAC_BITS 30 /* fixed point arithmetic */ #define ONE_FP (1UL << FRAC_BITS) #define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ #define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ #define QFQ_MAX_LMAX (1UL << QFQ_MTU_SHIFT) #define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */ /* * Possible group states. These values are used as indexes for the bitmaps * array of struct qfq_queue. 
*/ enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; struct qfq_group; struct qfq_aggregate; struct qfq_class { struct Qdisc_class_common common; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct Qdisc *qdisc; struct list_head alist; /* Link for active-classes list. */ struct qfq_aggregate *agg; /* Parent aggregate. */ int deficit; /* DRR deficit counter. */ }; struct qfq_aggregate { struct hlist_node next; /* Link for the slot list. */ u64 S, F; /* flow timestamps (exact) */ /* group we belong to. In principle we would need the index, * which is log_2(lmax/weight), but we never reference it * directly, only the group. */ struct qfq_group *grp; /* these are copied from the flowset. */ u32 class_weight; /* Weight of each class in this aggregate. */ /* Max pkt size for the classes in this aggregate, DRR quantum. */ int lmax; u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */ u32 budgetmax; /* Max budget for this aggregate. */ u32 initial_budget, budget; /* Initial and current budget. */ int num_classes; /* Number of classes in this aggr. */ struct list_head active; /* DRR queue of active classes. */ struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */ }; struct qfq_group { u64 S, F; /* group timestamps (approx). */ unsigned int slot_shift; /* Slot shift. */ unsigned int index; /* Group index. */ unsigned int front; /* Index of the front slot. */ unsigned long full_slots; /* non-empty slots */ /* Array of RR lists of active aggregates. */ struct hlist_head slots[QFQ_MAX_SLOTS]; }; struct qfq_sched { struct tcf_proto __rcu *filter_list; struct tcf_block *block; struct Qdisc_class_hash clhash; u64 oldV, V; /* Precise virtual times. */ struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ u32 wsum; /* weight sum */ u32 iwsum; /* inverse weight sum */ unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */ u32 max_agg_classes; /* Max number of classes per aggr. */ struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */ }; /* * Possible reasons why the timestamps of an aggregate are updated * enqueue: the aggregate switches from idle to active and must scheduled * for service * requeue: the aggregate finishes its budget, so it stops being served and * must be rescheduled for service */ enum update_reason {enqueue, requeue}; static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) { struct qfq_sched *q = qdisc_priv(sch); struct Qdisc_class_common *clc; clc = qdisc_class_find(&q->clhash, classid); if (clc == NULL) return NULL; return container_of(clc, struct qfq_class, common); } static const struct netlink_range_validation lmax_range = { .min = QFQ_MIN_LMAX, .max = QFQ_MAX_LMAX, }; static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { [TCA_QFQ_WEIGHT] = NLA_POLICY_RANGE(NLA_U32, 1, QFQ_MAX_WEIGHT), [TCA_QFQ_LMAX] = NLA_POLICY_FULL_RANGE(NLA_U32, &lmax_range), }; /* * Calculate a flow index, given its weight and maximum packet length. * index = log_2(maxlen/weight) but we need to apply the scaling. * This is used only once at flow creation. 
*/ static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift) { u64 slot_size = (u64)maxlen * inv_w; unsigned long size_map; int index = 0; size_map = slot_size >> min_slot_shift; if (!size_map) goto out; index = __fls(size_map) + 1; /* basically a log_2 */ index -= !(slot_size - (1ULL << (index + min_slot_shift - 1))); if (index < 0) index = 0; out: pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n", (unsigned long) ONE_FP/inv_w, maxlen, index); return index; } static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *); static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *, enum update_reason); static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg, u32 lmax, u32 weight) { INIT_LIST_HEAD(&agg->active); hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); agg->lmax = lmax; agg->class_weight = weight; } static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q, u32 lmax, u32 weight) { struct qfq_aggregate *agg; hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next) if (agg->lmax == lmax && agg->class_weight == weight) return agg; return NULL; } /* Update aggregate as a function of the new number of classes. */ static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg, int new_num_classes) { u32 new_agg_weight; if (new_num_classes == q->max_agg_classes) hlist_del_init(&agg->nonfull_next); if (agg->num_classes > new_num_classes && new_num_classes == q->max_agg_classes - 1) /* agg no more full */ hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); /* The next assignment may let * agg->initial_budget > agg->budgetmax * hold, we will take it into account in charge_actual_service(). */ agg->budgetmax = new_num_classes * agg->lmax; new_agg_weight = agg->class_weight * new_num_classes; agg->inv_w = ONE_FP/new_agg_weight; if (agg->grp == NULL) { int i = qfq_calc_index(agg->inv_w, agg->budgetmax, q->min_slot_shift); agg->grp = &q->groups[i]; } q->wsum += (int) agg->class_weight * (new_num_classes - agg->num_classes); q->iwsum = ONE_FP / q->wsum; agg->num_classes = new_num_classes; } /* Add class to aggregate. */ static void qfq_add_to_agg(struct qfq_sched *q, struct qfq_aggregate *agg, struct qfq_class *cl) { cl->agg = agg; qfq_update_agg(q, agg, agg->num_classes+1); if (cl->qdisc->q.qlen > 0) { /* adding an active class */ list_add_tail(&cl->alist, &agg->active); if (list_first_entry(&agg->active, struct qfq_class, alist) == cl && q->in_serv_agg != agg) /* agg was inactive */ qfq_activate_agg(q, agg, enqueue); /* schedule agg */ } } static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { hlist_del_init(&agg->nonfull_next); q->wsum -= agg->class_weight; if (q->wsum != 0) q->iwsum = ONE_FP / q->wsum; if (q->in_serv_agg == agg) q->in_serv_agg = qfq_choose_next_agg(q); kfree(agg); } /* Deschedule class from within its parent aggregate. */ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) { struct qfq_aggregate *agg = cl->agg; list_del(&cl->alist); /* remove from RR queue of the aggregate */ if (list_empty(&agg->active)) /* agg is now inactive */ qfq_deactivate_agg(q, agg); } /* Remove class from its parent aggregate. 
*/ static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) { struct qfq_aggregate *agg = cl->agg; cl->agg = NULL; if (agg->num_classes == 1) { /* agg being emptied, destroy it */ qfq_destroy_agg(q, agg); return; } qfq_update_agg(q, agg, agg->num_classes-1); } /* Deschedule class and remove it from its parent aggregate. */ static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) { if (cl->qdisc->q.qlen > 0) /* class is active */ qfq_deactivate_class(q, cl); qfq_rm_from_agg(q, cl); } /* Move class to a new aggregate, matching the new class weight and/or lmax */ static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, u32 lmax) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_aggregate *new_agg; /* 'lmax' can range from [QFQ_MIN_LMAX, pktlen + stab overhead] */ if (lmax > QFQ_MAX_LMAX) return -EINVAL; new_agg = qfq_find_agg(q, lmax, weight); if (new_agg == NULL) { /* create new aggregate */ new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC); if (new_agg == NULL) return -ENOBUFS; qfq_init_agg(q, new_agg, lmax, weight); } qfq_deact_rm_from_agg(q, cl); qfq_add_to_agg(q, new_agg, cl); return 0; } static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg, struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)*arg; bool existing = false; struct nlattr *tb[TCA_QFQ_MAX + 1]; struct qfq_aggregate *new_agg = NULL; u32 weight, lmax, inv_w; int err; int delta_w; if (NL_REQ_ATTR_CHECK(extack, NULL, tca, TCA_OPTIONS)) { NL_SET_ERR_MSG_MOD(extack, "missing options"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy, extack); if (err < 0) return err; if (tb[TCA_QFQ_WEIGHT]) weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]); else weight = 1; if (tb[TCA_QFQ_LMAX]) { lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); } else { /* MTU size is user controlled */ lmax = psched_mtu(qdisc_dev(sch)); if (lmax < QFQ_MIN_LMAX || lmax > QFQ_MAX_LMAX) { NL_SET_ERR_MSG_MOD(extack, "MTU size out of bounds for qfq"); return -EINVAL; } } inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; if (cl != NULL && lmax == cl->agg->lmax && weight == cl->agg->class_weight) return 0; /* nothing to change */ delta_w = weight - (cl ? 
cl->agg->class_weight : 0); if (q->wsum + delta_w > QFQ_MAX_WSUM) { NL_SET_ERR_MSG_FMT_MOD(extack, "total weight out of range (%d + %u)\n", delta_w, q->wsum); return -EINVAL; } if (cl != NULL) { /* modify existing class */ if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, true, tca[TCA_RATE]); if (err) return err; } existing = true; goto set_change_agg; } /* create and init new class */ cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); if (cl == NULL) return -ENOBUFS; gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->deficit = lmax; cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, NULL); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, true, tca[TCA_RATE]); if (err) goto destroy_class; } if (cl->qdisc != &noop_qdisc) qdisc_hash_add(cl->qdisc, true); set_change_agg: sch_tree_lock(sch); new_agg = qfq_find_agg(q, lmax, weight); if (new_agg == NULL) { /* create new aggregate */ sch_tree_unlock(sch); new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); if (new_agg == NULL) { err = -ENOBUFS; gen_kill_estimator(&cl->rate_est); goto destroy_class; } sch_tree_lock(sch); qfq_init_agg(q, new_agg, lmax, weight); } if (existing) qfq_deact_rm_from_agg(q, cl); else qdisc_class_hash_insert(&q->clhash, &cl->common); qfq_add_to_agg(q, new_agg, cl); sch_tree_unlock(sch); qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; return 0; destroy_class: qdisc_put(cl->qdisc); kfree(cl); return err; } static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) { struct qfq_sched *q = qdisc_priv(sch); qfq_rm_from_agg(q, cl); gen_kill_estimator(&cl->rate_est); qdisc_put(cl->qdisc); kfree(cl); } static int qfq_delete_class(struct Qdisc *sch, unsigned long arg, struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; if (qdisc_class_in_use(&cl->common)) { NL_SET_ERR_MSG_MOD(extack, "QFQ class in use"); return -EBUSY; } sch_tree_lock(sch); qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->common); sch_tree_unlock(sch); qfq_destroy_class(sch, cl); return 0; } static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid) { return (unsigned long)qfq_find_class(sch, classid); } static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) { struct qfq_class *cl = qfq_find_class(sch, classid); if (cl) qdisc_class_get(&cl->common); return (unsigned long)cl; } static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct qfq_class *cl = (struct qfq_class *)arg; qdisc_class_put(&cl->common); } static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct qfq_class *cl = (struct qfq_class *)arg; if (new == NULL) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, cl->common.classid, NULL); if (new == NULL) new = &noop_qdisc; } *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg) { struct qfq_class *cl = (struct qfq_class *)arg; return cl->qdisc; } static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct 
qfq_class *cl = (struct qfq_class *)arg; struct nlattr *nest; tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle = cl->common.classid; tcm->tcm_info = cl->qdisc->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct qfq_class *cl = (struct qfq_class *)arg; struct tc_qfq_stats xstats; memset(&xstats, 0, sizeof(xstats)); xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || qdisc_qstats_copy(d, cl->qdisc) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) return; } } } static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; struct tcf_result res; struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { pr_debug("qfq_classify: found %d\n", skb->priority); cl = qfq_find_class(sch, skb->priority); if (cl != NULL) return cl; } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif cl = (struct qfq_class *)res.class; if (cl == NULL) cl = qfq_find_class(sch, res.classid); return cl; } return NULL; } /* Generic comparison function, handling wraparound. */ static inline int qfq_gt(u64 a, u64 b) { return (s64)(a - b) > 0; } /* Round a precise timestamp to its slotted value. */ static inline u64 qfq_round_down(u64 ts, unsigned int shift) { return ts & ~((1ULL << shift) - 1); } /* return the pointer to the group with lowest index in the bitmap */ static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, unsigned long bitmap) { int index = __ffs(bitmap); return &q->groups[index]; } /* Calculate a mask to mimic what would be ffs_from(). 
*/ static inline unsigned long mask_from(unsigned long bitmap, int from) { return bitmap & ~((1UL << from) - 1); } /* * The state computation relies on ER=0, IR=1, EB=2, IB=3 * First compute eligibility comparing grp->S, q->V, * then check if someone is blocking us and possibly add EB */ static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp) { /* if S > V we are not eligible */ unsigned int state = qfq_gt(grp->S, q->V); unsigned long mask = mask_from(q->bitmaps[ER], grp->index); struct qfq_group *next; if (mask) { next = qfq_ffs(q, mask); if (qfq_gt(grp->F, next->F)) state |= EB; } return state; } /* * In principle * q->bitmaps[dst] |= q->bitmaps[src] & mask; * q->bitmaps[src] &= ~mask; * but we should make sure that src != dst */ static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst) { q->bitmaps[dst] |= q->bitmaps[src] & mask; q->bitmaps[src] &= ~mask; } static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F) { unsigned long mask = mask_from(q->bitmaps[ER], index + 1); struct qfq_group *next; if (mask) { next = qfq_ffs(q, mask); if (!qfq_gt(next->F, old_F)) return; } mask = (1UL << index) - 1; qfq_move_groups(q, mask, EB, ER); qfq_move_groups(q, mask, IB, IR); } /* * perhaps * old_V ^= q->V; old_V >>= q->min_slot_shift; if (old_V) { ... } * */ static void qfq_make_eligible(struct qfq_sched *q) { unsigned long vslot = q->V >> q->min_slot_shift; unsigned long old_vslot = q->oldV >> q->min_slot_shift; if (vslot != old_vslot) { unsigned long mask; int last_flip_pos = fls(vslot ^ old_vslot); if (last_flip_pos > 31) /* higher than the number of groups */ mask = ~0UL; /* make all groups eligible */ else mask = (1UL << last_flip_pos) - 1; qfq_move_groups(q, mask, IR, ER); qfq_move_groups(q, mask, IB, EB); } } /* * The index of the slot in which the input aggregate agg is to be * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2' * and not a '-1' because the start time of the group may be moved * backward by one slot after the aggregate has been inserted, and * this would cause non-empty slots to be right-shifted by one * position. * * QFQ+ fully satisfies this bound to the slot index if the parameters * of the classes are not changed dynamically, and if QFQ+ never * happens to postpone the service of agg unjustly, i.e., it never * happens that the aggregate becomes backlogged and eligible, or just * eligible, while an aggregate with a higher approximated finish time * is being served. In particular, in this case QFQ+ guarantees that * the timestamps of agg are low enough that the slot index is never * higher than 2. Unfortunately, QFQ+ cannot provide the same * guarantee if it happens to unjustly postpone the service of agg, or * if the parameters of some class are changed. * * As for the first event, i.e., an out-of-order service, the * upper bound to the slot index guaranteed by QFQ+ grows to * 2 + * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1. * * The following function deals with this problem by backward-shifting * the timestamps of agg, if needed, so as to guarantee that the slot * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may * cause the service of other aggregates to be postponed, yet the * worst-case guarantees of these aggregates are not violated. 
In * fact, in case of no out-of-order service, the timestamps of agg * would have been even lower than they are after the backward shift, * because QFQ+ would have guaranteed a maximum value equal to 2 for * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose * service is postponed because of the backward-shift would have * however waited for the service of agg before being served. * * The other event that may cause the slot index to be higher than 2 * for agg is a recent change of the parameters of some class. If the * weight of a class is increased or the lmax (max_pkt_size) of the * class is decreased, then a new aggregate with smaller slot size * than the original parent aggregate of the class may happen to be * activated. The activation of this aggregate should be properly * delayed to when the service of the class has finished in the ideal * system tracked by QFQ+. If the activation of the aggregate is not * delayed to this reference time instant, then this aggregate may be * unjustly served before other aggregates waiting for service. This * may cause the above bound to the slot index to be violated for some * of these unlucky aggregates. * * Instead of delaying the activation of the new aggregate, which is * quite complex, the above-discussed capping of the slot index is * used to handle also the consequences of a change of the parameters * of a class. */ static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg, u64 roundedS) { u64 slot = (roundedS - grp->S) >> grp->slot_shift; unsigned int i; /* slot index in the bucket list */ if (unlikely(slot > QFQ_MAX_SLOTS - 2)) { u64 deltaS = roundedS - grp->S - ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift); agg->S -= deltaS; agg->F -= deltaS; slot = QFQ_MAX_SLOTS - 2; } i = (grp->front + slot) % QFQ_MAX_SLOTS; hlist_add_head(&agg->next, &grp->slots[i]); __set_bit(slot, &grp->full_slots); } /* Maybe introduce hlist_first_entry?? */ static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp) { return hlist_entry(grp->slots[grp->front].first, struct qfq_aggregate, next); } /* * remove the entry from the slot */ static void qfq_front_slot_remove(struct qfq_group *grp) { struct qfq_aggregate *agg = qfq_slot_head(grp); BUG_ON(!agg); hlist_del(&agg->next); if (hlist_empty(&grp->slots[grp->front])) __clear_bit(0, &grp->full_slots); } /* * Returns the first aggregate in the first non-empty bucket of the * group. As a side effect, adjusts the bucket list so the first * non-empty bucket is at position 0 in full_slots. */ static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp) { unsigned int i; pr_debug("qfq slot_scan: grp %u full %#lx\n", grp->index, grp->full_slots); if (grp->full_slots == 0) return NULL; i = __ffs(grp->full_slots); /* zero based */ if (i > 0) { grp->front = (grp->front + i) % QFQ_MAX_SLOTS; grp->full_slots >>= i; } return qfq_slot_head(grp); } /* * adjust the bucket list. When the start time of a group decreases, * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to * move the objects. The mask of occupied slots must be shifted * because we use ffs() to find the first non-empty slot. * This covers decreases in the group's start time, but what about * increases of the start time ? 
* Here too we should make sure that i is less than 32 */ static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS) { unsigned int i = (grp->S - roundedS) >> grp->slot_shift; grp->full_slots <<= i; grp->front = (grp->front - i) % QFQ_MAX_SLOTS; } static void qfq_update_eligible(struct qfq_sched *q) { struct qfq_group *grp; unsigned long ineligible; ineligible = q->bitmaps[IR] | q->bitmaps[IB]; if (ineligible) { if (!q->bitmaps[ER]) { grp = qfq_ffs(q, ineligible); if (qfq_gt(grp->S, q->V)) q->V = grp->S; } qfq_make_eligible(q); } } /* Dequeue head packet of the head class in the DRR queue of the aggregate. */ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg, struct qfq_class *cl, unsigned int len) { struct sk_buff *skb = qdisc_dequeue_peeked(cl->qdisc); if (!skb) return NULL; cl->deficit -= (int) len; if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ list_del(&cl->alist); else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { cl->deficit += agg->lmax; list_move_tail(&cl->alist, &agg->active); } return skb; } static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg, struct qfq_class **cl, unsigned int *len) { struct sk_buff *skb; *cl = list_first_entry(&agg->active, struct qfq_class, alist); skb = (*cl)->qdisc->ops->peek((*cl)->qdisc); if (skb == NULL) qdisc_warn_nonwc("qfq_dequeue", (*cl)->qdisc); else *len = qdisc_pkt_len(skb); return skb; } /* Update F according to the actual service received by the aggregate. */ static inline void charge_actual_service(struct qfq_aggregate *agg) { /* Compute the service received by the aggregate, taking into * account that, after decreasing the number of classes in * agg, it may happen that * agg->initial_budget - agg->budget > agg->bugdetmax */ u32 service_received = min(agg->budgetmax, agg->initial_budget - agg->budget); agg->F = agg->S + (u64)service_received * agg->inv_w; } /* Assign a reasonable start time for a new aggregate in group i. * Admissible values for \hat(F) are multiples of \sigma_i * no greater than V+\sigma_i . Larger values mean that * we had a wraparound so we consider the timestamp to be stale. * * If F is not stale and F >= V then we set S = F. * Otherwise we should assign S = V, but this may violate * the ordering in EB (see [2]). So, if we have groups in ER, * set S to the F_j of the first group j which would be blocking us. * We are guaranteed not to move S backward because * otherwise our group i would still be blocked. */ static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) { unsigned long mask; u64 limit, roundedF; int slot_shift = agg->grp->slot_shift; roundedF = qfq_round_down(agg->F, slot_shift); limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { /* timestamp was stale */ mask = mask_from(q->bitmaps[ER], agg->grp->index); if (mask) { struct qfq_group *next = qfq_ffs(q, mask); if (qfq_gt(roundedF, next->F)) { if (qfq_gt(limit, next->F)) agg->S = next->F; else /* preserve timestamp correctness */ agg->S = limit; return; } } agg->S = q->V; } else /* timestamp is not stale */ agg->S = agg->F; } /* Update the timestamps of agg before scheduling/rescheduling it for * service. In particular, assign to agg->F its maximum possible * value, i.e., the virtual finish time with which the aggregate * should be labeled if it used all its budget once in service. 
*/ static inline void qfq_update_agg_ts(struct qfq_sched *q, struct qfq_aggregate *agg, enum update_reason reason) { if (reason != requeue) qfq_update_start(q, agg); else /* just charge agg for the service received */ agg->S = agg->F; agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; } static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg); static struct sk_buff *qfq_dequeue(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_aggregate *in_serv_agg = q->in_serv_agg; struct qfq_class *cl; struct sk_buff *skb = NULL; /* next-packet len, 0 means no more active classes in in-service agg */ unsigned int len = 0; if (in_serv_agg == NULL) return NULL; if (!list_empty(&in_serv_agg->active)) skb = qfq_peek_skb(in_serv_agg, &cl, &len); /* * If there are no active classes in the in-service aggregate, * or if the aggregate has not enough budget to serve its next * class, then choose the next aggregate to serve. */ if (len == 0 || in_serv_agg->budget < len) { charge_actual_service(in_serv_agg); /* recharge the budget of the aggregate */ in_serv_agg->initial_budget = in_serv_agg->budget = in_serv_agg->budgetmax; if (!list_empty(&in_serv_agg->active)) { /* * Still active: reschedule for * service. Possible optimization: if no other * aggregate is active, then there is no point * in rescheduling this aggregate, and we can * just keep it as the in-service one. This * should be however a corner case, and to * handle it, we would need to maintain an * extra num_active_aggs field. */ qfq_update_agg_ts(q, in_serv_agg, requeue); qfq_schedule_agg(q, in_serv_agg); } else if (sch->q.qlen == 0) { /* no aggregate to serve */ q->in_serv_agg = NULL; return NULL; } /* * If we get here, there are other aggregates queued: * choose the new aggregate to serve. */ in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q); skb = qfq_peek_skb(in_serv_agg, &cl, &len); } if (!skb) return NULL; sch->q.qlen--; skb = agg_dequeue(in_serv_agg, cl, len); if (!skb) { sch->q.qlen++; return NULL; } qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); /* If lmax is lowered, through qfq_change_class, for a class * owning pending packets with larger size than the new value * of lmax, then the following condition may hold. 
*/ if (unlikely(in_serv_agg->budget < len)) in_serv_agg->budget = 0; else in_serv_agg->budget -= len; q->V += (u64)len * q->iwsum; pr_debug("qfq dequeue: len %u F %lld now %lld\n", len, (unsigned long long) in_serv_agg->F, (unsigned long long) q->V); return skb; } static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) { struct qfq_group *grp; struct qfq_aggregate *agg, *new_front_agg; u64 old_F; qfq_update_eligible(q); q->oldV = q->V; if (!q->bitmaps[ER]) return NULL; grp = qfq_ffs(q, q->bitmaps[ER]); old_F = grp->F; agg = qfq_slot_head(grp); /* agg starts to be served, remove it from schedule */ qfq_front_slot_remove(grp); new_front_agg = qfq_slot_scan(grp); if (new_front_agg == NULL) /* group is now inactive, remove from ER */ __clear_bit(grp->index, &q->bitmaps[ER]); else { u64 roundedS = qfq_round_down(new_front_agg->S, grp->slot_shift); unsigned int s; if (grp->S == roundedS) return agg; grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); __clear_bit(grp->index, &q->bitmaps[ER]); s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); } qfq_unblock_groups(q, grp->index, old_F); return agg; } static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb), gso_segs; struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; struct qfq_aggregate *agg; int err = 0; bool first; cl = qfq_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return err; } pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); if (unlikely(cl->agg->lmax < len)) { pr_debug("qfq: increasing maxpkt from %u to %u for class %u", cl->agg->lmax, len, cl->common.classid); err = qfq_change_agg(sch, cl, cl->agg->class_weight, len); if (err) { cl->qstats.drops++; return qdisc_drop(skb, sch, to_free); } } gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); if (net_xmit_drop_count(err)) { cl->qstats.drops++; qdisc_qstats_drop(sch); } return err; } _bstats_update(&cl->bstats, len, gso_segs); sch->qstats.backlog += len; ++sch->q.qlen; agg = cl->agg; /* if the queue was not empty, then done here */ if (!first) { if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && list_first_entry(&agg->active, struct qfq_class, alist) == cl && cl->deficit < len) list_move_tail(&cl->alist, &agg->active); return err; } /* schedule class for service within the aggregate */ cl->deficit = agg->lmax; list_add_tail(&cl->alist, &agg->active); if (list_first_entry(&agg->active, struct qfq_class, alist) != cl || q->in_serv_agg == agg) return err; /* non-empty or in service, nothing else to do */ qfq_activate_agg(q, agg, enqueue); return err; } /* * Schedule aggregate according to its timestamps. */ static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { struct qfq_group *grp = agg->grp; u64 roundedS; int s; roundedS = qfq_round_down(agg->S, grp->slot_shift); /* * Insert agg in the correct bucket. * If agg->S >= grp->S we don't need to adjust the * bucket list and simply go to the insertion phase. * Otherwise grp->S is decreasing, we must make room * in the bucket list, and also recompute the group state. * Finally, if there were no flows in this group and nobody * was in ER make sure to adjust V. 
*/ if (grp->full_slots) { if (!qfq_gt(grp->S, agg->S)) goto skip_update; /* create a slot for this agg->S */ qfq_slot_rotate(grp, roundedS); /* group was surely ineligible, remove */ __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[IB]); } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) && q->in_serv_agg == NULL) q->V = roundedS; grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", s, q->bitmaps[s], (unsigned long long) agg->S, (unsigned long long) agg->F, (unsigned long long) q->V); skip_update: qfq_slot_insert(grp, agg, roundedS); } /* Update agg ts and schedule agg for service */ static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg, enum update_reason reason) { agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */ qfq_update_agg_ts(q, agg, reason); if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */ q->in_serv_agg = agg; /* start serving this aggregate */ /* update V: to be in service, agg must be eligible */ q->oldV = q->V = agg->S; } else if (agg != q->in_serv_agg) qfq_schedule_agg(q, agg); } static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, struct qfq_aggregate *agg) { unsigned int i, offset; u64 roundedS; roundedS = qfq_round_down(agg->S, grp->slot_shift); offset = (roundedS - grp->S) >> grp->slot_shift; i = (grp->front + offset) % QFQ_MAX_SLOTS; hlist_del(&agg->next); if (hlist_empty(&grp->slots[i])) __clear_bit(offset, &grp->full_slots); } /* * Called to forcibly deschedule an aggregate. If the aggregate is * not in the front bucket, or if the latter has other aggregates in * the front bucket, we can simply remove the aggregate with no other * side effects. * Otherwise we must propagate the event up. 
*/ static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { struct qfq_group *grp = agg->grp; unsigned long mask; u64 roundedS; int s; if (agg == q->in_serv_agg) { charge_actual_service(agg); q->in_serv_agg = qfq_choose_next_agg(q); return; } agg->F = agg->S; qfq_slot_remove(q, grp, agg); if (!grp->full_slots) { __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[EB]); __clear_bit(grp->index, &q->bitmaps[IB]); if (test_bit(grp->index, &q->bitmaps[ER]) && !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); if (mask) mask = ~((1UL << __fls(mask)) - 1); else mask = ~0UL; qfq_move_groups(q, mask, EB, ER); qfq_move_groups(q, mask, IB, IR); } __clear_bit(grp->index, &q->bitmaps[ER]); } else if (hlist_empty(&grp->slots[grp->front])) { agg = qfq_slot_scan(grp); roundedS = qfq_round_down(agg->S, grp->slot_shift); if (grp->S != roundedS) { __clear_bit(grp->index, &q->bitmaps[ER]); __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[EB]); __clear_bit(grp->index, &q->bitmaps[IB]); grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); } } } static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; qfq_deactivate_class(q, cl); } static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_group *grp; int i, j, err; u32 max_cl_shift, maxbudg_shift, max_classes; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; max_classes = min_t(u64, (u64)qdisc_dev(sch)->tx_queue_len + 1, QFQ_MAX_AGG_CLASSES); /* max_cl_shift = floor(log_2(max_classes)) */ max_cl_shift = __fls(max_classes); q->max_agg_classes = 1<<max_cl_shift; /* maxbudg_shift = log2(max_len * max_classes_per_agg) */ maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift; q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX; for (i = 0; i <= QFQ_MAX_INDEX; i++) { grp = &q->groups[i]; grp->index = i; grp->slot_shift = q->min_slot_shift + i; for (j = 0; j < QFQ_MAX_SLOTS; j++) INIT_HLIST_HEAD(&grp->slots[j]); } INIT_HLIST_HEAD(&q->nonfull_aggs); return 0; } static void qfq_reset_qdisc(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen > 0) qfq_deactivate_class(q, cl); qdisc_reset(cl->qdisc); } } } static void qfq_destroy_qdisc(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; struct hlist_node *next; unsigned int i; tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) { qfq_destroy_class(sch, cl); } } qdisc_class_hash_destroy(&q->clhash); } static const struct Qdisc_class_ops qfq_class_ops = { .change = qfq_change_class, .delete = qfq_delete_class, .find = qfq_search_class, .tcf_block = qfq_tcf_block, .bind_tcf = qfq_bind_tcf, .unbind_tcf = qfq_unbind_tcf, .graft = qfq_graft_class, .leaf = qfq_class_leaf, .qlen_notify = qfq_qlen_notify, .dump = qfq_dump_class, .dump_stats = qfq_dump_class_stats, .walk = qfq_walk, }; static struct Qdisc_ops qfq_qdisc_ops __read_mostly = { .cl_ops = 
&qfq_class_ops, .id = "qfq", .priv_size = sizeof(struct qfq_sched), .enqueue = qfq_enqueue, .dequeue = qfq_dequeue, .peek = qdisc_peek_dequeued, .init = qfq_init_qdisc, .reset = qfq_reset_qdisc, .destroy = qfq_destroy_qdisc, .owner = THIS_MODULE, }; static int __init qfq_init(void) { return register_qdisc(&qfq_qdisc_ops); } static void __exit qfq_exit(void) { unregister_qdisc(&qfq_qdisc_ops); } module_init(qfq_init); module_exit(qfq_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Quick Fair Queueing Plus qdisc");
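The comment blocks in sch_qfq.c above describe how QFQ+ maps an aggregate to a group via index = log_2(lmax/weight) in FRAC_BITS fixed point, and how finish times are advanced with F = S + len * inv_w. The standalone userspace sketch below (editor-added, not part of sch_qfq.c) reworks that arithmetic so it can be compiled and inspected on its own: demo_calc_index() and demo_min_slot_shift() are hypothetical names, and min_slot_shift is simplified by assuming a single class per aggregate (max_cl_shift == 0), whereas the kernel derives it in qfq_init_qdisc().

#include <stdio.h>
#include <stdint.h>

#define DEMO_FRAC_BITS	30				/* mirrors FRAC_BITS */
#define DEMO_ONE_FP	(1ULL << DEMO_FRAC_BITS)	/* mirrors ONE_FP */
#define DEMO_MTU_SHIFT	16				/* mirrors QFQ_MTU_SHIFT */
#define DEMO_MAX_INDEX	24				/* mirrors QFQ_MAX_INDEX */

/* Simplified min_slot_shift, assuming one class per aggregate
 * (max_cl_shift == 0); the kernel computes this in qfq_init_qdisc().
 */
static unsigned int demo_min_slot_shift(void)
{
	return DEMO_FRAC_BITS + DEMO_MTU_SHIFT - DEMO_MAX_INDEX;
}

/* Same idea as qfq_calc_index(): index ~ log_2(lmax/weight), scaled. */
static int demo_calc_index(uint32_t inv_w, unsigned int maxlen,
			   unsigned int min_slot_shift)
{
	uint64_t slot_size = (uint64_t)maxlen * inv_w;
	uint64_t size_map = slot_size >> min_slot_shift;
	int index = 0;

	if (size_map) {
		/* 63 - clz gives the position of the highest set bit (__fls) */
		index = 63 - __builtin_clzll(size_map) + 1;
		index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
		if (index < 0)
			index = 0;
	}
	return index;
}

int main(void)
{
	unsigned int weight = 10, lmax = 1514, len = 1000;
	uint32_t inv_w = DEMO_ONE_FP / weight;	/* inverse weight, fixed point */
	uint64_t S = 0;
	/* F = S + len * inv_w, as in charge_actual_service()/qfq_update_agg_ts() */
	uint64_t F = S + (uint64_t)len * inv_w;

	printf("group index for (weight=%u, lmax=%u): %d\n", weight, lmax,
	       demo_calc_index(inv_w, lmax, demo_min_slot_shift()));
	printf("virtual finish time F = %llu (fixed point, %d frac bits)\n",
	       (unsigned long long)F, DEMO_FRAC_BITS);
	return 0;
}
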
// SPDX-License-Identifier: GPL-2.0 /* * KVM coalesced MMIO * * Copyright (c) 2008 Bull S.A.S. * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Author: Laurent Vivier <Laurent.Vivier@bull.net> * */ #include <kvm/iodev.h> #include <linux/kvm_host.h> #include <linux/slab.h> #include <linux/kvm.h> #include "coalesced_mmio.h" static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) { return container_of(dev, struct kvm_coalesced_mmio_dev, dev); } static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, gpa_t addr, int len) { /* is it in a batchable area ? * (addr,len) is fully included in * (zone->addr, zone->size) */ if (len < 0) return 0; if (addr + len < addr) return 0; if (addr < dev->zone.addr) return 0; if (addr + len > dev->zone.addr + dev->zone.size) return 0; return 1; } static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev, u32 last) { struct kvm_coalesced_mmio_ring *ring; unsigned avail; /* Are we able to batch it ? */ /* last is the first free entry * check if we don't meet the first used entry * there is always one unused entry in the buffer */ ring = dev->kvm->coalesced_mmio_ring; avail = (ring->first - last - 1) % KVM_COALESCED_MMIO_MAX; if (avail == 0) { /* full */ return 0; } return 1; } static int coalesced_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, const void *val) { struct kvm_coalesced_mmio_dev *dev = to_mmio(this); struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; __u32 insert; if (!coalesced_mmio_in_range(dev, addr, len)) return -EOPNOTSUPP; spin_lock(&dev->kvm->ring_lock); insert = READ_ONCE(ring->last); if (!coalesced_mmio_has_room(dev, insert) || insert >= KVM_COALESCED_MMIO_MAX) { spin_unlock(&dev->kvm->ring_lock); return -EOPNOTSUPP; } /* copy data in first free entry of the ring */ ring->coalesced_mmio[insert].phys_addr = addr; ring->coalesced_mmio[insert].len = len; memcpy(ring->coalesced_mmio[insert].data, val, len); ring->coalesced_mmio[insert].pio = dev->zone.pio; smp_wmb(); ring->last = (insert + 1) % KVM_COALESCED_MMIO_MAX; spin_unlock(&dev->kvm->ring_lock); return 0; } static void coalesced_mmio_destructor(struct kvm_io_device *this) { struct kvm_coalesced_mmio_dev *dev = to_mmio(this); list_del(&dev->list); kfree(dev); } static const struct kvm_io_device_ops coalesced_mmio_ops = { .write = coalesced_mmio_write, .destructor = coalesced_mmio_destructor, }; int kvm_coalesced_mmio_init(struct kvm *kvm) { struct page *page; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!page) return -ENOMEM; kvm->coalesced_mmio_ring = page_address(page); /* * We're using this spinlock to sync access to the coalesced ring. 
* The list doesn't need its own lock since device registration and * unregistration should only happen when kvm->slots_lock is held. */ spin_lock_init(&kvm->ring_lock); INIT_LIST_HEAD(&kvm->coalesced_zones); return 0; } void kvm_coalesced_mmio_free(struct kvm *kvm) { if (kvm->coalesced_mmio_ring) free_page((unsigned long)kvm->coalesced_mmio_ring); } int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_zone *zone) { int ret; struct kvm_coalesced_mmio_dev *dev; if (zone->pio != 1 && zone->pio != 0) return -EINVAL; dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL_ACCOUNT); if (!dev) return -ENOMEM; kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); dev->kvm = kvm; dev->zone = *zone; mutex_lock(&kvm->slots_lock); ret = kvm_io_bus_register_dev(kvm, zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, zone->addr, zone->size, &dev->dev); if (ret < 0) goto out_free_dev; list_add_tail(&dev->list, &kvm->coalesced_zones); mutex_unlock(&kvm->slots_lock); return 0; out_free_dev: mutex_unlock(&kvm->slots_lock); kfree(dev); return ret; } int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_zone *zone) { struct kvm_coalesced_mmio_dev *dev, *tmp; int r; if (zone->pio != 1 && zone->pio != 0) return -EINVAL; mutex_lock(&kvm->slots_lock); list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) { if (zone->pio == dev->zone.pio && coalesced_mmio_in_range(dev, zone->addr, zone->size)) { r = kvm_io_bus_unregister_dev(kvm, zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev); /* * On failure, unregister destroys all devices on the * bus, including the target device. There's no need * to restart the walk as there aren't any zones left. */ if (r) break; } } mutex_unlock(&kvm->slots_lock); /* * Ignore the result of kvm_io_bus_unregister_dev(), from userspace's * perspective, the coalesced MMIO is most definitely unregistered. */ return 0; }
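coalesced_mmio_has_room() above treats ring->first as the oldest used entry (consumer) and 'last' as the first free entry (producer), and keeps one slot permanently unused so an empty ring and a full ring can be told apart. The short userspace model below (editor-added, not kernel code) reproduces only that modular-arithmetic check; demo_ring_has_room() and DEMO_RING_MAX are made-up names standing in for the real function and KVM_COALESCED_MMIO_MAX.

#include <stdio.h>

#define DEMO_RING_MAX 8U	/* stands in for KVM_COALESCED_MMIO_MAX */

/* Same modular arithmetic as coalesced_mmio_has_room(): 'first' is the
 * oldest used entry, 'last' is the first free entry.
 */
static int demo_ring_has_room(unsigned int first, unsigned int last)
{
	unsigned int avail = (first - last - 1) % DEMO_RING_MAX;

	return avail != 0;
}

int main(void)
{
	/* empty ring: first == last, so DEMO_RING_MAX - 1 entries are usable */
	printf("empty ring has room: %d\n", demo_ring_has_room(0, 0));

	/* producer one slot behind the consumer: the ring is full */
	printf("full ring has room:  %d\n", demo_ring_has_room(3, 2));

	return 0;
}
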
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the UDP module. * * Version: @(#)udp.h 1.0.2 05/07/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Alan Cox : Turned on udp checksums. I don't want to * chase 'memory corruption' bugs that aren't! 
*/ #ifndef _UDP_H #define _UDP_H #include <linux/list.h> #include <linux/bug.h> #include <net/inet_sock.h> #include <net/gso.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> #include <linux/ipv6.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/indirect_call_wrapper.h> /** * struct udp_skb_cb - UDP(-Lite) private variables * * @header: private variables used by IPv4/IPv6 * @cscov: checksum coverage length (UDP-Lite only) * @partial_cov: if set indicates partial csum coverage */ struct udp_skb_cb { union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; __u16 cscov; __u8 partial_cov; }; #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** * struct udp_hslot - UDP hash slot * * @head: head of list of sockets * @count: number of sockets in 'head' list * @lock: spinlock protecting changes to head/count */ struct udp_hslot { struct hlist_head head; int count; spinlock_t lock; } __attribute__((aligned(2 * sizeof(long)))); /** * struct udp_table - UDP table * * @hash: hash table, sockets are hashed on (local port) * @hash2: hash table, sockets are hashed on (local port, local address) * @mask: number of slots in hash tables, minus 1 * @log: log2(number of slots in hash table) */ struct udp_table { struct udp_hslot *hash; struct udp_hslot *hash2; unsigned int mask; unsigned int log; }; extern struct udp_table udp_table; void udp_table_init(struct udp_table *, const char *); static inline struct udp_hslot *udp_hashslot(struct udp_table *table, struct net *net, unsigned int num) { return &table->hash[udp_hashfn(net, num, table->mask)]; } /* * For secondary hash, net_hash_mix() is performed before calling * udp_hashslot2(), this explains difference with udp_hashslot() */ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, unsigned int hash) { return &table->hash2[hash & table->mask]; } extern struct proto udp_prot; extern atomic_long_t udp_memory_allocated; DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); /* sysctl variables for udp */ extern long sysctl_udp_mem[3]; extern int sysctl_udp_rmem_min; extern int sysctl_udp_wmem_min; struct sk_buff; /* * Generic checksumming routines for UDP(-Lite) v4 and v6 */ static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb) { return (UDP_SKB_CB(skb)->cscov == skb->len ? 
__skb_checksum_complete(skb) : __skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov)); } static inline int udp_lib_checksum_complete(struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && __udp_lib_checksum_complete(skb); } /** * udp_csum_outgoing - compute UDPv4/v6 checksum over fragments * @sk: socket we are writing to * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) */ static inline __wsum udp_csum_outgoing(struct sock *sk, struct sk_buff *skb) { __wsum csum = csum_partial(skb_transport_header(skb), sizeof(struct udphdr), 0); skb_queue_walk(&sk->sk_write_queue, skb) { csum = csum_add(csum, skb->csum); } return csum; } static inline __wsum udp_csum(struct sk_buff *skb) { __wsum csum = csum_partial(skb_transport_header(skb), sizeof(struct udphdr), skb->csum); for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) { csum = csum_add(csum, skb->csum); } return csum; } static inline __sum16 udp_v4_check(int len, __be32 saddr, __be32 daddr, __wsum base) { return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base); } void udp_set_csum(bool nocheck, struct sk_buff *skb, __be32 saddr, __be32 daddr, int len); static inline void udp_csum_pull_header(struct sk_buff *skb) { if (!skb->csum_valid && skb->ip_summed == CHECKSUM_NONE) skb->csum = csum_partial(skb->data, sizeof(struct udphdr), skb->csum); skb_pull_rcsum(skb, sizeof(struct udphdr)); UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr); } typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport, __be16 dport); void udp_v6_early_demux(struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); static inline void udp_lib_init_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); skb_queue_head_init(&up->reader_queue); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline int udp_lib_hash(struct sock *sk) { BUG(); return 0; } void udp_lib_unhash(struct sock *sk); void udp_lib_rehash(struct sock *sk, u16 new_hash); static inline void udp_lib_close(struct sock *sk, long timeout) { sk_common_release(sk); } int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr); u32 udp_flow_hashrnd(void); static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, int min, int max, bool use_eth) { u32 hash; if (min >= max) { /* Use default range */ inet_get_local_port_range(net, &min, &max); } hash = skb_get_hash(skb); if (unlikely(!hash)) { if (use_eth) { /* Can't find a normal hash, caller has indicated an * Ethernet packet so use that to compute a hash. */ hash = jhash(skb->data, 2 * ETH_ALEN, (__force u32) skb->protocol); } else { /* Can't derive any sort of hash for the packet, set * to some consistent random value. */ hash = udp_flow_hashrnd(); } } /* Since this is being sent on the wire obfuscate hash a bit * to minimize possbility that any useful information to an * attacker is leaked. Only upper 16 bits are relevant in the * computation for 16 bit port value. 
*/ hash ^= hash << 16; return htons((((u64) hash * (max - min)) >> 32) + min); } static inline int udp_rqueue_get(struct sock *sk) { return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); } static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_udp_l3mdev_accept), bound_dev_if, dif, sdif); #else return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); #endif } /* net/ipv4/udp.c */ void udp_destruct_common(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb); void udp_skb_destructor(struct sock *sk, struct sk_buff *skb); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off, int *err); static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, int *err) { int off = 0; return __skb_recv_udp(sk, flags, &off, err); } int udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); void udp_splice_eof(struct socket *sock); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, int *karg); int udp_init_sock(struct sock *sk); int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, bool is_ipv6); int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int udp_lib_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)); struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif); struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() */ struct udp_dev_scratch { /* skb->truesize and the stateless bit are embedded in a single field; * do not use a bitfield since the compiler emits better/smaller code * this way */ u32 _tsize_state; #if BITS_PER_LONG == 64 /* len and the bit needed to compute skb_csum_unnecessary * will be 
on cold cache lines at recvmsg time. * skb->len can be stored on 16 bits since the udp header has been * already validated and pulled. */ u16 len; bool is_linear; bool csum_unnecessary; #endif }; static inline struct udp_dev_scratch *udp_skb_scratch(struct sk_buff *skb) { return (struct udp_dev_scratch *)&skb->dev_scratch; } #if BITS_PER_LONG == 64 static inline unsigned int udp_skb_len(struct sk_buff *skb) { return udp_skb_scratch(skb)->len; } static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb) { return udp_skb_scratch(skb)->csum_unnecessary; } static inline bool udp_skb_is_linear(struct sk_buff *skb) { return udp_skb_scratch(skb)->is_linear; } #else static inline unsigned int udp_skb_len(struct sk_buff *skb) { return skb->len; } static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb) { return skb_csum_unnecessary(skb); } static inline bool udp_skb_is_linear(struct sk_buff *skb) { return !skb_is_nonlinear(skb); } #endif static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, struct iov_iter *to) { int n; n = copy_to_iter(skb->data + off, len, to); if (n == len) return 0; iov_iter_revert(to, n); return -EFAULT; } /* * SNMP statistics for UDP and UDP-Lite */ #define UDP_INC_STATS(net, field, is_udplite) do { \ if (is_udplite) SNMP_INC_STATS((net)->mib.udplite_statistics, field); \ else SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0) #define __UDP_INC_STATS(net, field, is_udplite) do { \ if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_statistics, field); \ else __SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0) #define __UDP6_INC_STATS(net, field, is_udplite) do { \ if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);\ else __SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \ } while(0) #define UDP6_INC_STATS(net, field, __lite) do { \ if (__lite) SNMP_INC_STATS((net)->mib.udplite_stats_in6, field); \ else SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \ } while(0) #if IS_ENABLED(CONFIG_IPV6) #define __UDPX_MIB(sk, ipv4) \ ({ \ ipv4 ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \ sock_net(sk)->mib.udp_statistics) : \ (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 : \ sock_net(sk)->mib.udp_stats_in6); \ }) #else #define __UDPX_MIB(sk, ipv4) \ ({ \ IS_UDPLITE(sk) ? 
sock_net(sk)->mib.udplite_statistics : \ sock_net(sk)->mib.udp_statistics; \ }) #endif #define __UDPX_INC_STATS(sk, field) \ __SNMP_INC_STATS(__UDPX_MIB(sk, (sk)->sk_family == AF_INET), field) #ifdef CONFIG_PROC_FS struct udp_seq_afinfo { sa_family_t family; struct udp_table *udp_table; }; struct udp_iter_state { struct seq_net_private p; int bucket; }; void *udp_seq_start(struct seq_file *seq, loff_t *pos); void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos); void udp_seq_stop(struct seq_file *seq, void *v); extern const struct seq_operations udp_seq_ops; extern const struct seq_operations udp6_seq_ops; int udp4_proc_init(void); void udp4_proc_exit(void); #endif /* CONFIG_PROC_FS */ int udpv4_offload_init(void); void udp_init(void); DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void); void udp_encap_disable(void); #if IS_ENABLED(CONFIG_IPV6) DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void); #endif static inline struct sk_buff *udp_rcv_segment(struct sock *sk, struct sk_buff *skb, bool ipv4) { netdev_features_t features = NETIF_F_SG; struct sk_buff *segs; /* Avoid csum recalculation by skb_segment unless userspace explicitly * asks for the final checksum values */ if (!inet_get_convert_csum(sk)) features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; /* UDP segmentation expects packets of type CHECKSUM_PARTIAL or * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial * packets in udp_gro_complete_segment. As does UDP GSO, verified by * udp_send_skb. But when those packets are looped in dev_loopback_xmit * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY. * Reset in this specific case, where PARTIAL is both correct and * required. */ if (skb->pkt_type == PACKET_LOOPBACK) skb->ip_summed = CHECKSUM_PARTIAL; /* the GSO CB lays after the UDP one, no need to save and restore any * CB fragment */ segs = __skb_gso_segment(skb, features, false); if (IS_ERR_OR_NULL(segs)) { int segs_nr = skb_shinfo(skb)->gso_segs; atomic_add(segs_nr, &sk->sk_drops); SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, segs_nr); kfree_skb(skb); return NULL; } consume_skb(skb); return segs; } static inline void udp_post_segment_fix_csum(struct sk_buff *skb) { /* UDP-lite can't land here - no GRO */ WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov); /* UDP packets generated with UDP_SEGMENT and traversing: * * UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx) * * can reach an UDP socket with CHECKSUM_NONE, because * __iptunnel_pull_header() converts CHECKSUM_PARTIAL into NONE. * SKB_GSO_UDP_L4 or SKB_GSO_FRAGLIST packets with no UDP tunnel will * have a valid checksum, as the GRO engine validates the UDP csum * before the aggregation and nobody strips such info in between. * Instead of adding another check in the tunnel fastpath, we can force * a valid csum after the segmentation. * Additionally fixup the UDP CB. */ UDP_SKB_CB(skb)->cscov = skb->len; if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid) skb->csum_valid = 1; } #ifdef CONFIG_BPF_SYSCALL struct sk_psock; int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); #endif #endif /* _UDP_H */
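udp_flow_src_port() above folds the flow hash with hash ^= hash << 16 and then scales it into the requested port range with a 32x32->64 bit multiply, since only the upper 16 bits of the hash end up mattering for a 16-bit port value. The userspace sketch below (editor-added, not part of udp.h) mirrors just that arithmetic: demo_flow_src_port() is a hypothetical name, the port range is illustrative, and the kernel's final htons() conversion is omitted so the result can be printed in host byte order.

#include <stdio.h>
#include <stdint.h>

/* Mirrors the scaling in udp_flow_src_port(); the kernel additionally
 * passes the result through htons(), which is omitted here.
 */
static unsigned int demo_flow_src_port(uint32_t hash, int min, int max)
{
	/* fold the low bits into the upper half, as the kernel comment explains */
	hash ^= hash << 16;

	/* scale into [min, max): the upper 32 bits of the product pick the port */
	return (unsigned int)(((uint64_t)hash * (uint32_t)(max - min)) >> 32) + min;
}

int main(void)
{
	int min = 32768, max = 61000;	/* illustrative ephemeral port range */

	printf("port for hash 0x12345678: %u\n",
	       demo_flow_src_port(0x12345678u, min, max));
	printf("port for hash 0xdeadbeef: %u\n",
	       demo_flow_src_port(0xdeadbeefu, min, max));
	return 0;
}
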
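/*
 * Illustrative sketch, not part of the header above: __UDPX_INC_STATS()
 * hides the choice between the UDP and UDP-Lite MIBs and, when CONFIG_IPV6
 * is enabled, between the IPv4 and IPv6 counter arrays.
 * udp_count_rx_example() is a hypothetical name used only for this sketch.
 */
static void udp_count_rx_example(struct sock *sk)
{
	/* equivalent to open-coding
	 *   __SNMP_INC_STATS(__UDPX_MIB(sk, (sk)->sk_family == AF_INET),
	 *                    UDP_MIB_INDATAGRAMS);
	 * for the socket's protocol and address family
	 */
	__UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS);
}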
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/super.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/vfs.h> #include <linux/random.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/unicode.h> #include <linux/part_stat.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/fsnotify.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "mballoc.h" #include "fsmap.h" #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> static struct ext4_lazy_init *ext4_li_info; static DEFINE_MUTEX(ext4_li_mtx); static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_show_options(struct seq_file *seq, struct dentry *root); static void ext4_update_super(struct super_block *sb); static int ext4_commit_super(struct super_block *sb); static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum); static int ext4_validate_options(struct fs_context *fc); static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb); static void ext4_apply_options(struct fs_context *fc, struct super_block *sb); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); static void ext4_fc_free(struct fs_context *fc); static int ext4_init_fs_context(struct fs_context *fc); static void
ext4_kill_sb(struct super_block *sb); static const struct fs_parameter_spec ext4_param_specs[]; /* * Lock ordering * * page fault path: * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start * -> page lock -> i_data_sem (rw) * * buffered write path: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> page lock -> * i_data_sem (rw) * * truncate: * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> * page lock * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> * i_data_sem (rw) * * direct IO: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw) * * writepages: * transaction start -> page lock(s) -> i_data_sem (rw) */ static const struct fs_context_operations ext4_context_ops = { .parse_param = ext4_parse_param, .get_tree = ext4_get_tree, .reconfigure = ext4_reconfigure, .free = ext4_fc_free, }; #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) #endif static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { /* * buffer's verified bit is no longer valid after reading from * disk again due to write out error, clear it to make sure we * recheck the buffer contents. */ clear_buffer_verified(bh); bh->b_end_io = end_io ? end_io : end_buffer_read_sync; get_bh(bh); submit_bh(REQ_OP_READ | op_flags, bh); } void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return; } __ext4_read_bh(bh, op_flags, end_io); } int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return 0; } __ext4_read_bh(bh, op_flags, end_io); wait_on_buffer(bh); if (buffer_uptodate(bh)) return 0; return -EIO; } int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { lock_buffer(bh); if (!wait) { ext4_read_bh_nowait(bh, op_flags, NULL); return 0; } return ext4_read_bh(bh, op_flags, NULL); } /* * This works like __bread_gfp() except it uses ERR_PTR for error * returns. Currently with sb_bread it's impossible to distinguish * between ENOMEM and EIO situations (since both result in a NULL * return. 
*/ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, sector_t block, blk_opf_t op_flags, gfp_t gfp) { struct buffer_head *bh; int ret; bh = sb_getblk_gfp(sb, block, gfp); if (bh == NULL) return ERR_PTR(-ENOMEM); if (ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, ~__GFP_FS) | __GFP_MOVABLE; return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, ~__GFP_FS); return __ext4_sb_bread_gfp(sb, block, 0, gfp); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN); if (likely(bh)) { if (trylock_buffer(bh)) ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL); brelse(bh); } } static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } __le32 ext4_superblock_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; csum = ext4_chksum(sbi, ~0, (char *)es, offset); return cpu_to_le32(csum); } static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_metadata_csum(sb)) return 1; return es->s_checksum == ext4_superblock_csum(sb, es); } void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (!ext4_has_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(sb, es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_block_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_table_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); } __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_inodes_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_used_dirs_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
(__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); } __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_itable_unused_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); } void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_table_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); } void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); } void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); } void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) { now = clamp_val(now, 0, (1ull << 40) - 1); *lo = cpu_to_le32(lower_32_bits(now)); *hi = upper_32_bits(now); } static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) { return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); } #define ext4_update_tstamp(es, tstamp) \ __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ ktime_get_real_seconds()) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) #define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ #define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ /* * The ext4_maybe_update_superblock() function checks and updates the * superblock if needed. * * This function is designed to update the on-disk superblock only under * certain conditions to prevent excessive disk writes and unnecessary * waking of the disk from sleep. The superblock will be updated if: * 1. More than an hour has passed since the last superblock update, and * 2. More than 16MB have been written since the last superblock update. 
* * @sb: The superblock */ static void ext4_maybe_update_superblock(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; journal_t *journal = sbi->s_journal; time64_t now; __u64 last_update; __u64 lifetime_write_kbytes; __u64 diff_size; if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || !journal || (journal->j_flags & JBD2_UNMOUNT)) return; now = ktime_get_real_seconds(); last_update = ext4_get_tstamp(es, s_wtime); if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) return; lifetime_write_kbytes = sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); /* Get the number of kilobytes not written to disk to account * for statistics and compare with a multiple of 16 MB. This * is used to determine when the next superblock commit should * occur (i.e. not more often than once per 16MB if there was * less written in an hour). */ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } /* * The del_gendisk() function uninitializes the disk-specific data * structures, including the bdi structure, without telling anyone * else. Once this happens, any attempt to call mark_buffer_dirty() * (for example, by ext4_commit_super), will cause a kernel OOPS. * This is a kludge to prevent these oops until we can put in a proper * hook in del_gendisk() to inform the VFS and file system layers. */ static int block_device_ejected(struct super_block *sb) { struct inode *bd_inode = sb->s_bdev->bd_inode; struct backing_dev_info *bdi = inode_to_bdi(bd_inode); return bdi->dev == NULL; } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int error = is_journal_aborted(journal); struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); ext4_maybe_update_superblock(sb); spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { jce = list_entry(txn->t_private_list.next, struct ext4_journal_cb_entry, jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); jce->jce_func(sb, jce, error); spin_lock(&sbi->s_md_lock); } spin_unlock(&sbi->s_md_lock); } /* * This writepage callback for write_cache_pages() * takes care of a few cases after page cleaning. * * write_cache_pages() already checks for dirty pages * and calls clear_page_dirty_for_io(), which we want, * to write protect the pages. * * However, we may have to redirty a page (see below.) */ static int ext4_journalled_writepage_callback(struct folio *folio, struct writeback_control *wbc, void *data) { transaction_t *transaction = (transaction_t *) data; struct buffer_head *bh, *head; struct journal_head *jh; bh = head = folio_buffers(folio); do { /* * We have to redirty a page in these cases: * 1) If buffer is dirty, it means the page was dirty because it * contains a buffer that needs checkpointing. So the dirty bit * needs to be preserved so that checkpointing writes the buffer * properly. * 2) If buffer is not part of the committing transaction * (we may have just accidentally come across this buffer because * inode range tracking is not exact) or if the currently running * transaction already contains this buffer as well, dirty bit * needs to be preserved so that the buffer gets writeprotected * properly on running transaction's commit. 
*/ jh = bh2jh(bh); if (buffer_dirty(bh) || (jh && (jh->b_transaction != transaction || jh->b_next_transaction))) { folio_redirty_for_writepage(wbc, folio); goto out; } } while ((bh = bh->b_this_page) != head); out: return AOP_WRITEPAGE_ACTIVATE; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; return write_cache_pages(mapping, &wbc, ext4_journalled_writepage_callback, jinode->i_transaction); } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { int ret; if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; } static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) { int ret = 0; if (!ext4_should_journal_data(jinode->i_vfs_inode)) ret = jbd2_journal_finish_inode_data_buffers(jinode); return ret; } static bool system_going_down(void) { return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || system_state == SYSTEM_RESTART; } struct ext4_err_translation { int code; int errno; }; #define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } static struct ext4_err_translation err_translation[] = { EXT4_ERR_TRANSLATE(EIO), EXT4_ERR_TRANSLATE(ENOMEM), EXT4_ERR_TRANSLATE(EFSBADCRC), EXT4_ERR_TRANSLATE(EFSCORRUPTED), EXT4_ERR_TRANSLATE(ENOSPC), EXT4_ERR_TRANSLATE(ENOKEY), EXT4_ERR_TRANSLATE(EROFS), EXT4_ERR_TRANSLATE(EFBIG), EXT4_ERR_TRANSLATE(EEXIST), EXT4_ERR_TRANSLATE(ERANGE), EXT4_ERR_TRANSLATE(EOVERFLOW), EXT4_ERR_TRANSLATE(EBUSY), EXT4_ERR_TRANSLATE(ENOTDIR), EXT4_ERR_TRANSLATE(ENOTEMPTY), EXT4_ERR_TRANSLATE(ESHUTDOWN), EXT4_ERR_TRANSLATE(EFAULT), }; static int ext4_errno_to_code(int errno) { int i; for (i = 0; i < ARRAY_SIZE(err_translation); i++) if (err_translation[i].errno == errno) return err_translation[i].code; return EXT4_ERR_UNKNOWN; } static void save_error_info(struct super_block *sb, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* We default to EFSCORRUPTED error... */ if (error == 0) error = EFSCORRUPTED; spin_lock(&sbi->s_error_lock); sbi->s_add_error_count++; sbi->s_last_error_code = error; sbi->s_last_error_line = line; sbi->s_last_error_ino = ino; sbi->s_last_error_block = block; sbi->s_last_error_func = func; sbi->s_last_error_time = ktime_get_real_seconds(); if (!sbi->s_first_error_time) { sbi->s_first_error_code = error; sbi->s_first_error_line = line; sbi->s_first_error_ino = ino; sbi->s_first_error_block = block; sbi->s_first_error_func = func; sbi->s_first_error_time = sbi->s_last_error_time; } spin_unlock(&sbi->s_error_lock); } /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * * On ext2, we can store the error state of the filesystem in the * superblock. That is not possible on ext4, because we may have other * write ordering constraints on the superblock which prevent us from * writing it out straight away; and given that the journal is about to * be aborted, we can't rely on the current, or future, transactions to * write out the superblock safely. * * We'll just use the jbd2_journal_abort() error code to record an error in * the journal instead. 
On recovery, the journal will complain about * that error until we've noted it down and cleared it. * * If force_ro is set, we unconditionally force the filesystem into an * ABORT|READONLY state, unless the error response on the fs has been set to * panic in which case we take the easy way out and panic immediately. This is * used to deal with unrecoverable failures such as journal IO errors or ENOMEM * at a critical moment in log management. */ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { journal_t *journal = EXT4_SB(sb)->s_journal; bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); if (!continue_fs && !sb_rdonly(sb)) { set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); if (journal) jbd2_journal_abort(journal, -EIO); } if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); /* * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. */ if (continue_fs && journal) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already * disabled. */ if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } if (sb_rdonly(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible before * ->s_flags update */ smp_wmb(); sb->s_flags |= SB_RDONLY; } static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; /* * If the journal is still running, we have to write out superblock * through the journal to avoid collisions of other journalled sb * updates. * * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. */ if (!sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; bool call_notify_err = false; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; if (jbd2_journal_get_write_access(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } if (sbi->s_add_error_count > 0) call_notify_err = true; ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } if (jbd2_journal_dirty_metadata(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } jbd2_journal_stop(handle); if (call_notify_err) ext4_notify_error_sysfs(sbi); return; } write_directly: /* * Write through journal failed. Write sb directly to get error info * out and hope for the best. */ ext4_commit_super(sbi->s_sb); ext4_notify_error_sysfs(sbi); } #define ext4_error_ratelimit(sb) \ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ "EXT4-fs error") void __ext4_error(struct super_block *sb, const char *function, unsigned int line, bool force_ro, int error, __u64 block, const char *fmt, ...) 
{ struct va_format vaf; va_list args; if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", sb->s_id, function, line, current->comm, &vaf); va_end(args); } fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED); ext4_handle_error(sb, force_ro, error, 0, block, function, line); } void __ext4_error_inode(struct inode *inode, const char *function, unsigned int line, ext4_fsblk_t block, int error, const char *fmt, ...) { va_list args; struct va_format vaf; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: block %llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, function, line); } void __ext4_error_file(struct file *file, const char *function, unsigned int line, ext4_fsblk_t block, const char *fmt, ...) { va_list args; struct va_format vaf; struct inode *inode = file_inode(file); char pathname[80], *path; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "block %llu: comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, path, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, path, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, function, line); } const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { char *errstr = NULL; switch (errno) { case -EFSCORRUPTED: errstr = "Corrupt filesystem"; break; case -EFSBADCRC: errstr = "Filesystem failed CRC"; break; case -EIO: errstr = "IO failure"; break; case -ENOMEM: errstr = "Out of memory"; break; case -EROFS: if (!sb || (EXT4_SB(sb)->s_journal && EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) errstr = "Journal has aborted"; else errstr = "Readonly filesystem"; break; default: /* If the caller passed in an extra buffer for unknown * errors, textualise them now. Else we just return * NULL. */ if (nbuf) { /* Check for truncated error codes... */ if (snprintf(nbuf, 16, "error %d", -errno) >= 0) errstr = nbuf; } break; } return errstr; } /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. 
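 * Note that the 'errno' argument is a negative error code here (e.g. -EIO
 * as returned by the journaling layer); it is negated before being passed
 * on to ext4_handle_error()/save_error_info(), which record positive values.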
*/ void __ext4_std_error(struct super_block *sb, const char *function, unsigned int line, int errno) { char nbuf[16]; const char *errstr; if (unlikely(ext4_forced_shutdown(sb))) return; /* Special case: if the error is EROFS, and we're not already * inside a transaction, then there's really no point in logging * an error. */ if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb)) return; if (ext4_error_ratelimit(sb)) { errstr = ext4_decode_error(sb, errno, nbuf); printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); } fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED); ext4_handle_error(sb, false, -errno, 0, 0, function, line); } void __ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { struct va_format vaf; va_list args; if (sb) { atomic_inc(&EXT4_SB(sb)->s_msg_count); if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (sb) printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); else printk("%sEXT4-fs: %pV\n", prefix, &vaf); va_end(args); } static int ext4_warning_ratelimit(struct super_block *sb) { atomic_inc(&EXT4_SB(sb)->s_warning_count); return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), "EXT4-fs warning"); } void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", sb->s_id, function, line, &vaf); va_end(args); } void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(inode->i_sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, unsigned long ino, ext4_fsblk_t block, const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { struct va_format vaf; va_list args; if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", sb->s_id, function, line, grp); if (ino) printk(KERN_CONT "inode %lu: ", ino); if (block) printk(KERN_CONT "block %llu:", (unsigned long long) block); printk(KERN_CONT "%pV\n", &vaf); va_end(args); } if (test_opt(sb, ERRORS_CONT)) { if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } ext4_unlock_group(sb, grp); ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line); /* * We only get here in the ERRORS_RO case; relocking the group * may be dangerous, but nothing bad will happen since the * filesystem will have already been marked read/only and the * journal has been aborted. 
We return 1 as a hint to callers
	 * who might want to use the return value from
	 * ext4_grp_locked_error() to distinguish between the
	 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
	 * aggressively from the ext4 function in question, with a
	 * more appropriate error code.
	 */
	ext4_lock_group(sb, grp);
	return;
}

void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
				      ext4_group_t group,
				      unsigned int flags)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
	int ret;

	if (!grp || !gdp)
		return;
	if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
		ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
					    &grp->bb_state);
		if (!ret)
			percpu_counter_sub(&sbi->s_freeclusters_counter,
					   grp->bb_free);
	}

	if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
		ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
					    &grp->bb_state);
		if (!ret && gdp) {
			int count;

			count = ext4_free_inodes_count(sb, gdp);
			percpu_counter_sub(&sbi->s_freeinodes_counter,
					   count);
		}
	}
}

void ext4_update_dynamic_rev(struct super_block *sb)
{
	struct ext4_super_block *es = EXT4_SB(sb)->s_es;

	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
		return;

	ext4_warning(sb,
		     "updating to rev %d because of new feature flag, "
		     "running e2fsck is recommended",
		     EXT4_DYNAMIC_REV);

	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
	/* leave es->s_feature_*compat flags alone */
	/* es->s_uuid will be set by e2fsck if empty */

	/*
	 * The rest of the superblock fields should be zero, and if not it
	 * means they are likely already in use, so leave them alone.  We
	 * can leave it up to e2fsck to clean up any inconsistencies there.
	 */
}

static inline struct inode *orphan_list_entry(struct list_head *l)
{
	return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
}

static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
{
	struct list_head *l;

	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
		 le32_to_cpu(sbi->s_es->s_last_orphan));

	printk(KERN_ERR "sb_info orphan list:\n");
	list_for_each(l, &sbi->s_orphan) {
		struct inode *inode = orphan_list_entry(l);
		printk(KERN_ERR "  "
		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
		       inode->i_sb->s_id, inode->i_ino, inode,
		       inode->i_mode, inode->i_nlink,
		       NEXT_ORPHAN(inode));
	}
}

#ifdef CONFIG_QUOTA
static int ext4_quota_off(struct super_block *sb, int type);

static inline void ext4_quotas_off(struct super_block *sb, int type)
{
	BUG_ON(type > EXT4_MAXQUOTAS);

	/* Use our quota_off function to clear inode flags etc. */
	for (type--; type >= 0; type--)
		ext4_quota_off(sb, type);
}

/*
 * This is a helper function which is used in the mount/remount
 * codepaths (which hold s_umount) to fetch the quota file name.
 */
static inline char *get_qf_name(struct super_block *sb,
				struct ext4_sb_info *sbi,
				int type)
{
	return rcu_dereference_protected(sbi->s_qf_names[type],
					 lockdep_is_held(&sb->s_umount));
}
#else
static inline void ext4_quotas_off(struct super_block *sb, int type)
{
}
#endif

static int ext4_percpu_param_init(struct ext4_sb_info *sbi)
{
	ext4_fsblk_t block;
	int err;

	block = ext4_count_free_clusters(sbi->s_sb);
	ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block));
	err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
				  GFP_KERNEL);
	if (!err) {
		unsigned long freei = ext4_count_free_inodes(sbi->s_sb);
		sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
		err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
					  GFP_KERNEL);
	}
	if (!err)
		err = percpu_counter_init(&sbi->s_dirs_counter,
					  ext4_count_dirs(sbi->s_sb),
					  GFP_KERNEL);
	if (!err)
		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
					  GFP_KERNEL);
	if (!err)
		err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
					  GFP_KERNEL);
	if (!err)
		err = percpu_init_rwsem(&sbi->s_writepages_rwsem);

	if (err)
		ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory");

	return err;
}

static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi)
{
	percpu_counter_destroy(&sbi->s_freeclusters_counter);
	percpu_counter_destroy(&sbi->s_freeinodes_counter);
	percpu_counter_destroy(&sbi->s_dirs_counter);
	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
	percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
	percpu_free_rwsem(&sbi->s_writepages_rwsem);
}

static void ext4_group_desc_free(struct ext4_sb_info *sbi)
{
	struct buffer_head **group_desc;
	int i;

	rcu_read_lock();
	group_desc = rcu_dereference(sbi->s_group_desc);
	for (i = 0; i < sbi->s_gdb_count; i++)
		brelse(group_desc[i]);
	kvfree(group_desc);
	rcu_read_unlock();
}

static void ext4_flex_groups_free(struct ext4_sb_info *sbi)
{
	struct flex_groups **flex_groups;
	int i;

	rcu_read_lock();
	flex_groups = rcu_dereference(sbi->s_flex_groups);
	if (flex_groups) {
		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
			kvfree(flex_groups[i]);
		kvfree(flex_groups);
	}
	rcu_read_unlock();
}

static void ext4_put_super(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	int aborted = 0;
	int err;

	/*
	 * Unregister sysfs before destroying the jbd2 journal: the
	 * attr_journal_task attribute could otherwise still be read via
	 * sysfs while sbi->s_journal->j_task is already NULL.
	 * Also unregister sysfs before flushing sbi->s_sb_upd_work: a user
	 * may read /proc/fs/ext4/xx/mb_groups during umount; if a metadata
	 * read fails verification, error work gets queued, and
	 * update_super_work calling start_this_handle() may then trigger a
	 * BUG_ON.
	 */
	ext4_unregister_sysfs(sb);

	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
		ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.",
			 &sb->s_uuid);

	ext4_unregister_li_request(sb);
	ext4_quotas_off(sb, EXT4_MAXQUOTAS);

	flush_work(&sbi->s_sb_upd_work);
	destroy_workqueue(sbi->rsv_conversion_wq);
	ext4_release_orphan_info(sb);

	if (sbi->s_journal) {
		aborted = is_journal_aborted(sbi->s_journal);
		err = jbd2_journal_destroy(sbi->s_journal);
		sbi->s_journal = NULL;
		if ((err < 0) && !aborted) {
			ext4_abort(sb, -err, "Couldn't clean up the journal");
		}
	}

	ext4_es_unregister_shrinker(sbi);
	timer_shutdown_sync(&sbi->s_err_report);
	ext4_release_system_zone(sb);
	ext4_mb_release(sb);
	ext4_ext_release(sb);

	if (!sb_rdonly(sb) && !aborted) {
		ext4_clear_feature_journal_needs_recovery(sb);
		ext4_clear_feature_orphan_present(sb);
		es->s_state = cpu_to_le16(sbi->s_mount_state);
	}
	if (!sb_rdonly(sb))
		ext4_commit_super(sb);

	ext4_group_desc_free(sbi);
	ext4_flex_groups_free(sbi);
	ext4_percpu_param_destroy(sbi);
#ifdef CONFIG_QUOTA
	for (int i = 0; i < EXT4_MAXQUOTAS; i++)
		kfree(get_qf_name(sb, sbi, i));
#endif

	/* Debugging code just in case the in-memory inode orphan list
	 * isn't empty.  The on-disk one can be non-empty if we've
	 * detected an error and taken the fs readonly, but the
	 * in-memory list had better be clean by this point. */
	if (!list_empty(&sbi->s_orphan))
		dump_orphan_list(sb, sbi);
	ASSERT(list_empty(&sbi->s_orphan));

	sync_blockdev(sb->s_bdev);
	invalidate_bdev(sb->s_bdev);
	if (sbi->s_journal_bdev_handle) {
		/*
		 * Invalidate the journal device's buffers.  We don't want
		 * them floating about in memory - the physical journal
		 * device may be hot-swapped, and it breaks the `ro-after'
		 * testing code.
		 */
		sync_blockdev(sbi->s_journal_bdev_handle->bdev);
		invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
	}

	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
	sbi->s_ea_inode_cache = NULL;

	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
	sbi->s_ea_block_cache = NULL;

	ext4_stop_mmpd(sbi);

	brelse(sbi->s_sbh);
	sb->s_fs_info = NULL;
	/*
	 * Now that we are completely done shutting down the
	 * superblock, we need to actually destroy the kobject.
*/ kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); } static struct kmem_cache *ext4_inode_cachep; /* * Called inside transaction, so use GFP_NOFS */ static struct inode *ext4_alloc_inode(struct super_block *sb) { struct ext4_inode_info *ei; ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); rwlock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_list); ei->i_es_all_nr = 0; ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); mutex_init(&ei->i_fc_lock); return &ei->vfs_inode; } static int ext4_drop_inode(struct inode *inode) { int drop = generic_drop_inode(inode); if (!drop) drop = fscrypt_drop_inode(inode); trace_ext4_drop_inode(inode, drop); return drop; } static void ext4_free_in_core_inode(struct inode *inode) { fscrypt_free_inode(inode); if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { pr_warn("%s: inode %ld still in fc list", __func__, inode->i_ino); } kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } static void ext4_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), true); dump_stack(); } if (EXT4_I(inode)->i_reserved_data_blocks) ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", inode->i_ino, EXT4_I(inode), EXT4_I(inode)->i_reserved_data_blocks); } static void ext4_shutdown(struct super_block *sb) { ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH); } static void init_once(void *foo) { struct ext4_inode_info *ei = foo; INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); } static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| SLAB_ACCOUNT), offsetof(struct ext4_inode_info, i_data), sizeof_field(struct ext4_inode_info, i_data), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. 
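 * (the VFS frees in-core inodes through call_rcu(), so the rcu_barrier()
 * below waits for those pending callbacks before the cache goes away).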
*/ rcu_barrier(); kmem_cache_destroy(ext4_inode_cachep); } void ext4_clear_inode(struct inode *inode) { ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); ext4_discard_preallocations(inode, 0); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } fscrypt_put_encryption_info(inode); fsverity_cleanup_inode(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct inode *inode; /* * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static int ext4_nfs_commit_metadata(struct inode *inode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL }; trace_ext4_nfs_commit_metadata(inode); return ext4_write_inode(inode, &wbc); } #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static struct dquot **ext4_get_dquots(struct inode *inode) { return EXT4_I(inode)->i_dquot; } static const struct dquot_operations ext4_quota_operations = { .get_reserved_space = ext4_get_reserved_space, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, .mark_dirty = ext4_mark_dquot_dirty, .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, .quota_off = ext4_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, .get_nextdqblk = dquot_get_next_dqblk, }; #endif static const struct super_operations ext4_sops = { .alloc_inode = ext4_alloc_inode, .free_inode = ext4_free_in_core_inode, .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, .freeze_fs = ext4_freeze, 
.unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .show_options = ext4_show_options, .shutdown = ext4_shutdown, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, .get_dquots = ext4_get_dquots, #endif }; static const struct export_operations ext4_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, .commit_metadata = ext4_nfs_commit_metadata, }; enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_acl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_inlinecrypt, Opt_usrjquota, Opt_grpjquota, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif }; static const struct constant_table ext4_param_errors[] = { {"continue", EXT4_MOUNT_ERRORS_CONT}, {"panic", EXT4_MOUNT_ERRORS_PANIC}, {"remount-ro", EXT4_MOUNT_ERRORS_RO}, {} }; static const struct constant_table ext4_param_data[] = { {"journal", EXT4_MOUNT_JOURNAL_DATA}, {"ordered", EXT4_MOUNT_ORDERED_DATA}, {"writeback", EXT4_MOUNT_WRITEBACK_DATA}, {} }; static const struct constant_table ext4_param_data_err[] = { {"abort", Opt_data_err_abort}, {"ignore", Opt_data_err_ignore}, {} }; static const struct constant_table ext4_param_jqfmt[] = { {"vfsold", QFMT_VFS_OLD}, {"vfsv0", QFMT_VFS_V0}, {"vfsv1", QFMT_VFS_V1}, {} }; static const struct constant_table ext4_param_dax[] = { {"always", Opt_dax_always}, {"inode", Opt_dax_inode}, {"never", Opt_dax_never}, {} }; /* String parameter that allows empty argument */ #define fsparam_string_empty(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) /* * Mount option specification * We don't use fsparam_flag_no because of the way we set the * options and the way we show them in _ext4_show_options(). To * keep the changes to a minimum, let's keep the negative options * separate for now. 
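 * (so, for example, "barrier" and "nobarrier" are listed as two separate
 * fsparam_flag() entries below rather than a single fsparam_flag_no()).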
*/ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("bsddf", Opt_bsd_df), fsparam_flag ("minixdf", Opt_minix_df), fsparam_flag ("grpid", Opt_grpid), fsparam_flag ("bsdgroups", Opt_grpid), fsparam_flag ("nogrpid", Opt_nogrpid), fsparam_flag ("sysvgroups", Opt_nogrpid), fsparam_u32 ("resgid", Opt_resgid), fsparam_u32 ("resuid", Opt_resuid), fsparam_u32 ("sb", Opt_sb), fsparam_enum ("errors", Opt_errors, ext4_param_errors), fsparam_flag ("nouid32", Opt_nouid32), fsparam_flag ("debug", Opt_debug), fsparam_flag ("oldalloc", Opt_removed), fsparam_flag ("orlov", Opt_removed), fsparam_flag ("user_xattr", Opt_user_xattr), fsparam_flag ("acl", Opt_acl), fsparam_flag ("norecovery", Opt_noload), fsparam_flag ("noload", Opt_noload), fsparam_flag ("bh", Opt_removed), fsparam_flag ("nobh", Opt_removed), fsparam_u32 ("commit", Opt_commit), fsparam_u32 ("min_batch_time", Opt_min_batch_time), fsparam_u32 ("max_batch_time", Opt_max_batch_time), fsparam_u32 ("journal_dev", Opt_journal_dev), fsparam_bdev ("journal_path", Opt_journal_path), fsparam_flag ("journal_checksum", Opt_journal_checksum), fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum), fsparam_flag ("journal_async_commit",Opt_journal_async_commit), fsparam_flag ("abort", Opt_abort), fsparam_enum ("data", Opt_data, ext4_param_data), fsparam_enum ("data_err", Opt_data_err, ext4_param_data_err), fsparam_string_empty ("usrjquota", Opt_usrjquota), fsparam_string_empty ("grpjquota", Opt_grpjquota), fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt), fsparam_flag ("grpquota", Opt_grpquota), fsparam_flag ("quota", Opt_quota), fsparam_flag ("noquota", Opt_noquota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("prjquota", Opt_prjquota), fsparam_flag ("barrier", Opt_barrier), fsparam_u32 ("barrier", Opt_barrier), fsparam_flag ("nobarrier", Opt_nobarrier), fsparam_flag ("i_version", Opt_removed), fsparam_flag ("dax", Opt_dax), fsparam_enum ("dax", Opt_dax_type, ext4_param_dax), fsparam_u32 ("stripe", Opt_stripe), fsparam_flag ("delalloc", Opt_delalloc), fsparam_flag ("nodelalloc", Opt_nodelalloc), fsparam_flag ("warn_on_error", Opt_warn_on_error), fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error), fsparam_u32 ("debug_want_extra_isize", Opt_debug_want_extra_isize), fsparam_flag ("mblk_io_submit", Opt_removed), fsparam_flag ("nomblk_io_submit", Opt_removed), fsparam_flag ("block_validity", Opt_block_validity), fsparam_flag ("noblock_validity", Opt_noblock_validity), fsparam_u32 ("inode_readahead_blks", Opt_inode_readahead_blks), fsparam_u32 ("journal_ioprio", Opt_journal_ioprio), fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc), fsparam_flag ("dioread_nolock", Opt_dioread_nolock), fsparam_flag ("nodioread_nolock", Opt_dioread_lock), fsparam_flag ("dioread_lock", Opt_dioread_lock), fsparam_flag ("discard", Opt_discard), fsparam_flag ("nodiscard", Opt_nodiscard), fsparam_u32 ("init_itable", Opt_init_itable), fsparam_flag ("init_itable", Opt_init_itable), fsparam_flag ("noinit_itable", Opt_noinit_itable), #ifdef CONFIG_EXT4_DEBUG fsparam_flag ("fc_debug_force", Opt_fc_debug_force), fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay), #endif fsparam_u32 ("max_dir_size_kb", Opt_max_dir_size_kb), fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_flag ("inlinecrypt", Opt_inlinecrypt), fsparam_flag ("nombcache", 
Opt_nombcache), fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */ fsparam_flag ("prefetch_block_bitmaps", Opt_removed), fsparam_flag ("no_prefetch_block_bitmaps", Opt_no_prefetch_block_bitmaps), fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan), fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */ fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */ {} }; #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 #define MOPT_EXPLICIT 0x0008 #ifdef CONFIG_QUOTA #define MOPT_Q 0 #define MOPT_QFMT 0x0010 #else #define MOPT_Q MOPT_NOSUPPORT #define MOPT_QFMT MOPT_NOSUPPORT #endif #define MOPT_NO_EXT2 0x0020 #define MOPT_NO_EXT3 0x0040 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_SKIP 0x0080 #define MOPT_2 0x0100 static const struct mount_opts { int token; int mount_opt; int flags; } ext4_mount_opts[] = { {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, {Opt_commit, 0, MOPT_NO_EXT2}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, {Opt_dax_type, 0, MOPT_EXT4_ONLY}, {Opt_journal_dev, 0, MOPT_NO_EXT2}, {Opt_journal_path, 0, MOPT_NO_EXT2}, {Opt_journal_ioprio, 0, MOPT_NO_EXT2}, {Opt_data, 0, MOPT_NO_EXT2}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, #else {Opt_acl, 0, MOPT_NOSUPPORT}, #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | 
EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q}, {Opt_grpjquota, 0, MOPT_Q}, {Opt_jqfmt, 0, MOPT_QFMT}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; #if IS_ENABLED(CONFIG_UNICODE) static const struct ext4_sb_encodings { __u16 magic; char *name; unsigned int version; } ext4_sb_encoding_map[] = { {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; static const struct ext4_sb_encodings * ext4_sb_read_encoding(const struct ext4_super_block *es) { __u16 magic = le16_to_cpu(es->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++) if (magic == ext4_sb_encoding_map[i].magic) return &ext4_sb_encoding_map[i]; return NULL; } #endif #define EXT4_SPEC_JQUOTA (1 << 0) #define EXT4_SPEC_JQFMT (1 << 1) #define EXT4_SPEC_DATAJ (1 << 2) #define EXT4_SPEC_SB_BLOCK (1 << 3) #define EXT4_SPEC_JOURNAL_DEV (1 << 4) #define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) #define EXT4_SPEC_s_want_extra_isize (1 << 7) #define EXT4_SPEC_s_max_batch_time (1 << 8) #define EXT4_SPEC_s_min_batch_time (1 << 9) #define EXT4_SPEC_s_inode_readahead_blks (1 << 10) #define EXT4_SPEC_s_li_wait_mult (1 << 11) #define EXT4_SPEC_s_max_dir_size_kb (1 << 12) #define EXT4_SPEC_s_stripe (1 << 13) #define EXT4_SPEC_s_resuid (1 << 14) #define EXT4_SPEC_s_resgid (1 << 15) #define EXT4_SPEC_s_commit_interval (1 << 16) #define EXT4_SPEC_s_fc_debug_max_replay (1 << 17) #define EXT4_SPEC_s_sb_block (1 << 18) #define EXT4_SPEC_mb_optimize_scan (1 << 19) struct ext4_fs_context { char *s_qf_names[EXT4_MAXQUOTAS]; struct fscrypt_dummy_policy dummy_enc_policy; int s_jquota_fmt; /* Format of quota to use */ #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif unsigned short qname_spec; unsigned long vals_s_flags; /* Bits to set in s_flags */ unsigned long mask_s_flags; /* Bits changed in s_flags */ unsigned long journal_devnum; unsigned long s_commit_interval; unsigned long s_stripe; unsigned int s_inode_readahead_blks; unsigned int s_want_extra_isize; unsigned int s_li_wait_mult; unsigned int s_max_dir_size_kb; unsigned int journal_ioprio; unsigned int vals_s_mount_opt; unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; u32 s_min_batch_time; kuid_t s_resuid; kgid_t s_resgid; ext4_fsblk_t s_sb_block; }; static void ext4_fc_free(struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; int i; if (!ctx) return; for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(ctx->s_qf_names[i]); fscrypt_free_dummy_policy(&ctx->dummy_enc_policy); kfree(ctx); } int ext4_init_fs_context(struct fs_context *fc) { struct ext4_fs_context *ctx; ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; fc->fs_private = ctx; fc->ops = &ext4_context_ops; return 0; } #ifdef CONFIG_QUOTA /* * Note the name of the specified quota file. 
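 * Used for the usrjquota=/grpjquota= mount options: the name must not
 * contain a '/' (the quota file has to live in the filesystem root);
 * an empty value is handled by the caller through unnote_qf_name()
 * and clears a previously remembered name.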
*/ static int note_qf_name(struct fs_context *fc, int qtype, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; char *qname; if (param->size < 1) { ext4_msg(NULL, KERN_ERR, "Missing quota name"); return -EINVAL; } if (strchr(param->string, '/')) { ext4_msg(NULL, KERN_ERR, "quotafile must be on filesystem root"); return -EINVAL; } if (ctx->s_qf_names[qtype]) { if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) { ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(qtype)); return -EINVAL; } return 0; } qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); if (!qname) { ext4_msg(NULL, KERN_ERR, "Not enough memory for storing quotafile name"); return -ENOMEM; } ctx->s_qf_names[qtype] = qname; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } /* * Clear the name of the specified quota file. */ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; if (ctx->s_qf_names[qtype]) kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } #endif static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, struct ext4_fs_context *ctx) { int err; if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption option not supported"); return -EINVAL; } err = fscrypt_parse_test_dummy_encryption(param, &ctx->dummy_enc_policy); if (err == -EINVAL) { ext4_msg(NULL, KERN_WARNING, "Value of option \"%s\" is unrecognized", param->key); } else if (err == -EEXIST) { ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return err; } #define EXT4_SET_CTX(name) \ static inline void ctx_set_##name(struct ext4_fs_context *ctx, \ unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name |= flag; \ } #define EXT4_CLEAR_CTX(name) \ static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \ unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name &= ~flag; \ } #define EXT4_TEST_CTX(name) \ static inline unsigned long \ ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ return (ctx->vals_s_##name & flag); \ } EXT4_SET_CTX(flags); /* set only */ EXT4_SET_CTX(mount_opt); EXT4_CLEAR_CTX(mount_opt); EXT4_TEST_CTX(mount_opt); EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; struct fs_parse_result result; const struct mount_opts *m; int is_remount; kuid_t uid; kgid_t gid; int token; token = fs_parse(fc, ext4_param_specs, param, &result); if (token < 0) return token; is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; for (m = ext4_mount_opts; m->token != Opt_err; m++) if (token == m->token) break; ctx->opt_flags |= m->flags; if (m->flags & MOPT_EXPLICIT) { if (m->mount_opt & EXT4_MOUNT_DELALLOC) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC); } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM); } else return -EINVAL; } if (m->flags & MOPT_NOSUPPORT) { ext4_msg(NULL, KERN_ERR, "%s option not supported", param->key); return 0; } switch (token) { #ifdef CONFIG_QUOTA case Opt_usrjquota: if (!*param->string) return unnote_qf_name(fc, USRQUOTA); else return note_qf_name(fc, USRQUOTA, param); case Opt_grpjquota: if (!*param->string) return 
unnote_qf_name(fc, GRPQUOTA); else return note_qf_name(fc, GRPQUOTA, param); #endif case Opt_sb: if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { ext4_msg(NULL, KERN_WARNING, "Ignoring %s option on remount", param->key); } else { ctx->s_sb_block = result.uint_32; ctx->spec |= EXT4_SPEC_s_sb_block; } return 0; case Opt_removed: ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); #else ext4_msg(NULL, KERN_ERR, "inline encryption not supported"); #endif return 0; case Opt_errors: ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK); ctx_set_mount_opt(ctx, result.uint_32); return 0; #ifdef CONFIG_QUOTA case Opt_jqfmt: ctx->s_jquota_fmt = result.uint_32; ctx->spec |= EXT4_SPEC_JQFMT; return 0; #endif case Opt_data: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); ctx_set_mount_opt(ctx, result.uint_32); ctx->spec |= EXT4_SPEC_DATAJ; return 0; case Opt_commit: if (result.uint_32 == 0) result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE; else if (result.uint_32 > INT_MAX / HZ) { ext4_msg(NULL, KERN_ERR, "Invalid commit interval %d, " "must be smaller than %d", result.uint_32, INT_MAX / HZ); return -EINVAL; } ctx->s_commit_interval = HZ * result.uint_32; ctx->spec |= EXT4_SPEC_s_commit_interval; return 0; case Opt_debug_want_extra_isize: if ((result.uint_32 & 1) || (result.uint_32 < 4)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", result.uint_32); return -EINVAL; } ctx->s_want_extra_isize = result.uint_32; ctx->spec |= EXT4_SPEC_s_want_extra_isize; return 0; case Opt_max_batch_time: ctx->s_max_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_batch_time; return 0; case Opt_min_batch_time: ctx->s_min_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_min_batch_time; return 0; case Opt_inode_readahead_blks: if (result.uint_32 && (result.uint_32 > (1 << 30) || !is_power_of_2(result.uint_32))) { ext4_msg(NULL, KERN_ERR, "EXT4-fs: inode_readahead_blks must be " "0 or a power of 2 smaller than 2^31"); return -EINVAL; } ctx->s_inode_readahead_blks = result.uint_32; ctx->spec |= EXT4_SPEC_s_inode_readahead_blks; return 0; case Opt_init_itable: ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE); ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; if (param->type == fs_value_is_string) ctx->s_li_wait_mult = result.uint_32; ctx->spec |= EXT4_SPEC_s_li_wait_mult; return 0; case Opt_max_dir_size_kb: ctx->s_max_dir_size_kb = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_dir_size_kb; return 0; #ifdef CONFIG_EXT4_DEBUG case Opt_fc_debug_max_replay: ctx->s_fc_debug_max_replay = result.uint_32; ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay; return 0; #endif case Opt_stripe: ctx->s_stripe = result.uint_32; ctx->spec |= EXT4_SPEC_s_stripe; return 0; case Opt_resuid: uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(uid)) { ext4_msg(NULL, KERN_ERR, "Invalid uid value %d", result.uint_32); return -EINVAL; } ctx->s_resuid = uid; ctx->spec |= EXT4_SPEC_s_resuid; return 0; case Opt_resgid: gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(gid)) { ext4_msg(NULL, KERN_ERR, "Invalid gid value %d", result.uint_32); return -EINVAL; } ctx->s_resgid = gid; ctx->spec |= EXT4_SPEC_s_resgid; return 0; case Opt_journal_dev: if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } ctx->journal_devnum = result.uint_32; ctx->spec |= EXT4_SPEC_JOURNAL_DEV; return 0; case Opt_journal_path: { struct inode *journal_inode; struct path 
path; int error; if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); return -EINVAL; } journal_inode = d_inode(path.dentry); ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev); ctx->spec |= EXT4_SPEC_JOURNAL_DEV; path_put(&path); return 0; } case Opt_journal_ioprio: if (result.uint_32 > 7) { ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority" " (must be 0-7)"); return -EINVAL; } ctx->journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32); ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; return 0; case Opt_test_dummy_encryption: return ext4_parse_test_dummy_encryption(param, ctx); case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX { int type = (token == Opt_dax) ? Opt_dax : result.uint_32; switch (type) { case Opt_dax: case Opt_dax_always: ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); break; case Opt_dax_never: ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); break; case Opt_dax_inode: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); /* Strictly for printing options */ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE); break; } return 0; } #else ext4_msg(NULL, KERN_INFO, "dax option not supported"); return -EINVAL; #endif case Opt_data_err: if (result.uint_32 == Opt_data_err_abort) ctx_set_mount_opt(ctx, m->mount_opt); else if (result.uint_32 == Opt_data_err_ignore) ctx_clear_mount_opt(ctx, m->mount_opt); return 0; case Opt_mb_optimize_scan: if (result.int_32 == 1) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else if (result.int_32 == 0) { ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else { ext4_msg(NULL, KERN_WARNING, "mb_optimize_scan should be set to 0 or 1."); return -EINVAL; } return 0; } /* * At this point we should only be getting options requiring MOPT_SET, * or MOPT_CLEAR. 
Anything else is a bug */ if (m->token == Opt_err) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } else { unsigned int set = 0; if ((param->type == fs_value_is_flag) || result.uint_32 > 0) set = 1; if (m->flags & MOPT_CLEAR) set = !set; else if (unlikely(!(m->flags & MOPT_SET))) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } if (m->flags & MOPT_2) { if (set != 0) ctx_set_mount_opt2(ctx, m->mount_opt); else ctx_clear_mount_opt2(ctx, m->mount_opt); } else { if (set != 0) ctx_set_mount_opt(ctx, m->mount_opt); else ctx_clear_mount_opt(ctx, m->mount_opt); } } return 0; } static int parse_options(struct fs_context *fc, char *options) { struct fs_parameter param; int ret; char *key; if (!options) return 0; while ((key = strsep(&options, ",")) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); param.type = fs_value_is_flag; param.string = NULL; if (value) { if (value == key) continue; *value++ = 0; v_len = strlen(value); param.string = kmemdup_nul(value, v_len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; } param.key = key; param.size = v_len; ret = ext4_parse_param(fc, &param); if (param.string) kfree(param.string); if (ret < 0) return ret; } } ret = ext4_validate_options(fc); if (ret < 0) return ret; return 0; } static int parse_apply_sb_mount_options(struct super_block *sb, struct ext4_fs_context *m_ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); char *s_mount_opts = NULL; struct ext4_fs_context *s_ctx = NULL; struct fs_context *fc = NULL; int ret = -ENOMEM; if (!sbi->s_es->s_mount_opts[0]) return 0; s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, sizeof(sbi->s_es->s_mount_opts), GFP_KERNEL); if (!s_mount_opts) return ret; fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); if (!fc) goto out_free; s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!s_ctx) goto out_free; fc->fs_private = s_ctx; fc->s_fs_info = sbi; ret = parse_options(fc, s_mount_opts); if (ret < 0) goto parse_failed; ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) { parse_failed: ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", s_mount_opts); ret = 0; goto out_free; } if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV) m_ctx->journal_devnum = s_ctx->journal_devnum; if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO) m_ctx->journal_ioprio = s_ctx->journal_ioprio; ext4_apply_options(fc, sb); ret = 0; out_free: if (fc) { ext4_fc_free(fc); kfree(fc); } kfree(s_mount_opts); return ret; } static void ext4_apply_quota_options(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA bool quota_feature = ext4_has_feature_quota(sb); struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); char *qname; int i; if (quota_feature) return; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; qname = ctx->s_qf_names[i]; /* May be NULL */ if (qname) set_opt(sb, QUOTA); ctx->s_qf_names[i] = NULL; qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, lockdep_is_held(&sb->s_umount)); if (qname) kfree_rcu_mightsleep(qname); } } if (ctx->spec & EXT4_SPEC_JQFMT) sbi->s_jquota_fmt = ctx->s_jquota_fmt; #endif } /* * Check quota settings consistency. 
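 * In particular: project quota requires the "project" feature, quota
 * options cannot be changed while quota is turned on, journaled quota
 * file names and format cannot be changed while quota is turned on,
 * and old-style usrquota/grpquota cannot be mixed with journaled
 * usrjquota=/grpjquota= names.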
*/ static int ext4_check_quota_consistency(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); bool quota_feature = ext4_has_feature_quota(sb); bool quota_loaded = sb_any_quota_loaded(sb); bool usr_qf_name, grp_qf_name, usrquota, grpquota; int quota_flags, i; /* * We do the test below only for project quotas. 'usrquota' and * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) && !ext4_has_feature_project(sb)) { ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. " "Cannot enable project quota enforcement."); return -EINVAL; } quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA; if (quota_loaded && ctx->mask_s_mount_opt & quota_flags && !ctx_test_mount_opt(ctx, quota_flags)) goto err_quota_change; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; if (quota_loaded && !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i]) goto err_jquota_change; if (sbi->s_qf_names[i] && ctx->s_qf_names[i] && strcmp(get_qf_name(sb, sbi, i), ctx->s_qf_names[i]) != 0) goto err_jquota_specified; } if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Journaled quota options ignored when " "QUOTA feature is enabled"); return 0; } } if (ctx->spec & EXT4_SPEC_JQFMT) { if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded) goto err_jquota_change; if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Quota format mount options " "ignored when QUOTA feature is enabled"); return 0; } } /* Make sure we don't mix old and new quota format */ usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) || ctx->s_qf_names[USRQUOTA]); grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) || ctx->s_qf_names[GRPQUOTA]); usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || test_opt(sb, USRQUOTA)); grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) || test_opt(sb, GRPQUOTA)); if (usr_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); usrquota = false; } if (grp_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); grpquota = false; } if (usr_qf_name || grp_qf_name) { if (usrquota || grpquota) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) { ext4_msg(NULL, KERN_ERR, "journaled quota format " "not specified"); return -EINVAL; } } return 0; err_quota_change: ext4_msg(NULL, KERN_ERR, "Cannot change quota options when quota turned on"); return -EINVAL; err_jquota_change: ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota " "options when quota turned on"); return -EINVAL; err_jquota_specified: ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(i)); return -EINVAL; #else return 0; #endif } static int ext4_check_test_dummy_encryption(const struct fs_context *fc, struct super_block *sb) { const struct ext4_fs_context *ctx = fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; if (!ext4_has_feature_encrypt(sb)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption requires encrypt feature"); return -EINVAL; } /* * This mount option is just for testing, and it's not worthwhile to * implement the extra complexity (e.g. RCU protection) that would be * needed to allow it to be set or changed during remount. 
We do allow * it to be specified during remount, but only if there is no change. */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Can't set or change test_dummy_encryption on remount"); return -EINVAL; } /* Also make sure s_mount_opts didn't contain a conflicting value. */ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return 0; } static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, struct super_block *sb) { if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) || /* if already set, it was already verified to be the same */ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy)) return; EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy; memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy)); ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); } static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; int err; if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext2"); return -EINVAL; } if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext3"); return -EINVAL; } if (ctx->s_want_extra_isize > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", ctx->s_want_extra_isize); return -EINVAL; } err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) { if (!sbi->s_journal) { ext4_msg(NULL, KERN_WARNING, "Remounting file system with no journal " "so ignoring journalled data option"); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) != test_opt(sb, DATA_FLAGS)) { ext4_msg(NULL, KERN_ERR, "Cannot change data mode " "on remount"); return -EINVAL; } } if (is_remount) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { fail_dax_change_remount: ext4_msg(NULL, KERN_ERR, "can't change " "dax mount option while remounting"); return -EINVAL; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) && (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) { goto fail_dax_change_remount; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) && ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) { goto fail_dax_change_remount; } } return ext4_check_quota_consistency(fc, sb); } static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; sbi->s_mount_opt &= ~ctx->mask_s_mount_opt; sbi->s_mount_opt |= ctx->vals_s_mount_opt; 
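	/*
	 * mask_s_* records which bits the parsed options touched and
	 * vals_s_* the values they should take (see EXT4_SET_CTX /
	 * EXT4_CLEAR_CTX above), so only options that were explicitly
	 * specified are modified; everything else keeps its current value,
	 * which is what remount relies on.  s_mount_opt2 and s_flags are
	 * applied the same way below.
	 */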
sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; #define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; }) APPLY(s_commit_interval); APPLY(s_stripe); APPLY(s_max_batch_time); APPLY(s_min_batch_time); APPLY(s_want_extra_isize); APPLY(s_inode_readahead_blks); APPLY(s_max_dir_size_kb); APPLY(s_li_wait_mult); APPLY(s_resgid); APPLY(s_resuid); #ifdef CONFIG_EXT4_DEBUG APPLY(s_fc_debug_max_replay); #endif ext4_apply_quota_options(fc, sb); ext4_apply_test_dummy_encryption(ctx, sb); } static int ext4_validate_options(struct fs_context *fc) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; char *usr_qf_name, *grp_qf_name; usr_qf_name = ctx->s_qf_names[USRQUOTA]; grp_qf_name = ctx->s_qf_names[GRPQUOTA]; if (usr_qf_name || grp_qf_name) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } } #endif return 1; } static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); char *usr_qf_name, *grp_qf_name; if (sbi->s_jquota_fmt) { char *fmtname = ""; switch (sbi->s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; case QFMT_VFS_V0: fmtname = "vfsv0"; break; case QFMT_VFS_V1: fmtname = "vfsv1"; break; } seq_printf(seq, ",jqfmt=%s", fmtname); } rcu_read_lock(); usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); if (usr_qf_name) seq_show_option(seq, "usrjquota", usr_qf_name); if (grp_qf_name) seq_show_option(seq, "grpjquota", grp_qf_name); rcu_read_unlock(); #endif } static const char *token2str(int token) { const struct fs_parameter_spec *spec; for (spec = ext4_param_specs; spec->name != NULL; spec++) if (spec->opt == token && !spec->type) break; return spec->name; } /* * Show an option if * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, int nodefs) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int def_errors; const struct mount_opts *m; char sep = nodefs ? 
'\n' : ','; #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) if (sbi->s_sb_block != 1) SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; int opt_2 = m->flags & MOPT_2; unsigned int mount_opt, def_mount_opt; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || m->flags & MOPT_SKIP) continue; if (opt_2) { mount_opt = sbi->s_mount_opt2; def_mount_opt = sbi->s_def_mount_opt2; } else { mount_opt = sbi->s_mount_opt; def_mount_opt = sbi->s_def_mount_opt; } /* skip if same as the default */ if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt))) continue; /* select Opt_noFoo vs Opt_Foo */ if ((want_set && (mount_opt & m->mount_opt) != m->mount_opt) || (!want_set && (mount_opt & m->mount_opt))) continue; SEQ_OPTS_PRINT("%s", token2str(m->token)); } if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) SEQ_OPTS_PRINT("resuid=%u", from_kuid_munged(&init_user_ns, sbi->s_resuid)); if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) SEQ_OPTS_PRINT("resgid=%u", from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) SEQ_OPTS_PUTS("errors=remount-ro"); if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) SEQ_OPTS_PUTS("errors=continue"); if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) SEQ_OPTS_PUTS("errors=panic"); if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) SEQ_OPTS_PUTS("data=ordered"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) SEQ_OPTS_PUTS("data=writeback"); } if (nodefs || sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) SEQ_OPTS_PRINT("inode_readahead_blks=%u", sbi->s_inode_readahead_blks); if (test_opt(sb, INIT_INODE_TABLE) && (nodefs || (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); fscrypt_show_test_dummy_encryption(seq, sep, sb); if (sb->s_flags & SB_INLINECRYPT) SEQ_OPTS_PUTS("inlinecrypt"); if (test_opt(sb, DAX_ALWAYS)) { if (IS_EXT2_SB(sb)) SEQ_OPTS_PUTS("dax"); else SEQ_OPTS_PUTS("dax=always"); } else if (test_opt2(sb, DAX_NEVER)) { SEQ_OPTS_PUTS("dax=never"); } else if (test_opt2(sb, DAX_INODE)) { SEQ_OPTS_PUTS("dax=inode"); } if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD && !test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=0"); } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD && test_opt2(sb, 
MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=1"); } ext4_show_quota_options(seq, sb); return 0; } static int ext4_show_options(struct seq_file *seq, struct dentry *root) { return _ext4_show_options(seq, root->d_sb, 0); } int ext4_seq_options_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; int rc; seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); seq_puts(seq, "\n"); return rc; } static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); err = -EROFS; goto done; } if (read_only) goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); else if (sbi->s_mount_state & EXT4_ERROR_FS) ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ext4_msg(sb, KERN_WARNING, "warning: maximal mount count reached, " "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (ext4_get_tstamp(es, s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds())) ext4_msg(sb, KERN_WARNING, "warning: checktime reached, " "running e2fsck is recommended"); if (!sbi->s_journal) es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); if (sbi->s_journal) { ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } err = ext4_commit_super(sb); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); return err; } int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct flex_groups **old_groups, **new_groups; int size, i, j; if (!sbi->s_log_groups_per_flex) return 0; size = ext4_flex_group(sbi, ngroup - 1) + 1; if (size <= sbi->s_flex_groups_allocated) return 0; new_groups = kvzalloc(roundup_pow_of_two(size * sizeof(*sbi->s_flex_groups)), GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex group pointers", size); return -ENOMEM; } for (i = sbi->s_flex_groups_allocated; i < size; i++) { new_groups[i] = kvzalloc(roundup_pow_of_two( sizeof(struct flex_groups)), GFP_KERNEL); if (!new_groups[i]) { for (j = sbi->s_flex_groups_allocated; j < i; j++) kvfree(new_groups[j]); kvfree(new_groups); ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size); return -ENOMEM; } } rcu_read_lock(); old_groups = rcu_dereference(sbi->s_flex_groups); if (old_groups) memcpy(new_groups, old_groups, (sbi->s_flex_groups_allocated * sizeof(struct flex_groups *))); rcu_read_unlock(); rcu_assign_pointer(sbi->s_flex_groups, new_groups); sbi->s_flex_groups_allocated = size; if (old_groups) ext4_kvfree_array_rcu(old_groups); return 0; } static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); 
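	/*
	 * Illustrative sizing, assuming the common mke2fs default of
	 * s_log_groups_per_flex == 4: 16 block groups share one
	 * flex_groups entry, so a filesystem with 1000 groups needs 63
	 * entries, and the loop below folds each group's free-inode,
	 * free-cluster and used-dirs counters into its flex group.
	 */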
struct ext4_group_desc *gdp = NULL; struct flex_groups *fg; ext4_group_t flex_group; int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); if (err) goto failed; for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); atomic64_add(ext4_free_group_clusters(sb, gdp), &fg->free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); } return 1; failed: return 0; } static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { int offset = offsetof(struct ext4_group_desc, bg_checksum); __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; goto out; } /* old crc16 code */ if (!ext4_has_feature_gdt_csum(sb)) return 0; crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) crc = crc16(crc, (__u8 *)gdp + offset, sbi->s_desc_size - offset); out: return cpu_to_le16(crc); } int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (ext4_has_group_desc_csum(sb) && (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))) return 0; return 1; } void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (!ext4_has_group_desc_csum(sb)) return; gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp); } /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); ext4_fsblk_t last_block; ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0); ext4_fsblk_t block_bitmap; ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; int flexbg_flag = 0; ext4_group_t i, grp = sbi->s_groups_count; if (ext4_has_feature_flex_bg(sb)) flexbg_flag = 1; ext4_debug("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (i == sbi->s_groups_count - 1 || flexbg_flag) last_block = ext4_blocks_count(sbi->s_es) - 1; else last_block = first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); if ((grp == sbi->s_groups_count) && !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, 
"ext4_check_descriptors: " "Block bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap >= sb_block + 1 && block_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap >= sb_block + 1 && inode_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_table >= sb_block + 1 && inode_table <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u not in group " "(block %llu)!", i, inode_table); return 0; } ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sb, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!sb_rdonly(sb)) { ext4_unlock_group(sb, i); return 0; } } ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } if (NULL != first_not_zeroed) *first_not_zeroed = grp; return 1; } /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk * extent format containers, within a sector_t, and within i_blocks * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, * so that won't be a limiting factor. * * However there is other limiting factor. We do store extents in the form * of starting block and length, hence the resulting length of the extent * covering maximum file size must fit into on-disk format containers as * well. Given that length is always by 1 unit bigger than max unit (because * we count 0 as well) we have to lower the s_maxbytes by one fs block. * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64)); if (!has_huge_files) { upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (blkbits - 9); upper_limit <<= blkbits; } /* * 32-bit extent-start container, ee_block. 
 * We lower the maxbytes
 * by one fs block, so ee_len can cover the extent of maximum file
 * size
 */
	res = (1LL << 32) - 1;
	res <<= blkbits;

	/* Sanity check against vm- & vfs- imposed limits */
	if (res > upper_limit)
		res = upper_limit;

	return res;
}

/*
 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
 * We need to be 1 filesystem block less than the 2^48 sector limit.
 */
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
	loff_t upper_limit, res = EXT4_NDIR_BLOCKS;
	int meta_blocks;
	unsigned int ppb = 1 << (bits - 2);

	/*
	 * This is calculated to be the largest file size for a dense, block
	 * mapped file such that the file's total number of 512-byte sectors,
	 * including data and all indirect blocks, does not exceed (2^48 - 1).
	 *
	 * __u32 i_blocks_lo and __u16 i_blocks_high represent the total
	 * number of 512-byte sectors of the file.
	 */
	if (!has_huge_files) {
		/*
		 * !has_huge_files implies that the on-disk i_blocks field
		 * counts 512-byte sectors and only the 32-bit i_blocks_lo
		 * part is used, so the limit is 2^32 - 1 sectors.
		 */
		upper_limit = (1LL << 32) - 1;

		/* total blocks in file system block size */
		upper_limit >>= (bits - 9);

	} else {
		/*
		 * We use 48 bit ext4_inode i_blocks
		 * With EXT4_HUGE_FILE_FL set the i_blocks
		 * represent total number of blocks in
		 * file system block size
		 */
		upper_limit = (1LL << 48) - 1;

	}

	/* Compute how many blocks we can address by block tree */
	res += ppb;
	res += ppb * ppb;
	res += ((loff_t)ppb) * ppb * ppb;
	/* Compute how many metadata blocks are needed */
	meta_blocks = 1;
	meta_blocks += 1 + ppb;
	meta_blocks += 1 + ppb + ppb * ppb;
	/* Does block tree limit file size? */
	if (res + meta_blocks <= upper_limit)
		goto check_lfs;

	res = upper_limit;
	/* How many metadata blocks are needed for addressing upper_limit? */
	upper_limit -= EXT4_NDIR_BLOCKS;
	/* indirect blocks */
	meta_blocks = 1;
	upper_limit -= ppb;
	/* double indirect blocks */
	if (upper_limit < ppb * ppb) {
		meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb);
		res -= meta_blocks;
		goto check_lfs;
	}
	meta_blocks += 1 + ppb;
	upper_limit -= ppb * ppb;
	/* triple indirect blocks for the rest */
	meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) +
		DIV_ROUND_UP_ULL(upper_limit, ppb*ppb);
	res -= meta_blocks;
check_lfs:
	res <<= bits;
	if (res > MAX_LFS_FILESIZE)
		res = MAX_LFS_FILESIZE;

	return res;
}

static ext4_fsblk_t descriptor_loc(struct super_block *sb,
				   ext4_fsblk_t logical_sb_block, int nr)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_group_t bg, first_meta_bg;
	int has_super = 0;

	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);

	if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
		return logical_sb_block + nr + 1;
	bg = sbi->s_desc_per_block * nr;
	if (ext4_bg_has_super(sb, bg))
		has_super = 1;

	/*
	 * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
	 * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
	 * on modern mke2fs or blksize > 1k on older mke2fs) then we must
	 * compensate.
	 */
	if (sb->s_blocksize == 1024 && nr == 0 &&
	    le32_to_cpu(sbi->s_es->s_first_data_block) == 0)
		has_super++;

	return (has_super + ext4_group_first_block_no(sb, bg));
}

/**
 * ext4_get_stripe_size: Get the stripe size.
 * @sbi: In memory super block info
 *
 * If we have specified it via mount option, then
 * use the mount option value.  If the value specified at mount time is
 * greater than the blocks per group use the super block value.
 * If the super block value is greater than blocks per group return 0.
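 * (Illustration of that precedence: mounting with -o stripe=256 on a
 * device whose superblock advertises a RAID stripe width of 128 makes
 * the allocator use 256, as long as 256 does not exceed the
 * blocks-per-group count; the fallback order is spelled out in the
 * code below.)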
* Allocator needs it be less than blocks per group. * */ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) { unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) ret = sbi->s_stripe; else if (stripe_width && stripe_width <= sbi->s_blocks_per_group) ret = stripe_width; else if (stride && stride <= sbi->s_blocks_per_group) ret = stride; else ret = 0; /* * If the stripe width is 1, this makes no sense and * we set it to 0 to turn off stripe handling code. */ if (ret <= 1) ret = 0; return ret; } /* * Check whether this filesystem can be mounted based on * the features present and the RDONLY/RDWR mount requested. * Returns 1 if this filesystem can be mounted as requested, * 0 if it cannot be. */ int ext4_feature_set_ok(struct super_block *sb, int readonly) { if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, "Couldn't mount because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & ~EXT4_FEATURE_INCOMPAT_SUPP)); return 0; } #if !IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " "mounted without CONFIG_UNICODE"); return 0; } #endif if (readonly) return 1; if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); sb->s_flags |= SB_RDONLY; return 1; } /* Check that feature set is OK for a read-write mount */ if (ext4_has_unknown_ext4_ro_compat_features(sb)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP)); return 0; } if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " "extents feature\n"); return 0; } #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) if (!readonly && (ext4_has_feature_quota(sb) || ext4_has_feature_project(sb))) { ext4_msg(sb, KERN_ERR, "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2"); return 0; } #endif /* CONFIG_QUOTA */ return 1; } /* * This function is called once a day if we have errors logged * on the file system */ static void print_daily_error_info(struct timer_list *t) { struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report); struct super_block *sb = sbi->s_sb; struct ext4_super_block *es = sbi->s_es; if (es->s_error_count) /* fsck newer than v1.41.13 is needed to clean this condition. 
*/ ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, le32_to_cpu(es->s_first_error_line)); if (es->s_first_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_first_error_ino)); if (es->s_first_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_first_error_block)); printk(KERN_CONT "\n"); } if (es->s_last_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, le32_to_cpu(es->s_last_error_line)); if (es->s_last_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_last_error_ino)); if (es->s_last_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_last_error_block)); printk(KERN_CONT "\n"); } mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } /* Find next suitable group and run ext4_init_inode_table */ static int ext4_run_li_request(struct ext4_li_request *elr) { struct ext4_group_desc *gdp = NULL; struct super_block *sb = elr->lr_super; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ext4_group_t group = elr->lr_next_group; unsigned int prefetch_ios = 0; int ret = 0; int nr = EXT4_SB(sb)->s_mb_prefetch; u64 start_time; if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr); trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr); if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; } } return ret; } for (; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { ret = 1; break; } if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } if (group >= ngroups) ret = 1; if (!ret) { start_time = ktime_get_real_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } return ret; } /* * Remove lr_request from the list_request and free the * request structure. Should be called with li_list_mtx held */ static void ext4_remove_li_request(struct ext4_li_request *elr) { if (!elr) return; list_del(&elr->lr_request); EXT4_SB(elr->lr_super)->s_li_request = NULL; kfree(elr); } static void ext4_unregister_li_request(struct super_block *sb) { mutex_lock(&ext4_li_mtx); if (!ext4_li_info) { mutex_unlock(&ext4_li_mtx); return; } mutex_lock(&ext4_li_info->li_list_mtx); ext4_remove_li_request(EXT4_SB(sb)->s_li_request); mutex_unlock(&ext4_li_info->li_list_mtx); mutex_unlock(&ext4_li_mtx); } static struct task_struct *ext4_lazyinit_task; /* * This is the function where ext4lazyinit thread lives. It walks * through the request list searching for next scheduled filesystem. * When such a fs is found, run the lazy initialization request * (ext4_rn_li_request) and keep track of the time spend in this * function. 
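 * (e.g. with s_li_wait_mult == 10, a request whose inode-table zeroing
 * took 20ms is scheduled to run again roughly 200ms later)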
Based on that time we compute next schedule time of * the request. When walking through the list is complete, compute * next waking time and put itself into sleep. */ static int ext4_lazyinit_thread(void *arg) { struct ext4_lazy_init *eli = arg; struct list_head *pos, *n; struct ext4_li_request *elr; unsigned long next_wakeup, cur; BUG_ON(NULL == eli); set_freezable(); cont_thread: while (true) { next_wakeup = MAX_JIFFY_OFFSET; mutex_lock(&eli->li_list_mtx); if (list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); goto exit_thread; } list_for_each_safe(pos, n, &eli->li_request_list) { int err = 0; int progress = 0; elr = list_entry(pos, struct ext4_li_request, lr_request); if (time_before(jiffies, elr->lr_next_sched)) { if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; continue; } if (down_read_trylock(&elr->lr_super->s_umount)) { if (sb_start_write_trylock(elr->lr_super)) { progress = 1; /* * We hold sb->s_umount, sb can not * be removed from the list, it is * now safe to drop li_list_mtx */ mutex_unlock(&eli->li_list_mtx); err = ext4_run_li_request(elr); sb_end_write(elr->lr_super); mutex_lock(&eli->li_list_mtx); n = pos->next; } up_read((&elr->lr_super->s_umount)); } /* error, remove the lazy_init job */ if (err) { ext4_remove_li_request(elr); continue; } if (!progress) { elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; } mutex_unlock(&eli->li_list_mtx); try_to_freeze(); cur = jiffies; if ((time_after_eq(cur, next_wakeup)) || (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } schedule_timeout_interruptible(next_wakeup - cur); if (kthread_should_stop()) { ext4_clear_request_list(); goto exit_thread; } } exit_thread: /* * It looks like the request list is empty, but we need * to check it under the li_list_mtx lock, to prevent any * additions into it, and of course we should lock ext4_li_mtx * to atomically free the list and ext4_li_info, because at * this point another ext4 filesystem could be registering * new one. */ mutex_lock(&ext4_li_mtx); mutex_lock(&eli->li_list_mtx); if (!list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); mutex_unlock(&ext4_li_mtx); goto cont_thread; } mutex_unlock(&eli->li_list_mtx); kfree(ext4_li_info); ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); return 0; } static void ext4_clear_request_list(void) { struct list_head *pos, *n; struct ext4_li_request *elr; mutex_lock(&ext4_li_info->li_list_mtx); list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { elr = list_entry(pos, struct ext4_li_request, lr_request); ext4_remove_li_request(elr); } mutex_unlock(&ext4_li_info->li_list_mtx); } static int ext4_run_lazyinit_thread(void) { ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); if (IS_ERR(ext4_lazyinit_task)) { int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); kfree(ext4_li_info); ext4_li_info = NULL; printk(KERN_CRIT "EXT4-fs: error %d creating inode table " "initialization thread\n", err); return err; } ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; return 0; } /* * Check whether it make sense to run itable init. thread or not. * If there is at least one uninitialized inode table, return * corresponding group number, else the loop goes through all * groups and return total number of groups. 
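 * (For example, right after an mkfs that used lazy_itable_init most
 * groups still lack EXT4_BG_INODE_ZEROED, so the index of the first
 * such group comes back; once every table has been zeroed the value
 * is simply s_groups_count.)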
*/ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) { ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *gdp = NULL; if (!ext4_has_group_desc_csum(sb)) return ngroups; for (group = 0; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) continue; if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } return group; } static int ext4_li_info_new(void) { struct ext4_lazy_init *eli = NULL; eli = kzalloc(sizeof(*eli), GFP_KERNEL); if (!eli) return -ENOMEM; INIT_LIST_HEAD(&eli->li_request_list); mutex_init(&eli->li_list_mtx); eli->li_state |= EXT4_LAZYINIT_QUIT; ext4_li_info = eli; return 0; } static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, ext4_group_t start) { struct ext4_li_request *elr; elr = kzalloc(sizeof(*elr), GFP_KERNEL); if (!elr) return NULL; elr->lr_super = sb; elr->lr_first_not_zeroed = start; if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { elr->lr_mode = EXT4_LI_MODE_ITABLE; elr->lr_next_group = start; } else { elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; } /* * Randomize first schedule time of the request to * spread the inode table initialization requests * better. */ elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr = NULL; ext4_group_t ngroups = sbi->s_groups_count; int ret = 0; mutex_lock(&ext4_li_mtx); if (sbi->s_li_request != NULL) { /* * Reset timeout so it can be computed again, because * s_li_wait_mult might have changed. */ sbi->s_li_request->lr_timeout = 0; goto out; } if (sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); if (!elr) { ret = -ENOMEM; goto out; } if (NULL == ext4_li_info) { ret = ext4_li_info_new(); if (ret) goto out; } mutex_lock(&ext4_li_info->li_list_mtx); list_add(&elr->lr_request, &ext4_li_info->li_request_list); mutex_unlock(&ext4_li_info->li_list_mtx); sbi->s_li_request = elr; /* * set elr to NULL here since it has been inserted to * the request_list and the removal and free of it is * handled by ext4_clear_request_list from now on. */ elr = NULL; if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { ret = ext4_run_lazyinit_thread(); if (ret) goto out; } out: mutex_unlock(&ext4_li_mtx); if (ret) kfree(elr); return ret; } /* * We do not need to lock anything since this is called on * module unload. */ static void ext4_destroy_lazyinit_thread(void) { /* * If thread exited earlier * there's nothing to be done. 
*/ if (!ext4_li_info || !ext4_lazyinit_task) return; kthread_stop(ext4_lazyinit_task); } static int set_journal_csum_feature_set(struct super_block *sb) { int ret = 1; int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; } else { /* journal checksum v1 */ compat = JBD2_FEATURE_COMPAT_CHECKSUM; incompat = 0; } jbd2_journal_clear_features(sbi->s_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_CSUM_V3 | JBD2_FEATURE_INCOMPAT_CSUM_V2); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | incompat); } else if (test_opt(sb, JOURNAL_CHECKSUM)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, incompat); jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } else { jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } return ret; } /* * Note: calculating the overhead so we can be compatible with * historical BSD practice is quite difficult in the face of * clusters/bigalloc. This is because multiple metadata blocks from * different block group can end up in the same allocation cluster. * Calculating the exact overhead in the face of clustered allocation * requires either O(all block bitmaps) in memory or O(number of block * groups**2) in time. We will still calculate the superblock for * older file systems --- and if we come across with a bigalloc file * system with zero in s_overhead_clusters the estimate will be close to * correct especially for very large cluster sizes --- but for newer * file systems, it's better to calculate this figure once at mkfs * time, and store it in the superblock. If the superblock value is * present (even for non-bigalloc file systems), we will use it. */ static int count_overhead(struct super_block *sb, ext4_group_t grp, char *buf) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp; ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) return (has_super + ext4_bg_num_gdb(sb, grp) + (has_super ? 
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + (grp * EXT4_BLOCKS_PER_GROUP(sb)); last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); b = ext4_block_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_table(sb, gdp); if (b >= first_block && b + sbi->s_itb_per_group <= last_block) for (j = 0; j < sbi->s_itb_per_group; j++, b++) { int c = EXT4_B2C(sbi, b - first_block); ext4_set_bit(c, buf); count++; } if (i != grp) continue; s = 0; if (ext4_bg_has_super(sb, grp)) { ext4_set_bit(s++, buf); count++; } j = ext4_bg_num_gdb(sb, grp); if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { ext4_error(sb, "Invalid number of block group " "descriptor blocks: %d", j); j = EXT4_BLOCKS_PER_GROUP(sb) - s; } count += j; for (; j > 0; j--) ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; return EXT4_CLUSTERS_PER_GROUP(sb) - ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); } /* * Compute the overhead and stash it in sbi->s_overhead */ int ext4_calculate_overhead(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct inode *j_inode; unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_NOFS); if (!buf) return -ENOMEM; /* * Compute the overhead (FS structures). This is constant * for a given filesystem unless the number of block groups * changes so we cache the previous value until it does. */ /* * All of the blocks before first_data_block are overhead */ overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); /* * Add the overhead found in each block group */ for (i = 0; i < ngroups; i++) { int blks; blks = count_overhead(sb, i, buf); overhead += blks; if (blks) memset(buf, 0, PAGE_SIZE); cond_resched(); } /* * Add the internal journal blocks whether the journal has been * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev_handle) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); } else { ext4_msg(sb, KERN_ERR, "can't get journal size"); } } sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); return 0; } static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * There's no need to reserve anything when we aren't using extents. * The space estimates are exact, there are no unwritten extents, * hole punching doesn't need new metadata... This is needed especially * to keep ext2/3 backward compatibility. */ if (!ext4_has_feature_extents(sb)) return; /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting * unwritten extents in delalloc path. 
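 * (Worked example: a 1TiB filesystem with 4KiB blocks and no bigalloc
 * has roughly 268 million clusters, so the 2% figure is far above the
 * cap and the reserve settles at 4096 clusters, i.e. 16MiB.)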
In most cases such * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ resv_clusters = (ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits); do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); atomic64_set(&sbi->s_resv_clusters, resv_clusters); } static const char *ext4_quota_mode(struct super_block *sb) { #ifdef CONFIG_QUOTA if (!ext4_quota_capable(sb)) return "none"; if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) return "journalled"; else return "writeback"; #else return "disabled"; #endif } static void ext4_setup_csum_trigger(struct super_block *sb, enum ext4_journal_trigger_type type, void (*trigger)( struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size)) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_journal_triggers[type].sb = sb; sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; } static void ext4_free_sbi(struct ext4_sb_info *sbi) { if (!sbi) return; kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) { struct ext4_sb_info *sbi; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return NULL; sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, NULL, NULL); sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) goto err_out; sb->s_fs_info = sbi; sbi->s_sb = sb; return sbi; err_out: fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } static void ext4_set_def_opts(struct super_block *sb, struct ext4_super_block *es) { unsigned long def_mount_opts; /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) set_opt(sb, GRPID); if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. 
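 * (the journal side then selects checksum v3 rather than v1 in
 * set_journal_csum_feature_set() when metadata_csum is present)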
*/ if (ext4_has_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); /* block_validity enabled by default; disable with noblock_validity */ set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) set_opt(sb, BARRIER); /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); if (sb->s_blocksize <= PAGE_SIZE) set_opt(sb, DIOREAD_NOLOCK); } static int ext4_handle_clustersize(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int clustersize; /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); if (ext4_has_feature_bigalloc(sb)) { if (clustersize < sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "cluster size (%d) smaller than " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", sbi->s_clusters_per_group); return -EINVAL; } if (sbi->s_blocks_per_group != (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " "clusters per group (%lu) inconsistent", sbi->s_blocks_per_group, sbi->s_clusters_per_group); return -EINVAL; } } else { if (clustersize != sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "fragment/cluster size (%d) != " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#blocks per group too big: %lu", sbi->s_blocks_per_group); return -EINVAL; } sbi->s_clusters_per_group = sbi->s_blocks_per_group; sbi->s_cluster_bits = 0; } sbi->s_cluster_ratio = clustersize / sb->s_blocksize; /* Do we have standard group size of clustersize * 8 blocks ? 
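 * (e.g. a plain 4KiB-block filesystem without bigalloc has a 4096-byte
 * cluster and 32768 blocks per group, which equals clustersize << 3
 * and therefore enables STD_GROUP_SIZE)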
*/ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); return 0; } static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Initialize fast commit stuff */ atomic_set(&sbi->s_fc_subtid, 0); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; sbi->s_fc_replay_state.fc_regions_used = 0; sbi->s_fc_replay_state.fc_regions_valid = 0; sbi->s_fc_replay_state.fc_modified_inodes = NULL; sbi->s_fc_replay_state.fc_modified_inodes_size = 0; sbi->s_fc_replay_state.fc_modified_inodes_used = 0; } static int ext4_inode_info_init(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; } else { sbi->s_inode_size = le16_to_cpu(es->s_inode_size); sbi->s_first_ino = le32_to_cpu(es->s_first_ino); if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { ext4_msg(sb, KERN_ERR, "invalid first ino: %u", sbi->s_first_ino); return -EINVAL; } if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > sb->s_blocksize)) { ext4_msg(sb, KERN_ERR, "unsupported inode size: %d", sbi->s_inode_size); ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize); return -EINVAL; } /* * i_atime_extra is the last extra field available for * [acm]times in struct ext4_inode. Checking for that * field should suffice to ensure we have extra space * for all three. */ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { sb->s_time_gran = 1; sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; } else { sb->s_time_gran = NSEC_PER_SEC; sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; } sb->s_time_min = EXT4_TIMESTAMP_MIN; } if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; if (ext4_has_feature_extra_isize(sb)) { unsigned v, max = (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE); v = le16_to_cpu(es->s_want_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_want_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; v = le16_to_cpu(es->s_min_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_min_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; } } return 0; } #if IS_ENABLED(CONFIG_UNICODE) static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); if (!ext4_has_feature_casefold(sb) || sb->s_encoding) return 0; encoding_info = ext4_sb_read_encoding(es); if (!encoding_info) { ext4_msg(sb, KERN_ERR, "Encoding requested by superblock is unknown"); return -EINVAL; } encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { ext4_msg(sb, KERN_ERR, "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. 
flags: 0x%x.", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); return -EINVAL; } ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); sb->s_encoding = encoding; sb->s_encoding_flags = encoding_flags; return 0; } #else static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { return 0; } #endif static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Warn if metadata_csum and gdt_csum are both set. */ if (ext4_has_feature_metadata_csum(sb) && ext4_has_feature_gdt_csum(sb)) ext4_warning(sb, "metadata_csum and uninit_bg are " "redundant flags; please run fsck."); /* Check for a known checksum algorithm */ if (!ext4_verify_csum_type(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "unknown checksum algorithm."); return -EINVAL; } ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, ext4_orphan_file_block_trigger); /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { int ret = PTR_ERR(sbi->s_chksum_driver); ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); sbi->s_chksum_driver = NULL; return ret; } /* Check superblock checksum */ if (!ext4_superblock_csum_verify(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "invalid superblock checksum. Run e2fsck?"); return -EFSBADCRC; } /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } static int ext4_check_feature_compatibility(struct super_block *sb, struct ext4_super_block *es, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || ext4_has_incompat_features(sb))) ext4_msg(sb, KERN_WARNING, "feature flags set on rev 0 fs, " "running e2fsck is recommended"); if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { set_opt2(sb, HURD_COMPAT); if (ext4_has_feature_64bit(sb)) { ext4_msg(sb, KERN_ERR, "The Hurd can't support 64-bit file systems"); return -EINVAL; } /* * ea_inode feature uses l_i_version field which is not * available in HURD_COMPAT mode. */ if (ext4_has_feature_ea_inode(sb)) { ext4_msg(sb, KERN_ERR, "ea_inode feature is not supported for Hurd"); return -EINVAL; } } if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext[34] filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " "to feature incompatibilities"); return -EINVAL; } } if (IS_EXT3_SB(sb)) { if (ext3_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext3 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext4 filesystem. 
*/ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " "to feature incompatibilities"); return -EINVAL; } } /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. */ if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) return -EINVAL; if (sbi->s_daxdev) { if (sb->s_blocksize == PAGE_SIZE) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); else ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); } if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); return -EINVAL; } if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device."); return -EINVAL; } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", es->s_encryption_level); return -EINVAL; } return 0; } static int ext4_check_geometry(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u64 blocks_count; int err; if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d", le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); return -EINVAL; } /* * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ err = generic_check_addressable(sb->s_blocksize_bits, ext4_blocks_count(es)); if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); return err; } /* check blocks count against device size */ blocks_count = sb_bdev_nr_blocks(sb); if (blocks_count && ext4_blocks_count(es) > blocks_count) { ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " "exceeds size of device (%llu blocks)", ext4_blocks_count(es), blocks_count); return -EINVAL; } /* * It makes no sense for the first data block to be beyond the end * of the filesystem. 
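 * (For reference, s_first_data_block is normally 1 on 1KiB-block
 * filesystems and 0 otherwise, so a value at or past the block count
 * can only come from a corrupted or crafted superblock.)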
*/ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); return -EINVAL; } if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && (sbi->s_cluster_ratio == 1)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block is 0 with a 1k block and cluster size"); return -EINVAL; } blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " "(block count %llu, first data block %u, " "blocks per group %lu)", blocks_count, ext4_blocks_count(es), le32_to_cpu(es->s_first_data_block), EXT4_BLOCKS_PER_GROUP(sb)); return -EINVAL; } sbi->s_groups_count = blocks_count; sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != le32_to_cpu(es->s_inodes_count)) { ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", le32_to_cpu(es->s_inodes_count), ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); return -EINVAL; } return 0; } static int ext4_group_desc_init(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t logical_sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int db_count; ext4_fsblk_t block; int i; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { if (le32_to_cpu(es->s_first_meta_bg) > db_count) { ext4_msg(sb, KERN_WARNING, "first meta block group too large: %u " "(group descriptor block count %u)", le32_to_cpu(es->s_first_meta_bg), db_count); return -EINVAL; } } rcu_assign_pointer(sbi->s_group_desc, kvmalloc_array(db_count, sizeof(struct buffer_head *), GFP_KERNEL)); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); return -ENOMEM; } bgl_lock_init(sbi->s_blockgroup_lock); /* Pre-read the descriptors into the buffer cache */ for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); ext4_sb_breadahead_unmovable(sb, block); } for (i = 0; i < db_count; i++) { struct buffer_head *bh; block = descriptor_loc(sb, logical_sb_block, i); bh = ext4_sb_bread_unmovable(sb, block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); sbi->s_gdb_count = i; return PTR_ERR(bh); } rcu_read_lock(); rcu_dereference(sbi->s_group_desc)[i] = bh; rcu_read_unlock(); } sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); return -EFSCORRUPTED; } return 0; } static int ext4_load_and_init_journal(struct super_block *sb, struct ext4_super_block *es, struct ext4_fs_context *ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err; err = ext4_load_journal(sb, es, ctx->journal_devnum); if (err) return err; if (ext4_has_feature_64bit(sb) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); goto out; } if (!set_journal_csum_feature_set(sb)) { ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " "feature set"); goto out; } if (test_opt2(sb, JOURNAL_FAST_COMMIT) && 
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { ext4_msg(sb, KERN_ERR, "Failed to set fast commit journal feature"); goto out; } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { case 0: /* No mode set, assume a default based on the journal * capabilities: ORDERED_DATA if the journal can * cope, else JOURNAL_DATA */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { set_opt(sb, ORDERED_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; } else { set_opt(sb, JOURNAL_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; } break; case EXT4_MOUNT_ORDERED_DATA: case EXT4_MOUNT_WRITEBACK_DATA: if (!jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { ext4_msg(sb, KERN_ERR, "Journal does not support " "requested data journaling mode"); goto out; } break; default: break; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); goto out; } set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; sbi->s_journal->j_finish_inode_data_buffers = ext4_journal_finish_inode_data_buffers; return 0; out: /* flush s_sb_upd_work before destroying the journal. */ flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; return -EINVAL; } static int ext4_check_journal_data_mode(struct super_block *sb) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " "data=journal disables delayed allocation, " "dioread_nolock, O_DIRECT and fast_commit support!\n"); /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); return -EINVAL; } if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ext4_has_feature_encrypt(sb)) { ext4_msg(sb, KERN_WARNING, "encrypted files will use data=ordered " "instead of data journaling mode"); } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { sb->s_iflags |= SB_I_CGROUPWB; } return 0; } static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es; ext4_fsblk_t logical_sb_block; unsigned long offset = 0; struct buffer_head *bh; int ret = -EINVAL; int blocksize; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { ext4_msg(sb, KERN_ERR, "unable to set blocksize"); return -EINVAL; } /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. 
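 * (Example with the default sb_block of 1: the superblock sits at byte
 * offset 1024, so if sb_min_blocksize() selected a 4KiB block size the
 * read below fetches block 0 and the superblock starts 1024 bytes into
 * that buffer.)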
*/ if (blocksize != EXT4_MIN_BLOCK_SIZE) { logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); } else { logical_sb_block = sbi->s_sb_block; } bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); return PTR_ERR(bh); } /* * Note: s_es must be initialized as soon as possible because * some ext4 macro-instructions depend on its value */ es = (struct ext4_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto out; } if (le32_to_cpu(es->s_log_block_size) > (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log block size: %u", le32_to_cpu(es->s_log_block_size)); goto out; } if (le32_to_cpu(es->s_log_cluster_size) > (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log cluster size: %u", le32_to_cpu(es->s_log_cluster_size)); goto out; } blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); /* * If the default block size is not the same as the real block size, * we need to reload it. */ if (sb->s_blocksize == blocksize) { *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; } /* * bh must be released before kill_bdev(), otherwise * it won't be freed and its page also. kill_bdev() * is called by sb_set_blocksize(). */ brelse(bh); /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); bh = NULL; goto out; } logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); ret = PTR_ERR(bh); bh = NULL; goto out; } es = (struct ext4_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); goto out; } *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; out: brelse(bh); return ret; } static void ext4_hash_info_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; unsigned int i; for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) sbi->s_hash_unsigned = 3; else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); sbi->s_hash_unsigned = 3; #else if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif } } } static int ext4_block_group_meta_init(struct super_block *sb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int has_huge_files; has_huge_files = ext4_has_feature_huge_file(sb); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || !is_power_of_2(sbi->s_desc_size)) { ext4_msg(sb, KERN_ERR, "unsupported descriptor size %lu", 
sbi->s_desc_size); return -EINVAL; } } else sbi->s_desc_size = EXT4_MIN_DESC_SIZE; sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); return -EINVAL; } if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", sbi->s_inodes_per_group); return -EINVAL; } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); return 0; } static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t logical_sb_block; struct inode *root; int needs_recovery; int err; ext4_group_t first_not_zeroed; struct ext4_fs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); err = ext4_load_super(sb, &logical_sb_block, silent); if (err) goto out_fail; es = sbi->s_es; sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); err = ext4_init_metadata_csum(sb, es); if (err) goto failed_mount; ext4_set_def_opts(sb, es); sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; /* * set default s_li_wait_mult for lazyinit, for the case there is * no mount option specified. */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; err = ext4_inode_info_init(sb, es); if (err) goto failed_mount; err = parse_apply_sb_mount_options(sb, ctx); if (err < 0) goto failed_mount; sbi->s_def_mount_opt = sbi->s_mount_opt; sbi->s_def_mount_opt2 = sbi->s_mount_opt2; err = ext4_check_opt_consistency(fc, sb); if (err < 0) goto failed_mount; ext4_apply_options(fc, sb); err = ext4_encoding_init(sb, es); if (err) goto failed_mount; err = ext4_check_journal_data_mode(sb); if (err) goto failed_mount; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? 
SB_POSIXACL : 0); /* i_version is always enabled now */ sb->s_flags |= SB_I_VERSION; err = ext4_check_feature_compatibility(sb, es, silent); if (err) goto failed_mount; err = ext4_block_group_meta_init(sb, silent); if (err) goto failed_mount; ext4_hash_info_init(sb); err = ext4_handle_clustersize(sb); if (err) goto failed_mount; err = ext4_check_geometry(sb, es); if (err) goto failed_mount; timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) goto failed_mount3; err = ext4_es_register_shrinker(sbi); if (err) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); /* * It's hard to get stripe-aligned blocks if the stripe is not aligned with * the cluster size, so just disable the stripe and alert the user, to * simplify the code and avoid stripe-aligned allocations that will rarely * succeed. */ if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 && sbi->s_stripe % sbi->s_cluster_ratio != 0) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", sbi->s_stripe, sbi->s_cluster_ratio); sbi->s_stripe = 0; } sbi->s_extent_max_zeroout_kb = 32; /* * set up enough so that it can read an inode */ sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; #endif #ifdef CONFIG_FS_VERITY sb->s_vop = &ext4_verityops; #endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); ext4_fast_commit_init(sb); sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto failed_mount3a; } err = -EINVAL; /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! 
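* Loading and recovering the journal first makes sure any committed
* metadata updates are replayed before the root inode is read below.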
*/ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); goto failed_mount3a; } else { /* Nojournal mode, all journal mount options are illegal */ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit, fs mounted w/o journal"); goto failed_mount3a; } if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_checksum, fs mounted w/o journal"); goto failed_mount3a; } if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { ext4_msg(sb, KERN_ERR, "can't mount with " "commit=%lu, fs mounted w/o journal", sbi->s_commit_interval / HZ); goto failed_mount3a; } if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { ext4_msg(sb, KERN_ERR, "can't mount with " "data=, fs mounted w/o journal"); goto failed_mount3a; } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); clear_opt2(sb, JOURNAL_FAST_COMMIT); sbi->s_journal = NULL; needs_recovery = 0; } if (!test_opt(sb, NO_MBCACHE)) { sbi->s_ea_block_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_block_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); err = -EINVAL; goto failed_mount_wq; } if (ext4_has_feature_ea_inode(sb)) { sbi->s_ea_inode_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_inode_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_inode_cache"); err = -EINVAL; goto failed_mount_wq; } } } /* * Get the # of file system overhead blocks from the * superblock if present. */ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); /* ignore the precalculated value if it is ridiculous */ if (sbi->s_overhead > ext4_blocks_count(es)) sbi->s_overhead = 0; /* * If the bigalloc feature is not enabled recalculating the * overhead doesn't take long, so we might as well just redo * it to make sure we are using the correct value. */ if (!ext4_has_feature_bigalloc(sb)) sbi->s_overhead = 0; if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; } /* * The maximum number of concurrent works can be high and * concurrency isn't really necessary. Limit it to 1. */ EXT4_SB(sb)->rsv_conversion_wq = alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->rsv_conversion_wq) { printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); err = -ENOMEM; goto failed_mount4; } /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. 
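* What follows reads the root inode, creates the root dentry and brings up
* the remaining subsystems (system zone, mballoc, flex_bg, lazy-init
* thread, sysfs, orphan handling and quotas).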
*/ root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); err = PTR_ERR(root); root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); iput(root); err = -EFSCORRUPTED; goto failed_mount4; } sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); err = -ENOMEM; goto failed_mount4; } err = ext4_setup_super(sb, es, sb_rdonly(sb)); if (err == -EROFS) { sb->s_flags |= SB_RDONLY; } else if (err) goto failed_mount4a; ext4_set_resv_clusters(sb); if (test_opt(sb, BLOCK_VALIDITY)) { err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); goto failed_mount4a; } } ext4_fc_replay_cleanup(sb); ext4_ext_init(sb); /* * Enable optimize_scan if number of groups is > threshold. This can be * turned off by passing "mb_optimize_scan=0". This can also be * turned on forcefully by passing "mb_optimize_scan=1". */ if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) { if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) set_opt2(sb, MB_OPTIMIZE_SCAN); else clear_opt2(sb, MB_OPTIMIZE_SCAN); } err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); goto failed_mount5; } /* * We can only set up the journal commit callback once * mballoc is initialized */ if (sbi->s_journal) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; err = ext4_percpu_param_init(sbi); if (err) goto failed_mount6; if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); err = -ENOMEM; goto failed_mount6; } err = ext4_register_li_request(sb, first_not_zeroed); if (err) goto failed_mount6; err = ext4_register_sysfs(sb); if (err) goto failed_mount7; err = ext4_init_orphan_info(sb); if (err) goto failed_mount8; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) goto failed_mount9; } #endif /* CONFIG_QUOTA */ /* * Save the original bdev mapping's wb_err value which could be * used to detect the metadata async write error. */ spin_lock_init(&sbi->s_bdev_wb_lock); errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, &sbi->s_bdev_wb_err); EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; /* * Update the checksum after updating free space/inode counters and * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect * checksum in the buffer cache until it is written out and * e2fsprogs programs trying to open a file system immediately * after it is mounted can fail. */ ext4_superblock_csum_set(sb); if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) goto failed_mount10; } if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) ext4_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ /* Enable message ratelimiting. Default is 10 messages per 5 secs. 
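* (The three ratelimit_state_init() calls below use an interval of
* 5 * HZ jiffies and a burst of 10 for errors, warnings and messages.)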
*/ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); atomic_set(&sbi->s_warning_count, 0); atomic_set(&sbi->s_msg_count, 0); return 0; failed_mount10: ext4_quotas_off(sb, EXT4_MAXQUOTAS); failed_mount9: __maybe_unused ext4_release_orphan_info(sb); failed_mount8: ext4_unregister_sysfs(sb); kobject_put(&sbi->s_kobj); failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); ext4_flex_groups_free(sbi); ext4_percpu_param_destroy(sbi); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); failed_mount4a: dput(sb->s_root); sb->s_root = NULL; failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { /* flush s_sb_upd_work before journal destroy. */ flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: /* flush s_sb_upd_work before sbi destroy */ flush_work(&sbi->s_sb_upd_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); ext4_group_desc_free(sbi); failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_QUOTA for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); if (sbi->s_journal_bdev_handle) { invalidate_bdev(sbi->s_journal_bdev_handle->bdev); bdev_release(sbi->s_journal_bdev_handle); } out_fail: invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; return err; } static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi; const char *descr; int ret; sbi = ext4_alloc_sbi(sb); if (!sbi) return -ENOMEM; fc->s_fs_info = sbi; /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); sbi->s_sb_block = 1; /* Default super block location */ if (ctx->spec & EXT4_SPEC_s_sb_block) sbi->s_sb_block = ctx->s_sb_block; ret = __ext4_fill_super(fc, sb); if (ret < 0) goto free_sbi; if (sbi->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) descr = " journalled data mode"; else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) descr = " ordered data mode"; else descr = " writeback data mode"; } else descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. " "Quota mode: %s.", &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", descr, ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ ext4_update_overhead(sb, false); return 0; free_sbi: ext4_free_sbi(sbi); fc->s_fs_info = NULL; return ret; } static int ext4_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, ext4_fill_super); } /* * Setup any per-fs journal parameters now. We'll do this both on * initial mount, once the journal has been initialised but before we've * done any recovery; and again on any subsequent remount. 
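* The values copied into the journal are the commit interval and the
* min/max batch times, plus the barrier, data-error-abort and cycle-record
* flags; fast-commit state is also initialised via ext4_fc_init().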
*/ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); journal->j_commit_interval = sbi->s_commit_interval; journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; ext4_fc_init(sb, journal); write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; if (test_opt(sb, DATA_ERR_ABORT)) journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; else journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; /* * Always enable journal cycle record option, letting the journal * records log transactions continuously between each mount. */ journal->j_flags |= JBD2_CYCLE_RECORD; write_unlock(&journal->j_state_lock); } static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; /* * Test for the existence of a valid inode on disk. Bad things * happen if we iget() an unused inode, as the subsequent iput() * will try to delete it. */ journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); return ERR_CAST(journal_inode); } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); ext4_msg(sb, KERN_ERR, "journal inode is deleted"); return ERR_PTR(-EFSCORRUPTED); } if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return ERR_PTR(-EFSCORRUPTED); } ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); return journal_inode; } static int ext4_journal_bmap(journal_t *journal, sector_t *block) { struct ext4_map_blocks map; int ret; if (journal->j_inode == NULL) return 0; map.m_lblk = *block; map.m_len = 1; ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0); if (ret <= 0) { ext4_msg(journal->j_inode->i_sb, KERN_CRIT, "journal bmap failed: block %llu ret %d\n", *block, ret); jbd2_journal_abort(journal, ret ? 
ret : -EIO); return ret; } *block = map.m_pblk; return 0; } static journal_t *ext4_open_inode_journal(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; journal_inode = ext4_get_journal_inode(sb, journal_inum); if (IS_ERR(journal_inode)) return ERR_CAST(journal_inode); journal = jbd2_journal_init_inode(journal_inode); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); return ERR_CAST(journal); } journal->j_private = sb; journal->j_bmap = ext4_journal_bmap; ext4_init_journal_params(sb, journal); return journal; } static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, dev_t j_dev, ext4_fsblk_t *j_start, ext4_fsblk_t *j_len) { struct buffer_head *bh; struct block_device *bdev; struct bdev_handle *bdev_handle; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; int errno; bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, sb, &fs_holder_ops); if (IS_ERR(bdev_handle)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle)); return bdev_handle; } bdev = bdev_handle->bdev; blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { ext4_msg(sb, KERN_ERR, "blocksize too small for journal device"); errno = -EINVAL; goto out_bdev; } sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); bh = __bread(bdev, sb_block, blocksize); if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); errno = -EINVAL; goto out_bdev; } es = (struct ext4_super_block *) (bh->b_data + offset); if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); errno = -EFSCORRUPTED; goto out_bh; } if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(sb, es)) { ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); errno = -EFSCORRUPTED; goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); errno = -EFSCORRUPTED; goto out_bh; } *j_start = sb_block + 1; *j_len = ext4_blocks_count(es); brelse(bh); return bdev_handle; out_bh: brelse(bh); out_bdev: bdev_release(bdev_handle); return ERR_PTR(errno); } static journal_t *ext4_open_dev_journal(struct super_block *sb, dev_t j_dev) { journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; struct bdev_handle *bdev_handle; int errno = 0; bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); if (IS_ERR(bdev_handle)) return ERR_CAST(bdev_handle); journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); errno = PTR_ERR(journal); goto out_bdev; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); errno = -EINVAL; goto out_journal; } journal->j_private = sb; EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: 
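/* drop the device reference obtained from ext4_get_journal_blkdev() */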
bdev_release(bdev_handle); return ERR_PTR(errno); } static int ext4_load_journal(struct super_block *sb, struct ext4_super_block *es, unsigned long journal_devnum) { journal_t *journal; unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); dev_t journal_dev; int err = 0; int really_read_only; int journal_dev_ro; if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) return -EFSCORRUPTED; if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { ext4_msg(sb, KERN_INFO, "external journal device major/minor " "numbers have changed"); journal_dev = new_decode_dev(journal_devnum); } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); if (journal_inum && journal_dev) { ext4_msg(sb, KERN_ERR, "filesystem has both journal inode and journal device!"); return -EINVAL; } if (journal_inum) { journal = ext4_open_inode_journal(sb, journal_inum); if (IS_ERR(journal)) return PTR_ERR(journal); } else { journal = ext4_open_dev_journal(sb, journal_dev); if (IS_ERR(journal)) return PTR_ERR(journal); } journal_dev_ro = bdev_read_only(journal->j_dev); really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro; if (journal_dev_ro && !sb_rdonly(sb)) { ext4_msg(sb, KERN_ERR, "journal device read-only, try mounting with '-o ro'"); err = -EROFS; goto err_out; } /* * Are we loading a blank journal or performing recovery after a * crash? For recovery, we need to check in advance whether we * can get read-write access to the device. */ if (ext4_has_feature_journal_needs_recovery(sb)) { if (sb_rdonly(sb)) { ext4_msg(sb, KERN_INFO, "INFO: recovery " "required on readonly filesystem"); if (really_read_only) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, cannot proceed " "(try mounting with noload)"); err = -EROFS; goto err_out; } ext4_msg(sb, KERN_INFO, "write access will " "be enabled during recovery"); } } if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); if (!ext4_has_feature_journal_needs_recovery(sb)) err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); __le16 orig_state; bool changed = false; if (save) memcpy(save, ((char *) es) + EXT4_S_ERR_START, EXT4_S_ERR_LEN); err = jbd2_journal_load(journal); if (save && memcmp(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN)) { memcpy(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN); changed = true; } kfree(save); orig_state = es->s_state; es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS); if (orig_state != es->s_state) changed = true; /* Write out restored error information to the superblock */ if (changed && !really_read_only) { int err2; err2 = ext4_commit_super(sb); err = err ? 
: err2; } } if (err) { ext4_msg(sb, KERN_ERR, "error loading journal"); goto err_out; } EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { EXT4_SB(sb)->s_journal = NULL; jbd2_journal_destroy(journal); return err; } if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); ext4_commit_super(sb); } if (!really_read_only && journal_inum && journal_inum != le32_to_cpu(es->s_journal_inum)) { es->s_journal_inum = cpu_to_le32(journal_inum); ext4_commit_super(sb); } return 0; err_out: jbd2_journal_destroy(journal); return err; } /* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */ static void ext4_update_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *sbh = sbi->s_sbh; lock_buffer(sbh); /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock * write time when we are mounting the root file system * read/only but we need to replay the journal; at that point, * for people who are east of GMT and who make their clock * tick in localtime for Windows bug-for-bug compatibility, * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ if (!sb_rdonly(sb)) ext4_update_tstamp(es, s_wtime); es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1)); if (percpu_counter_initialized(&sbi->s_freeclusters_counter)) ext4_free_blocks_count_set(es, EXT4_C2B(sbi, percpu_counter_sum_positive( &sbi->s_freeclusters_counter))); if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &sbi->s_freeinodes_counter)); /* Copy error information to the on-disk superblock */ spin_lock(&sbi->s_error_lock); if (sbi->s_add_error_count > 0) { es->s_state |= cpu_to_le16(EXT4_ERROR_FS); if (!es->s_first_error_time && !es->s_first_error_time_hi) { __ext4_update_tstamp(&es->s_first_error_time, &es->s_first_error_time_hi, sbi->s_first_error_time); strncpy(es->s_first_error_func, sbi->s_first_error_func, sizeof(es->s_first_error_func)); es->s_first_error_line = cpu_to_le32(sbi->s_first_error_line); es->s_first_error_ino = cpu_to_le32(sbi->s_first_error_ino); es->s_first_error_block = cpu_to_le64(sbi->s_first_error_block); es->s_first_error_errcode = ext4_errno_to_code(sbi->s_first_error_code); } __ext4_update_tstamp(&es->s_last_error_time, &es->s_last_error_time_hi, sbi->s_last_error_time); strncpy(es->s_last_error_func, sbi->s_last_error_func, sizeof(es->s_last_error_func)); es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); es->s_last_error_errcode = ext4_errno_to_code(sbi->s_last_error_code); /* * Start the daily error reporting function if it hasn't been * started already */ if (!es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); sbi->s_add_error_count = 0; } spin_unlock(&sbi->s_error_lock); ext4_superblock_csum_set(sb); unlock_buffer(sbh); } static int ext4_commit_super(struct super_block *sb) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; if (!sbh) return -EINVAL; if (block_device_ejected(sb)) return -ENODEV; ext4_update_super(sb); 
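/* Write the refreshed superblock buffer out synchronously; REQ_FUA is added when barriers are enabled. */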
lock_buffer(sbh); /* Buffer got discarded which means block device got invalidated */ if (!buffer_mapped(sbh)) { unlock_buffer(sbh); return -EIO; } if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the * superblock failed. This could happen because the * USB device was yanked out. Or it could happen to * be a transient write error and maybe the block will * be remapped. Nothing we can do but to retry the * write and hope for the best. */ ext4_msg(sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } get_bh(sbh); /* Clear potential dirty bit if it was journalled update */ clear_buffer_dirty(sbh); sbh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); wait_on_buffer(sbh); if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); return -EIO; } return 0; } /* * Have we just finished recovery? If so, and if we are mounting (or * remounting) the filesystem readonly, then we will end up with a * consistent fs on disk. Record that fact. */ static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es) { int err; journal_t *journal = EXT4_SB(sb)->s_journal; if (!ext4_has_feature_journal(sb)) { if (journal != NULL) { ext4_error(sb, "Journal got removed while the fs was " "mounted!"); return -EFSCORRUPTED; } return 0; } jbd2_journal_lock_updates(journal); err = jbd2_journal_flush(journal, 0); if (err < 0) goto out; if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) || ext4_has_feature_orphan_present(sb))) { if (!ext4_orphan_file_empty(sb)) { ext4_error(sb, "Orphan file not empty on read-only fs."); err = -EFSCORRUPTED; goto out; } ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); ext4_commit_super(sb); } out: jbd2_journal_unlock_updates(journal); return err; } /* * If we are mounting (or read-write remounting) a filesystem whose journal * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es) { journal_t *journal; int j_errno; const char *errstr; if (!ext4_has_feature_journal(sb)) { ext4_error(sb, "Journal got removed while the fs was mounted!"); return -EFSCORRUPTED; } journal = EXT4_SB(sb)->s_journal; /* * Now check for any error status which may have been recorded in the * journal by a prior ext4_error() or ext4_abort() */ j_errno = jbd2_journal_errno(journal); if (j_errno) { char nbuf[16]; errstr = ext4_decode_error(sb, j_errno, nbuf); ext4_warning(sb, "Filesystem error recorded " "from previous mount: %s", errstr); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); j_errno = ext4_commit_super(sb); if (j_errno) return j_errno; ext4_warning(sb, "Marked fs in need of filesystem check."); jbd2_journal_clear_err(journal); jbd2_journal_update_sb_errno(journal); } return 0; } /* * Force the running and committing transactions to commit, * and wait on the commit. 
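* This simply forwards to ext4_journal_force_commit() on the filesystem's
* journal.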
*/ int ext4_force_commit(struct super_block *sb) { return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); } static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); if (unlikely(ext4_forced_shutdown(sb))) return 0; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots */ dquot_writeback_dquots(sb, -1); /* * Data writeback is possible w/o journal transaction, so a barrier must * be sent at the end of the function. But we can skip it if * transaction_commit will do it for us. */ if (sbi->s_journal) { target = jbd2_get_latest_transaction(sbi->s_journal); if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) needs_barrier = true; if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) ret = jbd2_log_wait_commit(sbi->s_journal, target); } } else if (wait && test_opt(sb, BARRIER)) needs_barrier = true; if (needs_barrier) { int err; err = blkdev_issue_flush(sb->s_bdev); if (!ret) ret = err; } return ret; } /* * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. * * Note that this function alone cannot bring the filesystem to a clean * state; it relies on the upper layer to stop all data & metadata * modifications. */ static int ext4_freeze(struct super_block *sb) { int error = 0; journal_t *journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ jbd2_journal_lock_updates(journal); /* * Don't clear the needs_recovery flag if we failed to * flush the journal. */ error = jbd2_journal_flush(journal, 0); if (error < 0) goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ ext4_clear_feature_journal_needs_recovery(sb); if (ext4_orphan_file_empty(sb)) ext4_clear_feature_orphan_present(sb); } error = ext4_commit_super(sb); out: if (journal) /* we rely on upper layer to stop further updates */ jbd2_journal_unlock_updates(journal); return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ static int ext4_unfreeze(struct super_block *sb) { if (ext4_forced_shutdown(sb)) return 0; if (EXT4_SB(sb)->s_journal) { /* Reset the needs_recovery flag before the fs is unlocked. 
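* ext4_freeze() cleared it after flushing the journal; once updates can
* resume, the flag must be set again so that a crash is recovered from
* correctly.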
*/ ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } ext4_commit_super(sb); return 0; } /* * Structure to save mount options for ext4_remount's benefit */ struct ext4_mount_options { unsigned long s_mount_opt; unsigned long s_mount_opt2; kuid_t s_resuid; kgid_t s_resgid; unsigned long s_commit_interval; u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; char *s_qf_names[EXT4_MAXQUOTAS]; #endif }; static int __ext4_remount(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; int alloc_ctx; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; char *to_free[EXT4_MAXQUOTAS]; #endif /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_mount_opt2 = sbi->s_mount_opt2; old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; old_opts.s_min_batch_time = sbi->s_min_batch_time; old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { char *qf_name = get_qf_name(sb, sbi, i); old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); if (!old_opts.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(old_opts.s_qf_names[j]); return -ENOMEM; } } else old_opts.s_qf_names[i] = NULL; #endif if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) { if (sbi->s_journal && sbi->s_journal->j_task->io_context) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; } /* * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause * two calls to ext4_should_dioread_nolock() to return inconsistent * values, triggering WARN_ON in ext4_add_complete_io(). we grab * here s_writepages_rwsem to avoid race between writepages ops and * remount. */ alloc_ctx = ext4_writepages_down_write(sb); ext4_apply_options(fc, sb); ext4_writepages_up_write(sb, alloc_ctx); if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " "during remount not supported; ignoring"); sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); err = -EINVAL; goto restore_opts; } if (test_opt(sb, DIOREAD_NOLOCK)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dioread_nolock"); err = -EINVAL; goto restore_opts; } } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); err = -EINVAL; goto restore_opts; } } if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) { ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount"); err = -EINVAL; goto restore_opts; } if (test_opt2(sb, ABORT)) ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? 
SB_POSIXACL : 0); es = sbi->s_es; if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); } /* Flush outstanding errors before changing fs state */ flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { if (ext4_forced_shutdown(sb)) { err = -EROFS; goto restore_opts; } if (fc->sb_flags & SB_RDONLY) { err = sync_filesystem(sb); if (err < 0) goto restore_opts; err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; /* * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount */ sb->s_flags |= SB_RDONLY; /* * OK, test if we are remounting a valid rw partition * readonly, and if so set the rdonly flag and then * mark the partition as valid again. */ if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); if (sbi->s_journal) { /* * We let remount-ro finish even if marking fs * as clean failed... */ ext4_mark_recovery_complete(sb, es); } } else { /* Make sure we can mount this feature set readwrite */ if (ext4_has_feature_readonly(sb) || !ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } /* * Make sure the group descriptor checksums * are sane. If they aren't, refuse to remount r/w. */ for (g = 0; g < sbi->s_groups_count; g++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, g, NULL); if (!ext4_group_desc_csum_verify(sb, g, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_remount: Checksum for group %u failed (%u!=%u)", g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)), le16_to_cpu(gdp->bg_checksum)); err = -EFSBADCRC; goto restore_opts; } } /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, * require a full umount/remount for now. */ if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) { ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " "umount/remount instead"); err = -EINVAL; goto restore_opts; } /* * Mounting a RDONLY partition read-write, so reread * and store the current valid flag. (It may have * been changed by e2fsck since we originally mounted * the partition.) */ if (sbi->s_journal) { err = ext4_clear_journal_err(sb, es); if (err) goto restore_opts; } sbi->s_mount_state = (le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY); err = ext4_setup_super(sb, es, 0); if (err) goto restore_opts; sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto restore_opts; } #ifdef CONFIG_QUOTA enable_quota = 1; #endif } } /* * Handle creation of system zone data early because it can fail. * Releasing of existing data is done when we are sure remount will * succeed. 
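* (The matching ext4_release_system_zone() call happens further below,
* after the steps that can still fail.)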
*/ if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) { err = ext4_setup_system_zone(sb); if (err) goto restore_opts; } if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { err = ext4_commit_super(sb); if (err) goto restore_opts; } #ifdef CONFIG_QUOTA if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); else if (ext4_has_feature_quota(sb)) { err = ext4_enable_quotas(sb); if (err) goto restore_opts; } } /* Release old quota file names */ for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(old_opts.s_qf_names[i]); #endif if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); /* * Reinitialize lazy itable initialization thread based on * current settings */ if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) ext4_unregister_li_request(sb); else { ext4_group_t first_not_zeroed; first_not_zeroed = ext4_has_uninit_itable(sb); ext4_register_li_request(sb, first_not_zeroed); } if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); return 0; restore_opts: /* * If there was a failing r/w to ro transition, we may need to * re-enable quota */ if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); alloc_ctx = ext4_writepages_down_write(sb); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; ext4_writepages_up_write(sb, alloc_ctx); if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { to_free[i] = get_qf_name(sb, sbi, i); rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); } synchronize_rcu(); for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(to_free[i]); #endif if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); return err; } static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; fc->s_fs_info = EXT4_SB(sb); ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) return ret; ret = __ext4_remount(fc, sb); if (ret < 0) return ret; ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", ext4_quota_mode(sb)); return 0; } #ifdef CONFIG_QUOTA static int ext4_statfs_project(struct super_block *sb, kprojid_t projid, struct kstatfs *buf) { struct kqid qid; struct dquot *dquot; u64 limit; u64 curblock; qid = make_kqid_projid(projid); dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; buf->f_blocks = limit; buf->f_bfree = buf->f_bavail = (buf->f_blocks > curblock) ? (buf->f_blocks - curblock) : 0; } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 
(buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; } spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } #endif static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t overhead = 0, resv_blocks; s64 bfree; resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); if (!test_opt(sb, MINIX_DF)) overhead = sbi->s_overhead; buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); buf->f_bavail = buf->f_bfree - (ext4_r_blocks_count(es) + resv_blocks); if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); buf->f_namelen = EXT4_NAME_LEN; buf->f_fsid = uuid_to_fsid(es->s_uuid); #ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && sb_has_quota_limits_enabled(sb, PRJQUOTA)) ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); #endif return 0; } #ifdef CONFIG_QUOTA /* * Helper functions so that transaction is started before we acquire dqio_sem * to keep correct lock ordering of transaction > dqio_sem */ static inline struct inode *dquot_to_inode(struct dquot *dquot) { return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; } static int ext4_write_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; struct inode *inode; inode = dquot_to_inode(dquot); handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit(dquot); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_acquire_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_acquire(dquot); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); return PTR_ERR(handle); } ret = dquot_release(dquot); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_mark_dquot_dirty(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; if (ext4_is_quota_journalled(sb)) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); } else { return dquot_mark_dquot_dirty(dquot); } } static int ext4_write_info(struct super_block *sb, int type) { int ret, err; handle_t *handle; /* Data block + inode block */ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static void lockdep_set_quota_inode(struct inode *inode, int subclass) { struct ext4_inode_info *ei = EXT4_I(inode); /* The first argument of lockdep_set_subclass has to 
be * *exactly* the same as the argument to init_rwsem() --- in * this case, in init_once() --- or lockdep gets unhappy * because the name of the lock is set using the * stringification of the argument to init_rwsem(). */ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ lockdep_set_subclass(&ei->i_data_sem, subclass); } /* * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int err; if (!test_opt(sb, QUOTA)) return -EINVAL; /* Quotafile not on the same filesystem? */ if (path->dentry->d_sb != sb) return -EXDEV; /* Quota already enabled for this file? */ if (IS_NOQUOTA(d_inode(path->dentry))) return -EBUSY; /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ if (path->dentry->d_parent != sb->s_root) ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY; } else { /* * Clear the flag just in case mount options changed since * last time. */ sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; } lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); if (!err) { struct inode *inode = d_inode(path->dentry); handle_t *handle; /* * Set inode flags to prevent userspace from messing with quota * files. If this fails, we return success anyway since quotas * are already enabled and this is not a hard failure. */ inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto unlock_inode; EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); if (err) dquot_quota_off(sb, type); } if (err) lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_NORMAL); return err; } static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) { switch (type) { case USRQUOTA: return qf_inum == EXT4_USR_QUOTA_INO; case GRPQUOTA: return qf_inum == EXT4_GRP_QUOTA_INO; case PRJQUOTA: return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; default: BUG(); } } static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { int err; struct inode *qf_inode; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; BUG_ON(!ext4_has_feature_quota(sb)); if (!qf_inums[type]) return -EPERM; if (!ext4_check_quota_inum(type, qf_inums[type])) { ext4_error(sb, "Bad quota inum: %lu, type: %d", qf_inums[type], type); return -EUCLEAN; } qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { ext4_error(sb, "Bad quota inode: %lu, type: %d", qf_inums[type], type); return PTR_ERR(qf_inode); } /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_load_quota_inode(qf_inode, type, format_id, flags); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); iput(qf_inode); return err; } /* Enable usage tracking for all quota types. 
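* Limits enforcement is turned on only for the types that were also
* requested via the usrquota/grpquota/prjquota mount options.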
*/ int ext4_enable_quotas(struct super_block *sb) { int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; bool quota_mopt[EXT4_MAXQUOTAS] = { test_opt(sb, USRQUOTA), test_opt(sb, GRPQUOTA), test_opt(sb, PRJQUOTA), }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d, ino=%lu). " "Please run e2fsck to fix.", type, err, qf_inums[type]); ext4_quotas_off(sb, type); return err; } } } return 0; } static int ext4_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; handle_t *handle; int err; /* Force all delayed allocation blocks to be allocated. * Caller already holds s_umount sem */ if (test_opt(sb, DELALLOC)) sync_filesystem(sb); if (!inode || !igrab(inode)) goto out; err = dquot_quota_off(sb, type); if (err || ext4_has_feature_quota(sb)) goto out_put; /* * When the filesystem was remounted read-only first, we cannot cleanup * inode flags here. Bad luck but people should be using QUOTA feature * these days anyway. */ if (sb_rdonly(sb)) goto out_put; inode_lock(inode); /* * Update modification times of quota files when userspace can * start looking at them. If we fail, we return success anyway since * this is not a hard failure and quotas are already disabled. */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_unlock; } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: inode_unlock(inode); out_put: lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL); iput(inode); return err; out: return dquot_quota_off(sb, type); } /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) * we don't have to be afraid of races */ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; struct buffer_head *bh; loff_t i_size = i_size_read(inode); if (off > i_size) return 0; if (off+len > i_size) len = i_size-off; toread = len; while (toread > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); bh = ext4_bread(NULL, inode, blk, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) /* A hole? 
*/ memset(data, 0, tocopy); else memcpy(data, bh->b_data+offset, tocopy); brelse(bh); offset = 0; toread -= tocopy; data += tocopy; blk++; } return len; } /* Write to quotafile (we know the transaction is already started and has * enough credits) */ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1); int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); if (!handle) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); return -EIO; } /* * Since we account only one data block in transaction credits, * then it is impossible to cross a block boundary. */ if (sb->s_blocksize - offset < len) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because not block aligned", (unsigned long long)off, (unsigned long long)len); return -EIO; } do { bh = ext4_bread(handle, inode, blk, EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL); } while (PTR_ERR(bh) == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) goto out; BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); return err; } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); flush_dcache_page(bh->b_page); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; } return err ? err : len; } #endif #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static inline void register_as_ext2(void) { int err = register_filesystem(&ext2_fs_type); if (err) printk(KERN_WARNING "EXT4-fs: Unable to register as ext2 (%d)\n", err); } static inline void unregister_as_ext2(void) { unregister_filesystem(&ext2_fs_type); } static inline int ext2_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext2_incompat_features(sb)) return 0; if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext2_ro_compat_features(sb)) return 0; return 1; } #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } #endif static inline void register_as_ext3(void) { int err = register_filesystem(&ext3_fs_type); if (err) printk(KERN_WARNING "EXT4-fs: Unable to register as ext3 (%d)\n", err); } static inline void unregister_as_ext3(void) { unregister_filesystem(&ext3_fs_type); } static inline int ext3_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext3_incompat_features(sb)) return 0; if (!ext4_has_feature_journal(sb)) return 0; if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext3_ro_compat_features(sb)) return 0; return 1; } static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct bdev_handle *handle = sbi ? 
sbi->s_journal_bdev_handle : NULL; kill_block_super(sb); if (handle) bdev_release(handle); } static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ext4"); /* Shared across all ext4 file systems */ wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; static int __init ext4_init_fs(void) { int i, err; ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; /* Build-time check for flags consistency */ ext4_check_flag_values(); for (i = 0; i < EXT4_WQ_HASH_SZ; i++) init_waitqueue_head(&ext4__ioend_wq[i]); err = ext4_init_es(); if (err) return err; err = ext4_init_pending(); if (err) goto out7; err = ext4_init_post_read_processing(); if (err) goto out6; err = ext4_init_pageio(); if (err) goto out5; err = ext4_init_system_zone(); if (err) goto out4; err = ext4_init_sysfs(); if (err) goto out3; err = ext4_init_mballoc(); if (err) goto out2; err = init_inodecache(); if (err) goto out1; err = ext4_fc_init_dentry_cache(); if (err) goto out05; register_as_ext3(); register_as_ext2(); err = register_filesystem(&ext4_fs_type); if (err) goto out; return 0; out: unregister_as_ext2(); unregister_as_ext3(); ext4_fc_destroy_dentry_cache(); out05: destroy_inodecache(); out1: ext4_exit_mballoc(); out2: ext4_exit_sysfs(); out3: ext4_exit_system_zone(); out4: ext4_exit_pageio(); out5: ext4_exit_post_read_processing(); out6: ext4_exit_pending(); out7: ext4_exit_es(); return err; } static void __exit ext4_exit_fs(void) { ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); ext4_fc_destroy_dentry_cache(); destroy_inodecache(); ext4_exit_mballoc(); ext4_exit_sysfs(); ext4_exit_system_zone(); ext4_exit_pageio(); ext4_exit_post_read_processing(); ext4_exit_es(); ext4_exit_pending(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); MODULE_SOFTDEP("pre: crc32c"); module_init(ext4_init_fs) module_exit(ext4_exit_fs)
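/*
 * Illustrative userspace sketch (not part of the ext4 sources above): the
 * per-type usage that ext4_quota_read()/ext4_quota_write() maintain for the
 * VFS dquot layer is what userspace ultimately queries through quotactl(2).
 * The device path below is hypothetical and error handling is minimal.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/quota.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *dev = argc > 1 ? argv[1] : "/dev/sdXN";	/* hypothetical device */
	struct dqblk dq;

	if (quotactl(QCMD(Q_GETQUOTA, USRQUOTA), dev, getuid(), (caddr_t)&dq) != 0) {
		perror("quotactl");
		return 1;
	}
	printf("space used: %llu bytes, block soft limit: %llu (1K blocks), inodes used: %llu\n",
	       (unsigned long long)dq.dqb_curspace,
	       (unsigned long long)dq.dqb_bsoftlimit,
	       (unsigned long long)dq.dqb_curinodes);
	return 0;
}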
/* SPDX-License-Identifier: GPL-2.0 */ /* * sysfs.h - definitions for the device driver filesystem * * Copyright (c) 2001,2002 Patrick Mochel * Copyright (c) 2004 Silicon Graphics, Inc. * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information.
*/ #ifndef _SYSFS_H_ #define _SYSFS_H_ #include <linux/kernfs.h> #include <linux/compiler.h> #include <linux/errno.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/stat.h> #include <linux/atomic.h> struct kobject; struct module; struct bin_attribute; enum kobj_ns_type; struct attribute { const char *name; umode_t mode; #ifdef CONFIG_DEBUG_LOCK_ALLOC bool ignore_lockdep:1; struct lock_class_key *key; struct lock_class_key skey; #endif }; /** * sysfs_attr_init - initialize a dynamically allocated sysfs attribute * @attr: struct attribute to initialize * * Initialize a dynamically allocated struct attribute so we can * make lockdep happy. This is a new requirement for attributes * and initially this is only needed when lockdep is enabled. * Lockdep gives a nice error when your attribute is added to * sysfs if you don't have this. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_attr_init(attr) \ do { \ static struct lock_class_key __key; \ \ (attr)->key = &__key; \ } while (0) #else #define sysfs_attr_init(attr) do {} while (0) #endif /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name * If specified, the attribute group will be created in * a new subdirectory with this name. * @is_visible: Optional: Function to return permissions associated with an * attribute of the group. Will be called repeatedly for each * non-binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if an attribute is not visible. The returned value * will replace static permissions defined in struct attribute. * @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if a binary attribute is not visible. The returned * value will replace static permissions defined in * struct bin_attribute. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. */ struct attribute_group { const char *name; umode_t (*is_visible)(struct kobject *, struct attribute *, int); umode_t (*is_bin_visible)(struct kobject *, struct bin_attribute *, int); struct attribute **attrs; struct bin_attribute **bin_attrs; }; /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. 
*/ #define SYSFS_PREALLOC 010000 #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _show, \ .store = _store, \ } #define __ATTR_PREALLOC(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\ .show = _show, \ .store = _store, \ } #define __ATTR_RO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .show = _name##_show, \ } #define __ATTR_RO_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ } #define __ATTR_RW_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ .store = _name##_store, \ } #define __ATTR_WO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .store = _name##_store, \ } #define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store) #define __ATTR_NULL { .attr = { .name = NULL } } #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), .mode = _mode, \ .ignore_lockdep = true }, \ .show = _show, \ .store = _store, \ } #else #define __ATTR_IGNORE_LOCKDEP __ATTR #endif #define __ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group *_name##_groups[] = { \ &_name##_group, \ NULL, \ } #define ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) #define BIN_ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .bin_attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) struct file; struct vm_area_struct; struct address_space; struct bin_attribute { struct attribute attr; size_t size; void *private; struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, struct bin_attribute *attr, struct vm_area_struct *vma); }; /** * sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute * @attr: struct bin_attribute to initialize * * Initialize a dynamically allocated struct bin_attribute so we * can make lockdep happy. This is a new requirement for * attributes and initially this is only needed when lockdep is * enabled. Lockdep gives a nice error when your attribute is * added to sysfs if you don't have this. 
*/ #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr) /* macros to create static binary attributes easier */ #define __BIN_ATTR(_name, _mode, _read, _write, _size) { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .read = _read, \ .write = _write, \ .size = _size, \ } #define __BIN_ATTR_RO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .read = _name##_read, \ .size = _size, \ } #define __BIN_ATTR_WO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .write = _name##_write, \ .size = _size, \ } #define __BIN_ATTR_RW(_name, _size) \ __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size) #define __BIN_ATTR_NULL __ATTR_NULL #define BIN_ATTR(_name, _mode, _read, _write, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read, \ _write, _size) #define BIN_ATTR_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size) #define BIN_ATTR_WO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size) #define BIN_ATTR_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size) #define __BIN_ATTR_ADMIN_RO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0400 }, \ .read = _name##_read, \ .size = _size, \ } #define __BIN_ATTR_ADMIN_RW(_name, _size) \ __BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size) #define BIN_ATTR_ADMIN_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size) #define BIN_ATTR_ADMIN_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size) struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; #ifdef CONFIG_SYSFS int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns); int __must_check sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns); int __must_check sysfs_create_mount_point(struct kobject *parent_kobj, const char *name); void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name); int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); int __must_check sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode); struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr); void sysfs_unbreak_active_protection(struct kernfs_node *kn); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr); int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name); int __must_check sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name); void sysfs_remove_link(struct kobject *kobj, const char *name); int sysfs_rename_link_ns(struct 
kobject *kobj, struct kobject *target, const char *old_name, const char *new_name, const void *new_ns); void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups); int __must_check sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name); int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name); void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); int __must_check sysfs_init(void); static inline void sysfs_enable_ns(struct kernfs_node *kn) { return kernfs_enable_ns(kn); } int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid); __printf(2, 3) int sysfs_emit(char *buf, const char *fmt, ...); __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); #else /* CONFIG_SYSFS */ static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; } static inline void sysfs_remove_dir(struct kobject *kobj) { } static inline int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { return 0; } static inline int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { return 0; } static inline int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { return 0; } static inline void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { } static inline int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { return 0; } static inline int sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr) { return 0; } static inline int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { return 0; } static inline struct kernfs_node * sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { return NULL; } static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn) { } static inline void sysfs_remove_file_ns(struct 
kobject *kobj, const struct attribute *attr, const void *ns) { } static inline bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { return false; } static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr) { } static inline int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { return 0; } static inline void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { } static inline int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline void sysfs_remove_link(struct kobject *kobj, const char *name) { } static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t, const char *old_name, const char *new_name, const void *ns) { return 0; } static inline void sysfs_delete_link(struct kobject *k, struct kobject *t, const char *name) { } static inline int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { } static inline int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { return 0; } static inline void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { } static inline int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { return 0; } static inline void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { } static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { return 0; } static inline void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { } static inline int __must_check sysfs_init(void) { return 0; } static inline void sysfs_enable_ns(struct kernfs_node *kn) { } static inline int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid) { return 0; } __printf(2, 3) static inline int sysfs_emit(char *buf, const char *fmt, ...) 
{ return 0; } __printf(3, 4) static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, const struct attribute *attr) { return sysfs_create_file_ns(kobj, attr, NULL); } static inline void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) { sysfs_remove_file_ns(kobj, attr, NULL); } static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name) { return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL); } static inline void sysfs_notify_dirent(struct kernfs_node *kn) { kernfs_notify(kn); } static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent, const char *name) { return kernfs_find_and_get(parent, name); } static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn) { kernfs_get(kn); return kn; } static inline void sysfs_put(struct kernfs_node *kn) { kernfs_put(kn); } #endif /* _SYSFS_H_ */
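/*
 * Illustrative sketch (not part of the header above) of how a module
 * typically consumes these declarations: __ATTR_RW(), ATTRIBUTE_GROUPS(),
 * sysfs_create_groups() and sysfs_emit(). Every "example_" name is
 * hypothetical. After loading, the attribute would appear as
 * /sys/kernel/sysfs_example/example_value.
 */
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/sysfs.h>

static int example_value;
static struct kobject *example_kobj;

static ssize_t example_value_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", example_value);
}

static ssize_t example_value_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int ret = kstrtoint(buf, 10, &example_value);

	return ret ? ret : count;
}

/* __ATTR_RW() picks up example_value_show()/example_value_store() by name. */
static struct kobj_attribute example_value_attr = __ATTR_RW(example_value);

static struct attribute *example_attrs[] = {
	&example_value_attr.attr,
	NULL,
};
ATTRIBUTE_GROUPS(example);

static int __init example_init(void)
{
	int ret;

	example_kobj = kobject_create_and_add("sysfs_example", kernel_kobj);
	if (!example_kobj)
		return -ENOMEM;
	ret = sysfs_create_groups(example_kobj, example_groups);
	if (ret)
		kobject_put(example_kobj);
	return ret;
}

static void __exit example_exit(void)
{
	sysfs_remove_groups(example_kobj, example_groups);
	kobject_put(example_kobj);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");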
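/*
 * A second illustrative sketch (hypothetical, not from the header above):
 * exposing a small read-only binary blob with BIN_ATTR_RO() and
 * sysfs_create_bin_file(). Assumes memory_read_from_buffer() from
 * <linux/string.h>; the "blob"/"example_" names are made up.
 */
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>

static const char example_blob[] = "hello from sysfs\n";

/* BIN_ATTR_RO(blob, ...) wires .read to a function named blob_read(). */
static ssize_t blob_read(struct file *filp, struct kobject *kobj,
			 struct bin_attribute *attr, char *buf,
			 loff_t off, size_t count)
{
	return memory_read_from_buffer(buf, count, &off,
				       example_blob, sizeof(example_blob));
}
static BIN_ATTR_RO(blob, sizeof(example_blob));

/*
 * Registration against an already-created kobject, e.g. from module init:
 *	err = sysfs_create_bin_file(example_kobj, &bin_attr_blob);
 */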
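/*
 * Illustrative userspace counterpart (hypothetical program, not part of the
 * kernel sources in this document) to the AF_NETLINK implementation that
 * follows: create a NETLINK_ROUTE socket, bind with nl_pid == 0 so the
 * kernel autobinds a port id (see netlink_autobind() below), and subscribe
 * to the RTMGRP_LINK multicast group (see the group checks in netlink_bind()).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct sockaddr_nl addr;
	socklen_t alen = sizeof(addr);
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	memset(&addr, 0, sizeof(addr));
	addr.nl_family = AF_NETLINK;
	addr.nl_pid = 0;		/* 0: kernel assigns a unique port id */
	addr.nl_groups = RTMGRP_LINK;	/* link up/down notifications */
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind");
		close(fd);
		return 1;
	}
	if (getsockname(fd, (struct sockaddr *)&addr, &alen) == 0)
		printf("bound as netlink port id %u\n", addr.nl_pid);
	close(fd);
	return 0;
}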
// SPDX-License-Identifier: GPL-2.0-or-later /* * NETLINK Kernel-user communication protocol. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Patrick McHardy <kaber@trash.net> * * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith * added netlink_proto_exit * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br> * use nlk_sk, as sk->protinfo is on a diet 8) * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org> * - inc module use count of module that owns * the kernel socket in case userspace opens * socket of same protocol * - remove all module support, since netlink is * mandatory if CONFIG_NET=y these days */ #include <linux/module.h> #include <linux/bpf.h> #include <linux/capability.h> #include <linux/kernel.h> #include <linux/filter.h> #include <linux/init.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/fcntl.h> #include <linux/termios.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/security.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/random.h> #include <linux/bitops.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/audit.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/if_arp.h> #include <linux/rhashtable.h> #include <asm/cacheflush.h> #include <linux/hash.h> #include <linux/genetlink.h> #include <linux/net_namespace.h> #include <linux/nospec.h> #include <linux/btf_ids.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> #define CREATE_TRACE_POINTS #include <trace/events/netlink.h> #include "af_netlink.h" struct listeners { struct rcu_head rcu; unsigned long masks[]; }; /* state bits */ #define NETLINK_S_CONGESTED 0x0 static inline int netlink_is_kernel(struct sock
*sk) { return nlk_test_bit(KERNEL_SOCKET, sk); } struct netlink_table *nl_table __read_mostly; EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS]; static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { "nlk_cb_mutex-ROUTE", "nlk_cb_mutex-1", "nlk_cb_mutex-USERSOCK", "nlk_cb_mutex-FIREWALL", "nlk_cb_mutex-SOCK_DIAG", "nlk_cb_mutex-NFLOG", "nlk_cb_mutex-XFRM", "nlk_cb_mutex-SELINUX", "nlk_cb_mutex-ISCSI", "nlk_cb_mutex-AUDIT", "nlk_cb_mutex-FIB_LOOKUP", "nlk_cb_mutex-CONNECTOR", "nlk_cb_mutex-NETFILTER", "nlk_cb_mutex-IP6_FW", "nlk_cb_mutex-DNRTMSG", "nlk_cb_mutex-KOBJECT_UEVENT", "nlk_cb_mutex-GENERIC", "nlk_cb_mutex-17", "nlk_cb_mutex-SCSITRANSPORT", "nlk_cb_mutex-ECRYPTFS", "nlk_cb_mutex-RDMA", "nlk_cb_mutex-CRYPTO", "nlk_cb_mutex-SMC", "nlk_cb_mutex-23", "nlk_cb_mutex-24", "nlk_cb_mutex-25", "nlk_cb_mutex-26", "nlk_cb_mutex-27", "nlk_cb_mutex-28", "nlk_cb_mutex-29", "nlk_cb_mutex-30", "nlk_cb_mutex-31", "nlk_cb_mutex-MAX_LINKS" }; static int netlink_dump(struct sock *sk); /* nl_table locking explained: * Lookup and traversal are protected with an RCU read-side lock. Insertion * and removal are protected with per bucket lock while using RCU list * modification primitives and may run in parallel to RCU protected lookups. * Destruction of the Netlink socket may only occur *after* nl_table_lock has * been acquired * either during or after the socket has been removed from * the list and after an RCU grace period. */ DEFINE_RWLOCK(nl_table_lock); EXPORT_SYMBOL_GPL(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); #define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); static BLOCKING_NOTIFIER_HEAD(netlink_chain); static const struct rhashtable_params netlink_rhashtable_params; void do_trace_netlink_extack(const char *msg) { trace_netlink_extack(msg); } EXPORT_SYMBOL(do_trace_netlink_extack); static inline u32 netlink_group_mask(u32 group) { if (group > 32) return 0; return group ? 1 << (group - 1) : 0; } static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, gfp_t gfp_mask) { unsigned int len = skb_end_offset(skb); struct sk_buff *new; new = alloc_skb(len, gfp_mask); if (new == NULL) return NULL; NETLINK_CB(new).portid = NETLINK_CB(skb).portid; NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; NETLINK_CB(new).creds = NETLINK_CB(skb).creds; skb_put_data(new, skb->data, len); return new; } static unsigned int netlink_tap_net_id; struct netlink_tap_net { struct list_head netlink_tap_all; struct mutex netlink_tap_lock; }; int netlink_add_tap(struct netlink_tap *nt) { struct net *net = dev_net(nt->dev); struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); if (unlikely(nt->dev->type != ARPHRD_NETLINK)) return -EINVAL; mutex_lock(&nn->netlink_tap_lock); list_add_rcu(&nt->list, &nn->netlink_tap_all); mutex_unlock(&nn->netlink_tap_lock); __module_get(nt->module); return 0; } EXPORT_SYMBOL_GPL(netlink_add_tap); static int __netlink_remove_tap(struct netlink_tap *nt) { struct net *net = dev_net(nt->dev); struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); bool found = false; struct netlink_tap *tmp; mutex_lock(&nn->netlink_tap_lock); list_for_each_entry(tmp, &nn->netlink_tap_all, list) { if (nt == tmp) { list_del_rcu(&nt->list); found = true; goto out; } } pr_warn("__netlink_remove_tap: %p not found\n", nt); out: mutex_unlock(&nn->netlink_tap_lock); if (found) module_put(nt->module); return found ? 
0 : -ENODEV; } int netlink_remove_tap(struct netlink_tap *nt) { int ret; ret = __netlink_remove_tap(nt); synchronize_net(); return ret; } EXPORT_SYMBOL_GPL(netlink_remove_tap); static __net_init int netlink_tap_init_net(struct net *net) { struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); INIT_LIST_HEAD(&nn->netlink_tap_all); mutex_init(&nn->netlink_tap_lock); return 0; } static struct pernet_operations netlink_tap_net_ops = { .init = netlink_tap_init_net, .id = &netlink_tap_net_id, .size = sizeof(struct netlink_tap_net), }; static bool netlink_filter_tap(const struct sk_buff *skb) { struct sock *sk = skb->sk; /* We take the more conservative approach and * whitelist socket protocols that may pass. */ switch (sk->sk_protocol) { case NETLINK_ROUTE: case NETLINK_USERSOCK: case NETLINK_SOCK_DIAG: case NETLINK_NFLOG: case NETLINK_XFRM: case NETLINK_FIB_LOOKUP: case NETLINK_NETFILTER: case NETLINK_GENERIC: return true; } return false; } static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *nskb; struct sock *sk = skb->sk; int ret = -ENOMEM; if (!net_eq(dev_net(dev), sock_net(sk))) return 0; dev_hold(dev); if (is_vmalloc_addr(skb->head)) nskb = netlink_to_full_skb(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); nskb->pkt_type = netlink_is_kernel(sk) ? PACKET_KERNEL : PACKET_USER; skb_reset_network_header(nskb); ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) ret = net_xmit_errno(ret); } dev_put(dev); return ret; } static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn) { int ret; struct netlink_tap *tmp; if (!netlink_filter_tap(skb)) return; list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) { ret = __netlink_deliver_tap_skb(skb, tmp->dev); if (unlikely(ret)) break; } } static void netlink_deliver_tap(struct net *net, struct sk_buff *skb) { struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); rcu_read_lock(); if (unlikely(!list_empty(&nn->netlink_tap_all))) __netlink_deliver_tap(skb, nn); rcu_read_unlock(); } static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, struct sk_buff *skb) { if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) netlink_deliver_tap(sock_net(dst), skb); } static void netlink_overrun(struct sock *sk) { if (!nlk_test_bit(RECV_NO_ENOBUFS, sk)) { if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { WRITE_ONCE(sk->sk_err, ENOBUFS); sk_error_report(sk); } } atomic_inc(&sk->sk_drops); } static void netlink_rcv_wake(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (skb_queue_empty_lockless(&sk->sk_receive_queue)) clear_bit(NETLINK_S_CONGESTED, &nlk->state); if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) wake_up_interruptible(&nlk->wait); } static void netlink_skb_destructor(struct sk_buff *skb) { if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) vfree_atomic(skb->head); skb->head = NULL; } if (skb->sk != NULL) sock_rfree(skb); } static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { WARN_ON(skb->sk != NULL); skb->sk = sk; skb->destructor = netlink_skb_destructor; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); } static void netlink_sock_destruct(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->cb_running) { if (nlk->cb.done) nlk->cb.done(&nlk->cb); module_put(nlk->cb.module); kfree_skb(nlk->cb.skb); } 
skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); return; } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(nlk_sk(sk)->groups); } static void netlink_sock_destruct_work(struct work_struct *work) { struct netlink_sock *nlk = container_of(work, struct netlink_sock, work); sk_free(&nlk->sk); } /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves * this, _but_ remember, it adds useless work on UP machines. */ void netlink_table_grab(void) __acquires(nl_table_lock) { might_sleep(); write_lock_irq(&nl_table_lock); if (atomic_read(&nl_table_users)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&nl_table_wait, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (atomic_read(&nl_table_users) == 0) break; write_unlock_irq(&nl_table_lock); schedule(); write_lock_irq(&nl_table_lock); } __set_current_state(TASK_RUNNING); remove_wait_queue(&nl_table_wait, &wait); } } void netlink_table_ungrab(void) __releases(nl_table_lock) { write_unlock_irq(&nl_table_lock); wake_up(&nl_table_wait); } static inline void netlink_lock_table(void) { unsigned long flags; /* read_lock() synchronizes us to netlink_table_grab */ read_lock_irqsave(&nl_table_lock, flags); atomic_inc(&nl_table_users); read_unlock_irqrestore(&nl_table_lock, flags); } static inline void netlink_unlock_table(void) { if (atomic_dec_and_test(&nl_table_users)) wake_up(&nl_table_wait); } struct netlink_compare_arg { possible_net_t pnet; u32 portid; }; /* Doing sizeof directly may yield 4 extra bytes on 64-bit. 
*/ #define netlink_compare_arg_len \ (offsetof(struct netlink_compare_arg, portid) + sizeof(u32)) static inline int netlink_compare(struct rhashtable_compare_arg *arg, const void *ptr) { const struct netlink_compare_arg *x = arg->key; const struct netlink_sock *nlk = ptr; return nlk->portid != x->portid || !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); } static void netlink_compare_arg_init(struct netlink_compare_arg *arg, struct net *net, u32 portid) { memset(arg, 0, sizeof(*arg)); write_pnet(&arg->pnet, net); arg->portid = portid; } static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, struct net *net) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, net, portid); return rhashtable_lookup_fast(&table->hash, &arg, netlink_rhashtable_params); } static int __netlink_insert(struct netlink_table *table, struct sock *sk) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); return rhashtable_lookup_insert_key(&table->hash, &arg, &nlk_sk(sk)->node, netlink_rhashtable_params); } static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { struct netlink_table *table = &nl_table[protocol]; struct sock *sk; rcu_read_lock(); sk = __netlink_lookup(table, portid, net); if (sk) sock_hold(sk); rcu_read_unlock(); return sk; } static const struct proto_ops netlink_ops; static void netlink_update_listeners(struct sock *sk) { struct netlink_table *tbl = &nl_table[sk->sk_protocol]; unsigned long mask; unsigned int i; struct listeners *listeners; listeners = nl_deref_protected(tbl->listeners); if (!listeners) return; for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { mask = 0; sk_for_each_bound(sk, &tbl->mc_list) { if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } listeners->masks[i] = mask; } /* this function is only called with the netlink table "grabbed", which * makes sure updates are visible before bind or setsockopt return. */ } static int netlink_insert(struct sock *sk, u32 portid) { struct netlink_table *table = &nl_table[sk->sk_protocol]; int err; lock_sock(sk); err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; if (nlk_sk(sk)->bound) goto err; /* portid can be read locklessly from netlink_getname(). */ WRITE_ONCE(nlk_sk(sk)->portid, portid); sock_hold(sk); err = __netlink_insert(table, sk); if (err) { /* In case the hashtable backend returns with -EBUSY * from here, it must not escape to the caller. */ if (unlikely(err == -EBUSY)) err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; sock_put(sk); goto err; } /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); /* Paired with lockless reads from netlink_bind(), * netlink_connect() and netlink_sendmsg(). 
*/ WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); return err; } static void netlink_remove(struct sock *sk) { struct netlink_table *table; table = &nl_table[sk->sk_protocol]; if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node, netlink_rhashtable_params)) { WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } netlink_table_grab(); if (nlk_sk(sk)->subscriptions) { __sk_del_bind_node(sk); netlink_update_listeners(sk); } if (sk->sk_protocol == NETLINK_GENERIC) atomic_inc(&genl_sk_destructing_cnt); netlink_table_ungrab(); } static struct proto netlink_proto = { .name = "NETLINK", .owner = THIS_MODULE, .obj_size = sizeof(struct netlink_sock), }; static int __netlink_create(struct net *net, struct socket *sock, struct mutex *cb_mutex, int protocol, int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); nlk = nlk_sk(sk); if (cb_mutex) { nlk->cb_mutex = cb_mutex; } else { nlk->cb_mutex = &nlk->cb_def_mutex; mutex_init(nlk->cb_mutex); lockdep_set_class_and_name(nlk->cb_mutex, nlk_cb_mutex_keys + protocol, nlk_cb_mutex_key_strings[protocol]); } init_waitqueue_head(&nlk->wait); sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; return 0; } static int netlink_create(struct net *net, struct socket *sock, int protocol, int kern) { struct module *module = NULL; struct mutex *cb_mutex; struct netlink_sock *nlk; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release)(struct sock *sock, unsigned long *groups); int err = 0; sock->state = SS_UNCONNECTED; if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; if (protocol < 0 || protocol >= MAX_LINKS) return -EPROTONOSUPPORT; protocol = array_index_nospec(protocol, MAX_LINKS); netlink_lock_table(); #ifdef CONFIG_MODULES if (!nl_table[protocol].registered) { netlink_unlock_table(); request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); netlink_lock_table(); } #endif if (nl_table[protocol].registered && try_module_get(nl_table[protocol].module)) module = nl_table[protocol].module; else err = -EPROTONOSUPPORT; cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; unbind = nl_table[protocol].unbind; release = nl_table[protocol].release; netlink_unlock_table(); if (err < 0) goto out; err = __netlink_create(net, sock, cb_mutex, protocol, kern); if (err < 0) goto out_module; sock_prot_inuse_add(net, &netlink_proto, 1); nlk = nlk_sk(sock->sk); nlk->module = module; nlk->netlink_bind = bind; nlk->netlink_unbind = unbind; nlk->netlink_release = release; out: return err; out_module: module_put(module); goto out; } static void deferred_put_nlk_sk(struct rcu_head *head) { struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); struct sock *sk = &nlk->sk; kfree(nlk->groups); nlk->groups = NULL; if (!refcount_dec_and_test(&sk->sk_refcnt)) return; if (nlk->cb_running && nlk->cb.done) { INIT_WORK(&nlk->work, netlink_sock_destruct_work); schedule_work(&nlk->work); return; } sk_free(sk); } static int netlink_release(struct socket *sock) { struct sock *sk = sock->sk; struct netlink_sock *nlk; if (!sk) return 0; netlink_remove(sk); sock_orphan(sk); nlk = nlk_sk(sk); /* * OK. Socket is unlinked, any packets that arrive now * will be purged. 
*/ if (nlk->netlink_release) nlk->netlink_release(sk, nlk->groups); /* must not acquire netlink_table_lock in any way again before unbind * and notifying genetlink is done as otherwise it might deadlock */ if (nlk->netlink_unbind) { int i; for (i = 0; i < nlk->ngroups; i++) if (test_bit(i, nlk->groups)) nlk->netlink_unbind(sock_net(sk), i + 1); } if (sk->sk_protocol == NETLINK_GENERIC && atomic_dec_return(&genl_sk_destructing_cnt) == 0) wake_up(&genl_sk_destructing_waitq); sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); skb_queue_purge(&sk->sk_write_queue); if (nlk->portid && nlk->bound) { struct netlink_notify n = { .net = sock_net(sk), .protocol = sk->sk_protocol, .portid = nlk->portid, }; blocking_notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); } module_put(nlk->module); if (netlink_is_kernel(sk)) { netlink_table_grab(); BUG_ON(nl_table[sk->sk_protocol].registered == 0); if (--nl_table[sk->sk_protocol].registered == 0) { struct listeners *old; old = nl_deref_protected(nl_table[sk->sk_protocol].listeners); RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL); kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; nl_table[sk->sk_protocol].unbind = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } netlink_table_ungrab(); } sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); /* Because struct net might disappear soon, do not keep a pointer. */ if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); /* Because of deferred_put_nlk_sk and use of work queue, * it is possible netns will be freed before this socket. */ sock_net_set(sk, &init_net); __netns_tracker_alloc(&init_net, &sk->ns_tracker, false, GFP_KERNEL); } call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_table *table = &nl_table[sk->sk_protocol]; s32 portid = task_tgid_vnr(current); int err; s32 rover = -4096; bool ok; retry: cond_resched(); rcu_read_lock(); ok = !__netlink_lookup(table, portid, net); rcu_read_unlock(); if (!ok) { /* Bind collision, search negative portid values. */ if (rover == -4096) /* rover will be in range [S32_MIN, -4097] */ rover = S32_MIN + get_random_u32_below(-4096 - S32_MIN); else if (rover >= -4096) rover = -4097; portid = rover--; goto retry; } err = netlink_insert(sk, portid); if (err == -EADDRINUSE) goto retry; /* If 2 threads race to autobind, that is fine. */ if (err == -EBUSY) err = 0; return err; } /** * __netlink_ns_capable - General netlink message capability test * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace. * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in the user namespace @user_ns. 
*/ bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *user_ns, int cap) { return ((nsp->flags & NETLINK_SKB_DST) || file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(__netlink_ns_capable); /** * netlink_ns_capable - General netlink message capability test * @skb: socket buffer holding a netlink command from userspace * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in the user namespace @user_ns. */ bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *user_ns, int cap) { return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap); } EXPORT_SYMBOL(netlink_ns_capable); /** * netlink_capable - Netlink global message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in all user namespaces. */ bool netlink_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, &init_user_ns, cap); } EXPORT_SYMBOL(netlink_capable); /** * netlink_net_capable - Netlink network namespace message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap over the network namespace of * the socket we received the message from. 
*/ bool netlink_net_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap); } EXPORT_SYMBOL(netlink_net_capable); static inline int netlink_allowed(const struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].flags & flag) || ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN); } static void netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->subscriptions && !subscriptions) __sk_del_bind_node(sk); else if (!nlk->subscriptions && subscriptions) sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); nlk->subscriptions = subscriptions; } static int netlink_realloc_groups(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); unsigned int groups; unsigned long *new_groups; int err = 0; netlink_table_grab(); groups = nl_table[sk->sk_protocol].groups; if (!nl_table[sk->sk_protocol].registered) { err = -ENOENT; goto out_unlock; } if (nlk->ngroups >= groups) goto out_unlock; new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC); if (new_groups == NULL) { err = -ENOMEM; goto out_unlock; } memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0, NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups)); nlk->groups = new_groups; nlk->ngroups = groups; out_unlock: netlink_table_ungrab(); return err; } static void netlink_undo_bind(int group, long unsigned int groups, struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); int undo; if (!nlk->netlink_unbind) return; for (undo = 0; undo < group; undo++) if (test_bit(undo, &groups)) nlk->netlink_unbind(sock_net(sk), undo + 1); } static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err = 0; unsigned long groups; bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; if (nladdr->nl_family != AF_NETLINK) return -EINVAL; groups = nladdr->nl_groups; /* Only superuser is allowed to listen multicasts */ if (groups) { if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; } if (nlk->ngroups < BITS_PER_LONG) groups &= (1UL << nlk->ngroups) - 1; /* Paired with WRITE_ONCE() in netlink_insert() */ bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); if (nladdr->nl_pid != nlk->portid) return -EINVAL; } if (nlk->netlink_bind && groups) { int group; /* nl_groups is a u32, so cap the maximum groups we can bind */ for (group = 0; group < BITS_PER_TYPE(u32); group++) { if (!test_bit(group, &groups)) continue; err = nlk->netlink_bind(net, group + 1); if (!err) continue; netlink_undo_bind(group, groups, sk); return err; } } /* No need for barriers here as we return to user-space without * using any of the bound attributes. */ netlink_lock_table(); if (!bound) { err = nladdr->nl_pid ? 
netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); if (err) { netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk); goto unlock; } } if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) goto unlock; netlink_unlock_table(); netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + hweight32(groups) - hweight32(nlk->groups[0])); nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups; netlink_update_listeners(sk); netlink_table_ungrab(); return 0; unlock: netlink_unlock_table(); return err; } static int netlink_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { int err = 0; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; if (alen < sizeof(addr->sa_family)) return -EINVAL; if (addr->sa_family == AF_UNSPEC) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, 0); WRITE_ONCE(nlk->dst_group, 0); return 0; } if (addr->sa_family != AF_NETLINK) return -EINVAL; if (alen < sizeof(struct sockaddr_nl)) return -EINVAL; if ((nladdr->nl_groups || nladdr->nl_pid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; /* No need for barriers here as we return to user-space without * using any of the bound attributes. * Paired with WRITE_ONCE() in netlink_insert(). */ if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid); WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups)); } return err; } static int netlink_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr); nladdr->nl_family = AF_NETLINK; nladdr->nl_pad = 0; if (peer) { /* Paired with WRITE_ONCE() in netlink_connect() */ nladdr->nl_pid = READ_ONCE(nlk->dst_portid); nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group)); } else { /* Paired with WRITE_ONCE() in netlink_insert() */ nladdr->nl_pid = READ_ONCE(nlk->portid); netlink_lock_table(); nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; netlink_unlock_table(); } return sizeof(*nladdr); } static int netlink_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { /* try to hand this ioctl down to the NIC drivers. 
*/ return -ENOIOCTLCMD; } static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) { struct sock *sock; struct netlink_sock *nlk; sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid); if (!sock) return ERR_PTR(-ECONNREFUSED); /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); /* dst_portid and sk_state can be changed in netlink_connect() */ if (READ_ONCE(sock->sk_state) == NETLINK_CONNECTED && READ_ONCE(nlk->dst_portid) != nlk_sk(ssk)->portid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } return sock; } struct sock *netlink_getsockbyfilp(struct file *filp) { struct inode *inode = file_inode(filp); struct sock *sock; if (!S_ISSOCK(inode->i_mode)) return ERR_PTR(-ENOTSOCK); sock = SOCKET_I(inode)->sk; if (sock->sk_family != AF_NETLINK) return ERR_PTR(-EINVAL); sock_hold(sock); return sock; } struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast) { struct sk_buff *skb; void *data; if (size <= NLMSG_GOODSIZE || broadcast) return alloc_skb(size, GFP_KERNEL); size = SKB_DATA_ALIGN(size) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); data = vmalloc(size); if (data == NULL) return NULL; skb = __build_skb(data, size); if (skb == NULL) vfree(data); else skb->destructor = netlink_skb_destructor; return skb; } /* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the * reference is dropped. The skb is not sent to the destination; all * error checks are performed and memory in the queue is reserved. * Return values: * < 0: error. skb freed, reference to sock dropped. * 0: continue * 1: repeat lookup - reference dropped while waiting for socket memory. */ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk) { struct netlink_sock *nlk; nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(NETLINK_S_CONGESTED, &nlk->state))) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) netlink_overrun(sk); sock_put(sk); kfree_skb(skb); return -EAGAIN; } __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&nlk->wait, &wait); sock_put(sk); if (signal_pending(current)) { kfree_skb(skb); return sock_intr_errno(*timeo); } return 1; } netlink_skb_set_owner_r(skb, sk); return 0; } static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; netlink_deliver_tap(sock_net(sk), skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return len; } int netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = __netlink_sendskb(sk, skb); sock_put(sk); return len; } void netlink_detachskb(struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); sock_put(sk); } static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; WARN_ON(skb->sk != NULL); delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, allocation); if (!nskb) return skb; consume_skb(skb); skb = nskb; } pskb_expand_head(skb, 0, -delta, (allocation & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); return skb; } static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
struct sock *ssk) { int ret; struct netlink_sock *nlk = nlk_sk(sk); ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { ret = skb->len; netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; netlink_deliver_tap_kernel(sk, ssk, skb); nlk->netlink_rcv(skb); consume_skb(skb); } else { kfree_skb(skb); } sock_put(sk); return ret; } int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 portid, int nonblock) { struct sock *sk; int err; long timeo; skb = netlink_trim(skb, gfp_any()); timeo = sock_sndtimeo(ssk, nonblock); retry: sk = netlink_getsockbyportid(ssk, portid); if (IS_ERR(sk)) { kfree_skb(skb); return PTR_ERR(sk); } if (netlink_is_kernel(sk)) return netlink_unicast_kernel(sk, skb, ssk); if (sk_filter(sk, skb)) { err = skb->len; kfree_skb(skb); sock_put(sk); return err; } err = netlink_attachskb(sk, skb, &timeo, ssk); if (err == 1) goto retry; if (err) return err; return netlink_sendskb(sk, skb); } EXPORT_SYMBOL(netlink_unicast); int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; struct listeners *listeners; BUG_ON(!netlink_is_kernel(sk)); rcu_read_lock(); listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); if (listeners && group - 1 < nl_table[sk->sk_protocol].groups) res = test_bit(group - 1, listeners->masks); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(netlink_has_listeners); bool netlink_strict_get_check(struct sk_buff *skb) { return nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); } EXPORT_SYMBOL_GPL(netlink_strict_get_check); static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); } return -1; } struct netlink_broadcast_data { struct sock *exclude_sk; struct net *net; u32 portid; u32 group; int failure; int delivery_failure; int congested; int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); void *tx_data; }; static void do_one_broadcast(struct sock *sk, struct netlink_broadcast_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int val; if (p->exclude_sk == sk) return; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) return; if (!net_eq(sock_net(sk), p->net)) { if (!nlk_test_bit(LISTEN_ALL_NSID, sk)) return; if (!peernet_has_id(sock_net(sk), p->net)) return; if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, CAP_NET_BROADCAST)) return; } if (p->failure) { netlink_overrun(sk); return; } sock_hold(sk); if (p->skb2 == NULL) { if (skb_shared(p->skb)) { p->skb2 = skb_clone(p->skb, p->allocation); } else { p->skb2 = skb_get(p->skb); /* * skb ownership may have been set when * delivered to a previous socket. */ skb_orphan(p->skb2); } } if (p->skb2 == NULL) { netlink_overrun(sk); /* Clone failed. Notify ALL listeners. 
*/ p->failure = 1; if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; goto out; } if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) NETLINK_CB(p->skb2).nsid_is_set = true; val = netlink_broadcast_deliver(sk, p->skb2); if (val < 0) { netlink_overrun(sk); if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; p->skb2 = NULL; } out: sock_put(sk); } int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; struct sock *sk; skb = netlink_trim(skb, allocation); info.exclude_sk = ssk; info.net = net; info.portid = portid; info.group = group; info.failure = 0; info.delivery_failure = 0; info.congested = 0; info.delivered = 0; info.allocation = allocation; info.skb = skb; info.skb2 = NULL; info.tx_filter = filter; info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ netlink_lock_table(); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info); consume_skb(skb); netlink_unlock_table(); if (info.delivery_failure) { kfree_skb(info.skb2); return -ENOBUFS; } consume_skb(info.skb2); if (info.delivered) { if (info.congested && gfpflags_allow_blocking(allocation)) yield(); return 0; } return -ESRCH; } EXPORT_SYMBOL(netlink_broadcast_filtered); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation) { return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, NULL, NULL); } EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { struct sock *exclude_sk; u32 portid; u32 group; int code; }; static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int ret = 0; if (sk == p->exclude_sk) goto out; if (!net_eq(sock_net(sk), sock_net(p->exclude_sk))) goto out; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) goto out; if (p->code == ENOBUFS && nlk_test_bit(RECV_NO_ENOBUFS, sk)) { ret = 1; goto out; } WRITE_ONCE(sk->sk_err, p->code); sk_error_report(sk); out: return ret; } /** * netlink_set_err - report error to broadcast listeners * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() * @portid: the PORTID of a process that we want to skip (if any) * @group: the broadcast group that will notice the error * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_NO_ENOBUFS socket option. 
*/ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { struct netlink_set_err_data info; unsigned long flags; struct sock *sk; int ret = 0; info.exclude_sk = ssk; info.portid = portid; info.group = group; /* sk->sk_err wants a positive error value */ info.code = -code; read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) ret += do_one_set_err(sk, &info); read_unlock_irqrestore(&nl_table_lock, flags); return ret; } EXPORT_SYMBOL(netlink_set_err); /* must be called with netlink table grabbed */ static void netlink_update_socket_mc(struct netlink_sock *nlk, unsigned int group, int is_new) { int old, new = !!is_new, subscriptions; old = test_bit(group - 1, nlk->groups); subscriptions = nlk->subscriptions - old + new; __assign_bit(group - 1, nlk->groups, new); netlink_update_subscriptions(&nlk->sk, subscriptions); netlink_update_listeners(&nlk->sk); } static int netlink_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int val = 0; int nr = -1; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (optlen >= sizeof(int) && copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { case NETLINK_PKTINFO: nr = NETLINK_F_RECV_PKTINFO; break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { int err; if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; if (!val || val - 1 >= nlk->ngroups) return -EINVAL; if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) { err = nlk->netlink_bind(sock_net(sk), val); if (err) return err; } netlink_table_grab(); netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) nlk->netlink_unbind(sock_net(sk), val); break; } case NETLINK_BROADCAST_ERROR: nr = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: assign_bit(NETLINK_F_RECV_NO_ENOBUFS, &nlk->flags, val); if (val) { clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } break; case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; nr = NETLINK_F_LISTEN_ALL_NSID; break; case NETLINK_CAP_ACK: nr = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: nr = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: nr = NETLINK_F_STRICT_CHK; break; default: return -ENOPROTOOPT; } if (nr >= 0) assign_bit(nr, &nlk->flags, val); return 0; } static int netlink_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int flag; int len, val; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case NETLINK_PKTINFO: flag = NETLINK_F_RECV_PKTINFO; break; case NETLINK_BROADCAST_ERROR: flag = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: flag = NETLINK_F_RECV_NO_ENOBUFS; break; case NETLINK_LIST_MEMBERSHIPS: { int pos, idx, shift, err = 0; netlink_lock_table(); for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { if (len - pos < sizeof(u32)) break; idx = pos / sizeof(unsigned long); shift = (pos % sizeof(unsigned long)) * 8; if (put_user((u32)(nlk->groups[idx] >> shift), (u32 __user *)(optval + pos))) { err = -EFAULT; break; } } if 
(put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen)) err = -EFAULT; netlink_unlock_table(); return err; } case NETLINK_CAP_ACK: flag = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: flag = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: flag = NETLINK_F_STRICT_CHK; break; default: return -ENOPROTOOPT; } if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = test_bit(flag, &nlk->flags); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct nl_pktinfo info; info.group = NETLINK_CB(skb).dst_group; put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { if (!NETLINK_CB(skb).nsid_is_set) return; put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), &NETLINK_CB(skb).nsid); } static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); u32 dst_portid; u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; u32 netlink_skb_flags = 0; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (len == 0) { pr_warn_once("Zero length message leads to an empty skb\n"); return -ENODATA; } err = scm_send(sock, msg, &scm, true); if (err < 0) return err; if (msg->msg_namelen) { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_nl)) goto out; if (addr->nl_family != AF_NETLINK) goto out; dst_portid = addr->nl_pid; dst_group = ffs(addr->nl_groups); err = -EPERM; if ((dst_group || dst_portid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) goto out; netlink_skb_flags |= NETLINK_SKB_DST; } else { /* Paired with WRITE_ONCE() in netlink_connect() */ dst_portid = READ_ONCE(nlk->dst_portid); dst_group = READ_ONCE(nlk->dst_group); } /* Paired with WRITE_ONCE() in netlink_insert() */ if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; } else { /* Ensure nlk is hashed and visible. */ smp_rmb(); } err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; skb = netlink_alloc_large_skb(len, dst_group); if (skb == NULL) goto out; NETLINK_CB(skb).portid = nlk->portid; NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).creds = scm.creds; NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { kfree_skb(skb); goto out; } err = security_netlink_send(sk, skb); if (err) { kfree_skb(skb); goto out; } if (dst_group) { refcount_inc(&skb->users); netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL); } err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT); out: scm_destroy(&scm); return err; } static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct scm_cookie scm; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); size_t copied, max_recvmsg_len; struct sk_buff *skb, *data_skb; int err, ret; if (flags & MSG_OOB) return -EOPNOTSUPP; copied = 0; skb = skb_recv_datagram(sk, flags, &err); if (skb == NULL) goto out; data_skb = skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES if (unlikely(skb_shinfo(skb)->frag_list)) { /* * If this skb has a frag_list, then here that means that we * will have to use the frag_list skb's data for compat tasks * and the regular skb's data for normal (non-compat) tasks. 
* * If we need to send the compat skb, assign it to the * 'data_skb' variable so that it will be used below for data * copying. We keep 'skb' for everything else, including * freeing both later. */ if (flags & MSG_CMSG_COMPAT) data_skb = skb_shinfo(skb)->frag_list; } #endif /* Record the max length of recvmsg() calls for future allocations */ max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len); max_recvmsg_len = min_t(size_t, max_recvmsg_len, SKB_WITH_OVERHEAD(32768)); WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len); copied = data_skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).portid; addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); msg->msg_namelen = sizeof(*addr); } if (nlk_test_bit(RECV_PKTINFO, sk)) netlink_cmsg_recv_pktinfo(msg, skb); if (nlk_test_bit(LISTEN_ALL_NSID, sk)) netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); if (flags & MSG_TRUNC) copied = data_skb->len; skb_free_datagram(sk, skb); if (READ_ONCE(nlk->cb_running) && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { WRITE_ONCE(sk->sk_err, -ret); sk_error_report(sk); } } scm_recv(sock, msg, &scm, flags); out: netlink_rcv_wake(sk); return err ? : copied; } static void netlink_data_ready(struct sock *sk) { BUG(); } /* * We export these functions to other modules. They provide a * complete set of kernel non-blocking support for message * queueing. */ struct sock * __netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg) { struct socket *sock; struct sock *sk; struct netlink_sock *nlk; struct listeners *listeners = NULL; struct mutex *cb_mutex = cfg ? 
cfg->cb_mutex : NULL; unsigned int groups; BUG_ON(!nl_table); if (unit < 0 || unit >= MAX_LINKS) return NULL; if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0) goto out_sock_release_nosk; sk = sock->sk; if (!cfg || cfg->groups < 32) groups = 32; else groups = cfg->groups; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; sk->sk_data_ready = netlink_data_ready; if (cfg && cfg->input) nlk_sk(sk)->netlink_rcv = cfg->input; if (netlink_insert(sk, 0)) goto out_sock_release; nlk = nlk_sk(sk); set_bit(NETLINK_F_KERNEL_SOCKET, &nlk->flags); netlink_table_grab(); if (!nl_table[unit].registered) { nl_table[unit].groups = groups; rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].unbind = cfg->unbind; nl_table[unit].release = cfg->release; nl_table[unit].flags = cfg->flags; } nl_table[unit].registered = 1; } else { kfree(listeners); nl_table[unit].registered++; } netlink_table_ungrab(); return sk; out_sock_release: kfree(listeners); netlink_kernel_release(sk); return NULL; out_sock_release_nosk: sock_release(sock); return NULL; } EXPORT_SYMBOL(__netlink_kernel_create); void netlink_kernel_release(struct sock *sk) { if (sk == NULL || sk->sk_socket == NULL) return; sock_release(sk->sk_socket); } EXPORT_SYMBOL(netlink_kernel_release); int __netlink_change_ngroups(struct sock *sk, unsigned int groups) { struct listeners *new, *old; struct netlink_table *tbl = &nl_table[sk->sk_protocol]; if (groups < 32) groups = 32; if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); if (!new) return -ENOMEM; old = nl_deref_protected(tbl->listeners); memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); rcu_assign_pointer(tbl->listeners, new); kfree_rcu(old, rcu); } tbl->groups = groups; return 0; } /** * netlink_change_ngroups - change number of multicast groups * * This changes the number of multicast groups that are available * on a certain netlink family. Note that it is not possible to * change the number of groups to below 32. Also note that it does * not implicitly call netlink_clear_multicast_users() when the * number of groups is reduced. * * @sk: The kernel netlink socket, as returned by netlink_kernel_create(). * @groups: The new number of groups. */ int netlink_change_ngroups(struct sock *sk, unsigned int groups) { int err; netlink_table_grab(); err = __netlink_change_ngroups(sk, groups); netlink_table_ungrab(); return err; } void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) { struct sock *sk; struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; sk_for_each_bound(sk, &tbl->mc_list) netlink_update_socket_mc(nlk_sk(sk), group, 0); } struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags) { struct nlmsghdr *nlh; int size = nlmsg_msg_size(len); nlh = skb_put(skb, NLMSG_ALIGN(size)); nlh->nlmsg_type = type; nlh->nlmsg_len = size; nlh->nlmsg_flags = flags; nlh->nlmsg_pid = portid; nlh->nlmsg_seq = seq; if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0) memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size); return nlh; } EXPORT_SYMBOL(__nlmsg_put); /* * It looks a bit ugly. * It would be better to create kernel thread. 
*/ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, struct netlink_callback *cb, struct netlink_ext_ack *extack) { struct nlmsghdr *nlh; nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), NLM_F_MULTI | cb->answer_flags); if (WARN_ON(!nlh)) return -ENOBUFS; nl_dump_check_consistent(cb, nlh); memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); if (extack->_msg && test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) { nlh->nlmsg_flags |= NLM_F_ACK_TLVS; if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)) nlmsg_end(skb, nlh); } return 0; } static int netlink_dump(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; size_t max_recvmsg_len; struct module *module; int err = -ENOBUFS; int alloc_min_size; int alloc_size; mutex_lock(nlk->cb_mutex); if (!nlk->cb_running) { err = -EINVAL; goto errout_skb; } if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; /* NLMSG_GOODSIZE is small to avoid high order allocations being * required, but it makes sense to _attempt_ a 16K bytes allocation * to reduce number of system calls on dump operations, if user * ever provided a big enough buffer. */ cb = &nlk->cb; alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len); if (alloc_min_size < max_recvmsg_len) { alloc_size = max_recvmsg_len; skb = alloc_skb(alloc_size, (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); } if (!skb) { alloc_size = alloc_min_size; skb = alloc_skb(alloc_size, GFP_KERNEL); } if (!skb) goto errout_skb; /* Trim skb to allocated size. User is expected to provide buffer as * large as max(min_dump_alloc, 16KiB (mac_recvmsg_len capped at * netlink_recvmsg())). dump will pack as many smaller messages as * could fit within the allocated skb. skb is typically allocated * with larger space than required (could be as much as near 2x the * requested size with align to next power of 2 approach). Allowing * dump to use the excess space makes it difficult for a user to have a * reasonable static buffer based on the expected largest dump of a * single netdev. The outcome is MSG_TRUNC error. */ skb_reserve(skb, skb_tailroom(skb) - alloc_size); /* Make sure malicious BPF programs can not read unitialized memory * from skb->head -> skb->data */ skb_reset_network_header(skb); skb_reset_mac_header(skb); netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { cb->extack = &extack; nlk->dump_done_errno = cb->dump(skb, cb); cb->extack = NULL; } if (nlk->dump_done_errno > 0 || skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { mutex_unlock(nlk->cb_mutex); if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); return 0; } if (netlink_dump_done(nlk, skb, cb, &extack)) goto errout_skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES /* frag_list skb's data is used for compat tasks * and the regular skb's data for normal (non-compat) tasks. * See netlink_recvmsg(). 
*/ if (unlikely(skb_shinfo(skb)->frag_list)) { if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack)) goto errout_skb; } #endif if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); if (cb->done) cb->done(cb); WRITE_ONCE(nlk->cb_running, false); module = cb->module; skb = cb->skb; mutex_unlock(nlk->cb_mutex); module_put(module); consume_skb(skb); return 0; errout_skb: mutex_unlock(nlk->cb_mutex); kfree_skb(skb); return err; } int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { struct netlink_callback *cb; struct netlink_sock *nlk; struct sock *sk; int ret; refcount_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { ret = -ECONNREFUSED; goto error_free; } nlk = nlk_sk(sk); mutex_lock(nlk->cb_mutex); /* A dump is in progress... */ if (nlk->cb_running) { ret = -EBUSY; goto error_unlock; } /* add reference of module which cb->dump belongs to */ if (!try_module_get(control->module)) { ret = -EPROTONOSUPPORT; goto error_unlock; } cb = &nlk->cb; memset(cb, 0, sizeof(*cb)); cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; cb->data = control->data; cb->module = control->module; cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); if (control->start) { cb->extack = control->extack; ret = control->start(cb); cb->extack = NULL; if (ret) goto error_put; } WRITE_ONCE(nlk->cb_running, true); nlk->dump_done_errno = INT_MAX; mutex_unlock(nlk->cb_mutex); ret = netlink_dump(sk); sock_put(sk); if (ret) return ret; /* We successfully started a dump, by returning -EINTR we * signal not to send ACK even if it was requested. 
*/ return -EINTR; error_put: module_put(control->module); error_unlock: sock_put(sk); mutex_unlock(nlk->cb_mutex); error_free: kfree_skb(skb); return ret; } EXPORT_SYMBOL(__netlink_dump_start); static size_t netlink_ack_tlv_len(struct netlink_sock *nlk, int err, const struct netlink_ext_ack *extack) { size_t tlvlen; if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) return 0; tlvlen = 0; if (extack->_msg) tlvlen += nla_total_size(strlen(extack->_msg) + 1); if (extack->cookie_len) tlvlen += nla_total_size(extack->cookie_len); /* Following attributes are only reported as error (not warning) */ if (!err) return tlvlen; if (extack->bad_attr) tlvlen += nla_total_size(sizeof(u32)); if (extack->policy) tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); if (extack->miss_type) tlvlen += nla_total_size(sizeof(u32)); if (extack->miss_nest) tlvlen += nla_total_size(sizeof(u32)); return tlvlen; } static void netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { if (extack->_msg) WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); if (extack->cookie_len) WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, extack->cookie_len, extack->cookie)); if (!err) return; if (extack->bad_attr && !WARN_ON((u8 *)extack->bad_attr < in_skb->data || (u8 *)extack->bad_attr >= in_skb->data + in_skb->len)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, (u8 *)extack->bad_attr - (u8 *)nlh)); if (extack->policy) netlink_policy_dump_write_attr(skb, extack->policy, NLMSGERR_ATTR_POLICY); if (extack->miss_type) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE, extack->miss_type)); if (extack->miss_nest && !WARN_ON((u8 *)extack->miss_nest < in_skb->data || (u8 *)extack->miss_nest > in_skb->data + in_skb->len)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST, (u8 *)extack->miss_nest - (u8 *)nlh)); } void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { struct sk_buff *skb; struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); unsigned int flags = 0; size_t tlvlen; /* Error messages get the original request appened, unless the user * requests to cap the error message, and get extra error data if * requested. 
*/ if (err && !test_bit(NETLINK_F_CAP_ACK, &nlk->flags)) payload += nlmsg_len(nlh); else flags |= NLM_F_CAPPED; tlvlen = netlink_ack_tlv_len(nlk, err, extack); if (tlvlen) flags |= NLM_F_ACK_TLVS; skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) goto err_skb; rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, sizeof(*errmsg), flags); if (!rep) goto err_bad_put; errmsg = nlmsg_data(rep); errmsg->error = err; errmsg->msg = *nlh; if (!(flags & NLM_F_CAPPED)) { if (!nlmsg_append(skb, nlmsg_len(nlh))) goto err_bad_put; memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh), nlmsg_len(nlh)); } if (tlvlen) netlink_ack_tlv_fill(in_skb, skb, nlh, err, extack); nlmsg_end(skb, rep); nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid); return; err_bad_put: nlmsg_free(skb); err_skb: WRITE_ONCE(NETLINK_CB(in_skb).sk->sk_err, ENOBUFS); sk_error_report(NETLINK_CB(in_skb).sk); } EXPORT_SYMBOL(netlink_ack); int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)) { struct netlink_ext_ack extack; struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { int msglen; memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return 0; /* Only requests are handled by the kernel */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) goto ack; /* Skip control messages */ if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) netlink_ack(skb, nlh, err, &extack); skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } return 0; } EXPORT_SYMBOL(netlink_rcv_skb); /** * nlmsg_notify - send a notification netlink message * @sk: netlink socket to use * @skb: notification message * @portid: destination netlink portid for reports or 0 * @group: destination multicast group or 0 * @report: 1 to report back, 0 to disable * @flags: allocation flags */ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags) { int err = 0; if (group) { int exclude_portid = 0; if (report) { refcount_inc(&skb->users); exclude_portid = portid; } /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); if (err == -ESRCH) err = 0; } if (report) { int err2; err2 = nlmsg_unicast(sk, skb, portid); if (!err) err = err2; } return err; } EXPORT_SYMBOL(nlmsg_notify); #ifdef CONFIG_PROC_FS struct nl_seq_iter { struct seq_net_private p; struct rhashtable_iter hti; int link; }; static void netlink_walk_start(struct nl_seq_iter *iter) { rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti); rhashtable_walk_start(&iter->hti); } static void netlink_walk_stop(struct nl_seq_iter *iter) { rhashtable_walk_stop(&iter->hti); rhashtable_walk_exit(&iter->hti); } static void *__netlink_seq_next(struct seq_file *seq) { struct nl_seq_iter *iter = seq->private; struct netlink_sock *nlk; do { for (;;) { nlk = rhashtable_walk_next(&iter->hti); if (IS_ERR(nlk)) { if (PTR_ERR(nlk) == -EAGAIN) continue; return nlk; } if (nlk) break; netlink_walk_stop(iter); if (++iter->link >= MAX_LINKS) return NULL; netlink_walk_start(iter); } } while (sock_net(&nlk->sk) != seq_file_net(seq)); return nlk; } static void *netlink_seq_start(struct seq_file *seq, loff_t *posp) __acquires(RCU) { 
struct nl_seq_iter *iter = seq->private; void *obj = SEQ_START_TOKEN; loff_t pos; iter->link = 0; netlink_walk_start(iter); for (pos = *posp; pos && obj && !IS_ERR(obj); pos--) obj = __netlink_seq_next(seq); return obj; } static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return __netlink_seq_next(seq); } static void netlink_native_seq_stop(struct seq_file *seq, void *v) { struct nl_seq_iter *iter = seq->private; if (iter->link >= MAX_LINKS) return; netlink_walk_stop(iter); } static int netlink_native_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk Eth Pid Groups " "Rmem Wmem Dump Locks Drops Inode\n"); } else { struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8lu\n", s, s->sk_protocol, nlk->portid, nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), atomic_read(&s->sk_drops), sock_i_ino(s) ); } return 0; } #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__netlink { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct netlink_sock *, sk); }; DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk) static int netlink_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__netlink ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.sk = nlk_sk((struct sock *)v); return bpf_iter_run_prog(prog, &ctx); } static int netlink_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return netlink_native_seq_show(seq, v); if (v != SEQ_START_TOKEN) return netlink_prog_seq_show(prog, &meta, v); return 0; } static void netlink_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)netlink_prog_seq_show(prog, &meta, v); } netlink_native_seq_stop(seq, v); } #else static int netlink_seq_show(struct seq_file *seq, void *v) { return netlink_native_seq_show(seq, v); } static void netlink_seq_stop(struct seq_file *seq, void *v) { netlink_native_seq_stop(seq, v); } #endif static const struct seq_operations netlink_seq_ops = { .start = netlink_seq_start, .next = netlink_seq_next, .stop = netlink_seq_stop, .show = netlink_seq_show, }; #endif int netlink_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_register_notifier); int netlink_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_unregister_notifier); static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, .bind = netlink_bind, .connect = netlink_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, .poll = datagram_poll, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = netlink_setsockopt, .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap = sock_no_mmap, }; static const struct net_proto_family netlink_family_ops = { .family = PF_NETLINK, .create = netlink_create, .owner = THIS_MODULE, /* for consistency 8) */ }; static int __net_init netlink_net_init(struct net *net) { 
#ifdef CONFIG_PROC_FS if (!proc_create_net("netlink", 0, net->proc_net, &netlink_seq_ops, sizeof(struct nl_seq_iter))) return -ENOMEM; #endif return 0; } static void __net_exit netlink_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("netlink", net->proc_net); #endif } static void __init netlink_add_usersock_entry(void) { struct listeners *listeners; int groups = 32; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) panic("netlink_add_usersock_entry: Cannot allocate listeners\n"); netlink_table_grab(); nl_table[NETLINK_USERSOCK].groups = groups; rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND; netlink_table_ungrab(); } static struct pernet_operations __net_initdata netlink_net_ops = { .init = netlink_net_init, .exit = netlink_net_exit, }; static inline u32 netlink_hash(const void *data, u32 len, u32 seed) { const struct netlink_sock *nlk = data; struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); } static const struct rhashtable_params netlink_rhashtable_params = { .head_offset = offsetof(struct netlink_sock, node), .key_len = netlink_compare_arg_len, .obj_hashfn = netlink_hash, .obj_cmpfn = netlink_compare, .automatic_shrinking = true, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) BTF_ID_LIST(btf_netlink_sock_id) BTF_ID(struct, netlink_sock) static const struct bpf_iter_seq_info netlink_seq_info = { .seq_ops = &netlink_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), }; static struct bpf_iter_reg netlink_reg_info = { .target = "netlink", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__netlink, sk), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &netlink_seq_info, }; static int __init bpf_iter_register(void) { netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id; return bpf_iter_reg_target(&netlink_reg_info); } #endif static int __init netlink_proto_init(void) { int i; int err = proto_register(&netlink_proto, 0); if (err != 0) goto out; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) err = bpf_iter_register(); if (err) goto out; #endif BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); if (!nl_table) goto panic; for (i = 0; i < MAX_LINKS; i++) { if (rhashtable_init(&nl_table[i].hash, &netlink_rhashtable_params) < 0) { while (--i > 0) rhashtable_destroy(&nl_table[i].hash); kfree(nl_table); goto panic; } } netlink_add_usersock_entry(); sock_register(&netlink_family_ops); register_pernet_subsys(&netlink_net_ops); register_pernet_subsys(&netlink_tap_net_ops); /* The netlink device handler may be needed early. */ rtnetlink_init(); out: return err; panic: panic("netlink_init: Cannot allocate nl_table\n"); } core_initcall(netlink_proto_init);
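The exported entry points above (__netlink_kernel_create()/netlink_kernel_release(), netlink_unicast(), netlink_broadcast(), nlmsg_notify()) form the kernel-side API of this file. What follows is a minimal, illustrative sketch -- not part of af_netlink.c -- of how a module might consume that API through the usual netlink_kernel_create() wrapper. The protocol number NETLINK_EXAMPLE, the message type EXAMPLE_MSG_ECHO and the example_* identifiers are hypothetical placeholders; a real user would register its own protocol number (or use generic netlink instead).

/*
 * Illustrative sketch only: create a kernel netlink socket and echo each
 * request back to the sender as a unicast reply.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/string.h>
#include <net/netlink.h>
#include <net/net_namespace.h>

#define NETLINK_EXAMPLE		31	/* hypothetical protocol number, must be < MAX_LINKS */
#define EXAMPLE_MSG_ECHO	(NLMSG_MIN_TYPE + 1)	/* hypothetical message type */

static struct sock *example_nl_sk;

/* .input is invoked from netlink_unicast_kernel() above; the request skb is
 * freed by the caller after this returns, so only the reply is allocated here.
 */
static void example_nl_input(struct sk_buff *skb)
{
	struct nlmsghdr *nlh = nlmsg_hdr(skb);
	struct sk_buff *reply;
	struct nlmsghdr *rep_nlh;
	u32 portid = NETLINK_CB(skb).portid;

	if (!nlmsg_ok(nlh, skb->len) || nlh->nlmsg_type != EXAMPLE_MSG_ECHO)
		return;

	reply = nlmsg_new(nlmsg_len(nlh), GFP_KERNEL);
	if (!reply)
		return;

	rep_nlh = nlmsg_put(reply, 0, nlh->nlmsg_seq, EXAMPLE_MSG_ECHO,
			    nlmsg_len(nlh), 0);
	if (!rep_nlh) {
		nlmsg_free(reply);
		return;
	}
	memcpy(nlmsg_data(rep_nlh), nlmsg_data(nlh), nlmsg_len(nlh));
	nlmsg_end(reply, rep_nlh);

	/* nlmsg_unicast() ends up in netlink_unicast() above and consumes reply */
	nlmsg_unicast(example_nl_sk, reply, portid);
}

static int __init example_nl_init(void)
{
	struct netlink_kernel_cfg cfg = {
		.groups	= 32,
		.input	= example_nl_input,
	};

	example_nl_sk = netlink_kernel_create(&init_net, NETLINK_EXAMPLE, &cfg);
	return example_nl_sk ? 0 : -ENOMEM;
}

static void __exit example_nl_exit(void)
{
	netlink_kernel_release(example_nl_sk);
}

module_init(example_nl_init);
module_exit(example_nl_exit);
MODULE_LICENSE("GPL");

Because netlink_unicast_kernel() calls consume_skb() on the request after the input callback returns, the handler must not free the incoming skb; it only builds and sends its own reply.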
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux IPv6 multicast routing support for BSD pim6sd
 *	Based on net/ipv4/ipmr.c.
 *
 *	(c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr>
 *		LSIIT Laboratory, Strasbourg, France
 *	(c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com>
 *		6WIND, Paris, France
 *	Copyright (C)2007,2008 USAGI/WIDE Project
 *		YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
 */

#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/rhashtable.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <linux/mroute6.h>
#include <linux/pim.h>
#include <net/addrconf.h>
#include <linux/netfilter_ipv6.h>
#include <linux/export.h>
#include <net/ip6_checksum.h>
#include <linux/netconf.h>
#include <net/ip_tunnels.h>
#include <linux/nospec.h>

struct ip6mr_rule {
	struct fib_rule		common;
};

struct ip6mr_result {
	struct mr_table	*mrt;
};

/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */
static DEFINE_SPINLOCK(mrt_lock);

static struct net_device *vif_dev_read(const struct vif_device *vif)
{
	return rcu_dereference(vif->dev);
}

/* Multicast router control variables */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to original Alan's scheme. Hash table of resolved
   entries is changed only in process context and protected
   with weak lock mrt_lock. Queue of unresolved entries is protected
   with strong spinlock mfc_unres_lock.

   In this case data path is free of exclusive locks at all.
 */

static struct kmem_cache *mrt_cachep __read_mostly;

static struct mr_table *ip6mr_new_table(struct net *net, u32 id);
static void ip6mr_free_table(struct mr_table *mrt);

static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
			   struct net_device *dev, struct sk_buff *skb,
			   struct mfc6_cache *cache);
static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt,
			      mifi_t mifi, int assert);
static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
			      int cmd);
static void mrt6msg_netlink_event(const struct mr_table *mrt,
				  struct sk_buff *pkt);
static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack);
static int ip6mr_rtm_dumproute(struct sk_buff *skb,
			       struct netlink_callback *cb);
static void mroute_clean_tables(struct mr_table *mrt, int flags);
static void ipmr_expire_process(struct timer_list *t);

#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
#define ip6mr_for_each_table(mrt, net) \
	list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \
				lockdep_rtnl_is_held() || \
				list_empty(&net->ipv6.mr6_tables))

static struct mr_table *ip6mr_mr_table_iter(struct net *net,
					    struct mr_table *mrt)
{
	struct mr_table *ret;

	if (!mrt)
		ret = list_entry_rcu(net->ipv6.mr6_tables.next,
				     struct mr_table, list);
	else
		ret = list_entry_rcu(mrt->list.next,
				     struct mr_table, list);

	if (&ret->list == &net->ipv6.mr6_tables)
		return NULL;
	return ret;
}

static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
{
	struct mr_table *mrt;

	ip6mr_for_each_table(mrt, net) {
		if (mrt->id == id)
			return mrt;
	}
	return NULL;
}

static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
			    struct mr_table **mrt)
{
	int err;
	struct ip6mr_result res;
	struct fib_lookup_arg arg = {
		.result = &res,
		.flags = FIB_LOOKUP_NOREF,
	};

	/* update flow if oif or iif point to device enslaved to l3mdev */
	l3mdev_update_flow(net, flowi6_to_flowi(flp6));

	err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
			       flowi6_to_flowi(flp6), 0, &arg);
	if (err < 0)
		return err;
	*mrt = res.mrt;
	return 0;
}

static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp,
			     int flags, struct fib_lookup_arg *arg)
{
	struct ip6mr_result *res = arg->result;
	struct mr_table *mrt;

	switch (rule->action) {
	case FR_ACT_TO_TBL:
		break;
	case FR_ACT_UNREACHABLE:
		return -ENETUNREACH;
	case FR_ACT_PROHIBIT:
		return -EACCES;
	case FR_ACT_BLACKHOLE:
	default:
		return -EINVAL;
	}

	arg->table = fib_rule_get_table(rule, arg);

	mrt = ip6mr_get_table(rule->fr_net, arg->table);
	if (!mrt)
		return -EAGAIN;
	res->mrt = mrt;
	return 0;
}

static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags)
{
	return 1;
}

static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
				struct fib_rule_hdr *frh, struct nlattr **tb,
				struct netlink_ext_ack *extack)
{
	return 0;
}

static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
			      struct nlattr **tb)
{
	return 1;
}

static int
ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { frh->dst_len = 0; frh->src_len = 0; frh->tos = 0; return 0; } static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .family = RTNL_FAMILY_IP6MR, .rule_size = sizeof(struct ip6mr_rule), .addr_size = sizeof(struct in6_addr), .action = ip6mr_rule_action, .match = ip6mr_rule_match, .configure = ip6mr_rule_configure, .compare = ip6mr_rule_compare, .fill = ip6mr_rule_fill, .nlgroup = RTNLGRP_IPV6_RULE, .owner = THIS_MODULE, }; static int __net_init ip6mr_rules_init(struct net *net) { struct fib_rules_ops *ops; struct mr_table *mrt; int err; ops = fib_rules_register(&ip6mr_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); INIT_LIST_HEAD(&net->ipv6.mr6_tables); mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) { err = PTR_ERR(mrt); goto err1; } err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT); if (err < 0) goto err2; net->ipv6.mr6_rules_ops = ops; return 0; err2: rtnl_lock(); ip6mr_free_table(mrt); rtnl_unlock(); err1: fib_rules_unregister(ops); return err; } static void __net_exit ip6mr_rules_exit(struct net *net) { struct mr_table *mrt, *next; ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { list_del(&mrt->list); ip6mr_free_table(mrt); } fib_rules_unregister(net->ipv6.mr6_rules_ops); } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } static unsigned int ip6mr_rules_seq_read(struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); } bool ip6mr_rule_default(const struct fib_rule *rule) { return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL && rule->table == RT6_TABLE_DFLT && !rule->l3mdev; } EXPORT_SYMBOL(ip6mr_rule_default); #else #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) return net->ipv6.mrt6; return NULL; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { return net->ipv6.mrt6; } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { *mrt = net->ipv6.mrt6; return 0; } static int __net_init ip6mr_rules_init(struct net *net) { struct mr_table *mrt; mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) return PTR_ERR(mrt); net->ipv6.mrt6 = mrt; return 0; } static void __net_exit ip6mr_rules_exit(struct net *net) { ASSERT_RTNL(); ip6mr_free_table(net->ipv6.mrt6); net->ipv6.mrt6 = NULL; } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static unsigned int ip6mr_rules_seq_read(struct net *net) { return 0; } #endif static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct mfc6_cache_cmp_arg *cmparg = arg->key; struct mfc6_cache *c = (struct mfc6_cache *)ptr; return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) || !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin); } static const struct rhashtable_params ip6mr_rht_params = { .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc6_cache, cmparg), .key_len = sizeof(struct mfc6_cache_cmp_arg), .nelem_hint = 3, .obj_cmpfn = ip6mr_hash_cmp, .automatic_shrinking = true, }; static void ip6mr_new_table_set(struct mr_table *mrt, struct net *net) { #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES 
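	/* With CONFIG_IPV6_MROUTE_MULTIPLE_TABLES, a freshly allocated table
	 * is linked into the per-netns list so ip6mr_get_table() and the
	 * fib-rule lookups above can find it later.
	 */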
list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); #endif } static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = { .mf6c_origin = IN6ADDR_ANY_INIT, .mf6c_mcastgrp = IN6ADDR_ANY_INIT, }; static struct mr_table_ops ip6mr_mr_table_ops = { .rht_params = &ip6mr_rht_params, .cmparg_any = &ip6mr_mr_table_ops_cmparg_any, }; static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { struct mr_table *mrt; mrt = ip6mr_get_table(net, id); if (mrt) return mrt; return mr_table_alloc(net, id, &ip6mr_mr_table_ops, ipmr_expire_process, ip6mr_new_table_set); } static void ip6mr_free_table(struct mr_table *mrt) { timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } #ifdef CONFIG_PROC_FS /* The /proc interfaces to multicast routing * /proc/ip6_mr_cache /proc/ip6_mr_vif */ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return ERR_PTR(-ENOENT); iter->mrt = mrt; rcu_read_lock(); return mr_vif_seq_start(seq, pos); } static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) { struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); } else { const struct vif_device *vif = v; const struct net_device *vif_dev; const char *name; vif_dev = vif_dev_read(vif); name = vif_dev ? vif_dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags); } return 0; } static const struct seq_operations ip6mr_vif_seq_ops = { .start = ip6mr_vif_seq_start, .next = mr_vif_seq_next, .stop = ip6mr_vif_seq_stop, .show = ip6mr_vif_seq_show, }; static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return ERR_PTR(-ENOENT); return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) { int n; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Group " "Origin " "Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc6_cache *mfc = v; const struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; seq_printf(seq, "%pI6 %pI6 %-3hd", &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", mfc->_c.mfc_un.res.pkt, mfc->_c.mfc_un.res.bytes, mfc->_c.mfc_un.res.wrong_if); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", n, mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain * pkt, bytes and wrong_if values */ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); } seq_putc(seq, '\n'); } return 0; } static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, .next = mr_mfc_seq_next, .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; #endif #ifdef CONFIG_IPV6_PIMSM_V2 static int pim6_rcv(struct sk_buff *skb) { struct pimreghdr *pim; 
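	/* IPPROTO_PIM handler: validate the PIM REGISTER header and checksum,
	 * then decapsulate the inner IPv6 multicast packet and re-inject it on
	 * the pim6reg device, so it is forwarded as if it had arrived on the
	 * register vif.
	 */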
struct ipv6hdr *encap; struct net_device *reg_dev = NULL; struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; int reg_vif_num; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) || (pim->flags & PIM_NULL_REGISTER) || (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, sizeof(*pim), IPPROTO_PIM, csum_partial((void *)pim, sizeof(*pim), 0)) && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; /* check if the inner packet is destined to mcast group */ encap = (struct ipv6hdr *)(skb_transport_header(skb) + sizeof(*pim)); if (!ipv6_addr_is_multicast(&encap->daddr) || encap->payload_len == 0 || ntohs(encap->payload_len) + sizeof(*pim) > skb->len) goto drop; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto drop; /* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */ reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num); if (reg_vif_num >= 0) reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]); if (!reg_dev) goto drop; skb->mac_header = skb->network_header; skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_NONE; skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); return 0; drop: kfree_skb(skb); return 0; } static const struct inet6_protocol pim6_protocol = { .handler = pim6_rcv, }; /* Service routines creating virtual interfaces: PIMREG */ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .flowi6_iif = skb->skb_iif ? 
: LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; if (!pskb_inet_may_pull(skb)) goto tx_err; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto tx_err; DEV_STATS_ADD(dev, tx_bytes, skb->len); DEV_STATS_INC(dev, tx_packets); rcu_read_lock(); ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), MRT6MSG_WHOLEPKT); rcu_read_unlock(); kfree_skb(skb); return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) { return 0; } static const struct net_device_ops reg_vif_netdev_ops = { .ndo_start_xmit = reg_vif_xmit, .ndo_get_iflink = reg_vif_get_iflink, }; static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8; dev->flags = IFF_NOARP; dev->netdev_ops = &reg_vif_netdev_ops; dev->needs_free_netdev = true; dev->features |= NETIF_F_NETNS_LOCAL; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; char name[IFNAMSIZ]; if (mrt->id == RT6_TABLE_DFLT) sprintf(name, "pim6reg"); else sprintf(name, "pim6reg%u", mrt->id); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (!dev) return NULL; dev_net_set(dev, net); if (register_netdevice(dev)) { free_netdev(dev); return NULL; } if (dev_open(dev, NULL)) goto failure; dev_hold(dev); return dev; failure: unregister_netdevice(dev); return NULL; } #endif static int call_ip6mr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, mifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type, vif, vif_dev, vif_index, tb_id, &net->ipv6.ipmr_seq); } static int call_ip6mr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc6_cache *mfc, u32 tb_id) { return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type, &mfc->_c, tb_id, &net->ipv6.ipmr_seq); } /* Delete a VIF entry */ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { struct vif_device *v; struct net_device *dev; struct inet6_dev *in6_dev; if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; v = &mrt->vif_table[vifi]; dev = rtnl_dereference(v->dev); if (!dev) return -EADDRNOTAVAIL; call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_VIF_DEL, v, dev, vifi, mrt->id); spin_lock(&mrt_lock); RCU_INIT_POINTER(v->dev, NULL); #ifdef CONFIG_IPV6_PIMSM_V2 if (vifi == mrt->mroute_reg_vif_num) { /* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */ WRITE_ONCE(mrt->mroute_reg_vif_num, -1); } #endif if (vifi + 1 == mrt->maxvif) { int tmp; for (tmp = vifi - 1; tmp >= 0; tmp--) { if (VIF_EXISTS(mrt, tmp)) break; } WRITE_ONCE(mrt->maxvif, tmp + 1); } spin_unlock(&mrt_lock); dev_set_allmulti(dev, -1); in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_dec(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } if ((v->flags & MIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); netdev_put(dev, &v->dev_tracker); return 0; } static inline void ip6mr_cache_free_rcu(struct rcu_head *head) { struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c); } static inline void ip6mr_cache_free(struct mfc6_cache *c) { call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu); } /* Destroy an unresolved cache entry, killing queued skbs and reporting error to 
netlink readers. */ static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; atomic_dec(&mrt->cache_resolve_queue_len); while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT; rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else kfree_skb(skb); } ip6mr_cache_free(c); } /* Timer process for all the unresolved queue. */ static void ipmr_do_expire_process(struct mr_table *mrt) { unsigned long now = jiffies; unsigned long expires = 10 * HZ; struct mr_mfc *c, *next; list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { /* not yet... */ unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; continue; } list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); } static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); return; } if (!list_empty(&mrt->mfc_unres_queue)) ipmr_do_expire_process(mrt); spin_unlock(&mfc_unres_lock); } /* Fill oifs list. It is called under locked mrt_lock. */ static void ip6mr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; cache->mfc_un.res.minvif = MAXMIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXMIFS); for (vifi = 0; vifi < mrt->maxvif; vifi++) { if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) cache->mfc_un.res.minvif = vifi; if (cache->mfc_un.res.maxvif <= vifi) cache->mfc_un.res.maxvif = vifi + 1; } } cache->mfc_un.res.lastuse = jiffies; } static int mif6_add(struct net *net, struct mr_table *mrt, struct mif6ctl *vifc, int mrtsock) { int vifi = vifc->mif6c_mifi; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct inet6_dev *in6_dev; int err; /* Is vif busy ? */ if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->mif6c_flags) { #ifdef CONFIG_IPV6_PIMSM_V2 case MIFF_REGISTER: /* * Special Purpose VIF in PIM * All the packets will be sent to the daemon */ if (mrt->mroute_reg_vif_num >= 0) return -EADDRINUSE; dev = ip6mr_reg_vif(net, mrt); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); if (err) { unregister_netdevice(dev); dev_put(dev); return err; } break; #endif case 0: dev = dev_get_by_index(net, vifc->mif6c_pifi); if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); if (err) { dev_put(dev); return err; } break; default: return -EINVAL; } in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_inc(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } /* Fill in the VIF structures */ vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold, vifc->mif6c_flags | (!mrtsock ? 
VIFF_STATIC : 0), MIFF_REGISTER); /* And finish update writing critical data */ spin_lock(&mrt_lock); rcu_assign_pointer(v->dev, dev); netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC); #ifdef CONFIG_IPV6_PIMSM_V2 if (v->flags & MIFF_REGISTER) WRITE_ONCE(mrt->mroute_reg_vif_num, vifi); #endif if (vifi + 1 > mrt->maxvif) WRITE_ONCE(mrt->maxvif, vifi + 1); spin_unlock(&mrt_lock); call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev, vifi, mrt->id); return 0; } static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, struct in6_addr *mcastgrp, mifi_t mifi) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = in6addr_any, .mf6c_mcastgrp = *mcastgrp, }; if (ipv6_addr_any(mcastgrp)) return mr_mfc_find_any_parent(mrt, mifi); return mr_mfc_find_any(mrt, mifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ static struct mfc6_cache * ip6mr_cache_find_parent(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp, int parent) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ static struct mfc6_cache *ip6mr_cache_alloc(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (!c) return NULL; c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->_c.mfc_un.res.minvif = MAXMIFS; c->_c.free = ip6mr_cache_free_rcu; refcount_set(&c->_c.mfc_un.res.refcount, 1); return c; } static struct mfc6_cache *ip6mr_cache_alloc_unres(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (!c) return NULL; skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; return c; } /* * A cache entry has gone into a resolved state from queued */ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc6_cache *uc, struct mfc6_cache *c) { struct sk_buff *skb; /* * Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); if (mr_fill_mroute(mrt, skb, &c->_c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { rcu_read_lock(); ip6_mr_forward(net, mrt, skb->dev, skb, c); rcu_read_unlock(); } } } /* * Bounce a cache query up to pim6sd and netlink. 
* * Called under rcu_read_lock() */ static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert) { struct sock *mroute6_sk; struct sk_buff *skb; struct mrt6msg *msg; int ret; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) +sizeof(*msg)); else #endif skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC); if (!skb) return -ENOBUFS; /* I suppose that internal messages * do not require checksums */ skb->ip_summed = CHECKSUM_UNNECESSARY; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) { /* Ugly, but we have no choice with this interface. Duplicate old header, fix length etc. And all this only to mangle msg->im6_msgtype and to set msg->im6_mbz to "mbz" :-) */ __skb_pull(skb, skb_network_offset(pkt)); skb_push(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; if (assert == MRT6MSG_WRMIFWHOLE) msg->im6_mif = mifi; else msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num); msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb->ip_summed = CHECKSUM_UNNECESSARY; } else #endif { /* * Copy the IP header */ skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr)); /* * Add our header */ skb_put(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; msg->im6_mif = mifi; msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb_dst_set(skb, dst_clone(skb_dst(pkt))); skb->ip_summed = CHECKSUM_UNNECESSARY; } mroute6_sk = rcu_dereference(mrt->mroute_sk); if (!mroute6_sk) { kfree_skb(skb); return -EINVAL; } mrt6msg_netlink_event(mrt, skb); /* Deliver to user space multicast routing algorithms */ ret = sock_queue_rcv_skb(mroute6_sk, skb); if (ret < 0) { net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); kfree_skb(skb); } return ret; } /* Queue a packet for resolution. It gets locked cache entry! 
*/ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { found = true; break; } } if (!found) { /* * Create a new entry if allowable */ c = ip6mr_cache_alloc_unres(); if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); return -ENOBUFS; } /* Fill in the new cache entry */ c->_c.mfc_parent = -1; c->mf6c_origin = ipv6_hdr(skb)->saddr; c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; /* * Reflect first query at pim6sd */ err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ spin_unlock_bh(&mfc_unres_lock); ip6mr_cache_free(c); kfree_skb(skb); return err; } atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); mr6_netlink_event(mrt, c, RTM_NEWROUTE); ipmr_do_expire_process(mrt); } /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { if (dev) { skb->dev = dev; skb->skb_iif = dev->ifindex; } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } spin_unlock_bh(&mfc_unres_lock); return err; } /* * MFC6 cache manipulation by user space */ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, int parent) { struct mfc6_cache *c; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (!c) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params); list_del_rcu(&c->_c.list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, c, mrt->id); mr6_netlink_event(mrt, c, RTM_DELROUTE); mr_cache_put(&c->_c); return 0; } static int ip6mr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; int ct; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; ip6mr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (rcu_access_pointer(v->dev) == dev) mif6_delete(mrt, ct, 1, NULL); } } return NOTIFY_DONE; } static unsigned int ip6mr_seq_read(struct net *net) { ASSERT_RTNL(); return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, ip6mr_mr_table_iter, extack); } static struct notifier_block ip6_mr_notifier = { .notifier_call = ip6mr_device_event }; static const struct fib_notifier_ops ip6mr_notifier_ops_template = { .family = RTNL_FAMILY_IP6MR, .fib_seq_read = ip6mr_seq_read, .fib_dump = ip6mr_dump, .owner = THIS_MODULE, }; static int __net_init ip6mr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv6.ipmr_seq = 0; ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv6.ip6mr_notifier_ops = ops; return 0; } static void __net_exit ip6mr_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv6.ip6mr_notifier_ops); net->ipv6.ip6mr_notifier_ops = NULL; } /* 
Setup for IP multicast routing */ static int __net_init ip6mr_net_init(struct net *net) { int err; err = ip6mr_notifier_init(net); if (err) return err; err = ip6mr_rules_init(net); if (err < 0) goto ip6mr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; if (!proc_create_net("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_seq_ops, sizeof(struct mr_vif_iter))) goto proc_vif_fail; if (!proc_create_net("ip6_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops, sizeof(struct mr_mfc_iter))) goto proc_cache_fail; #endif return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: rtnl_lock(); ip6mr_rules_exit(net); rtnl_unlock(); #endif ip6mr_rules_fail: ip6mr_notifier_exit(net); return err; } static void __net_exit ip6mr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ip6_mr_cache", net->proc_net); remove_proc_entry("ip6_mr_vif", net->proc_net); #endif ip6mr_notifier_exit(net); } static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) ip6mr_rules_exit(net); rtnl_unlock(); } static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, .exit_batch = ip6mr_net_exit_batch, }; int __init ip6_mr_init(void) { int err; mrt_cachep = kmem_cache_create("ip6_mrt_cache", sizeof(struct mfc6_cache), 0, SLAB_HWCACHE_ALIGN, NULL); if (!mrt_cachep) return -ENOMEM; err = register_pernet_subsys(&ip6mr_net_ops); if (err) goto reg_pernet_fail; err = register_netdevice_notifier(&ip6_mr_notifier); if (err) goto reg_notif_fail; #ifdef CONFIG_IPV6_PIMSM_V2 if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) { pr_err("%s: can't add PIM protocol\n", __func__); err = -EAGAIN; goto add_proto_fail; } #endif err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE, ip6mr_rtm_getroute, ip6mr_rtm_dumproute, 0); if (err == 0) return 0; #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); add_proto_fail: unregister_netdevice_notifier(&ip6_mr_notifier); #endif reg_notif_fail: unregister_pernet_subsys(&ip6mr_net_ops); reg_pernet_fail: kmem_cache_destroy(mrt_cachep); return err; } void ip6_mr_cleanup(void) { rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE); #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); #endif unregister_netdevice_notifier(&ip6_mr_notifier); unregister_pernet_subsys(&ip6mr_net_ops); kmem_cache_destroy(mrt_cachep); } static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, struct mf6cctl *mfc, int mrtsock, int parent) { unsigned char ttls[MAXMIFS]; struct mfc6_cache *uc, *c; struct mr_mfc *_uc; bool found; int i, err; if (mfc->mf6cc_parent >= MAXMIFS) return -ENFILE; memset(ttls, 255, MAXMIFS); for (i = 0; i < MAXMIFS; i++) { if (IF_ISSET(i, &mfc->mf6cc_ifset)) ttls[i] = 1; } /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (c) { spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; spin_unlock(&mrt_lock); call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) && !ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) return -EINVAL; c = ip6mr_cache_alloc(); if (!c) return -ENOMEM; 
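	/* No matching (S,G[,parent]) entry was found above, so fill in the
	 * newly allocated cache entry and link it into the rhashtable and the
	 * per-table cache list. Userspace normally reaches this path with
	 * something like (illustrative only, not taken from this file):
	 *
	 *	struct mf6cctl mc = { ... };
	 *	setsockopt(mrouter_fd, IPPROTO_IPV6, MRT6_ADD_MFC,
	 *		   &mc, sizeof(mc));
	 */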
c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ip6mr_rht_params); if (err) { pr_err("ip6mr: rhtable insert error %d\n", err); ip6mr_cache_free(c); return err; } list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { uc = (struct mfc6_cache *)_uc; if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; } } if (list_empty(&mrt->mfc_unres_queue)) del_timer(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { ip6mr_cache_resolve(net, mrt, uc, c); ip6mr_cache_free(uc); } call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } /* * Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct mr_mfc *c, *tmp; LIST_HEAD(list); int i; /* Shut down all active vif entries */ if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) { for (i = 0; i < mrt->maxvif; i++) { if (((mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS))) continue; mif6_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); } /* Wipe the cache */ if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) { list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC))) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, (struct mfc6_cache *)c, mrt->id); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); mr_cache_put(c); } } if (flags & MRT6_FLUSH_MFC) { if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } spin_unlock_bh(&mfc_unres_lock); } } } static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) { int err = 0; struct net *net = sock_net(sk); rtnl_lock(); spin_lock(&mrt_lock); if (rtnl_dereference(mrt->mroute_sk)) { err = -EADDRINUSE; } else { rcu_assign_pointer(mrt->mroute_sk, sk); sock_set_flag(sk, SOCK_RCU_FREE); atomic_inc(&net->ipv6.devconf_all->mc_forwarding); } spin_unlock(&mrt_lock); if (!err) inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); rtnl_unlock(); return err; } int ip6mr_sk_done(struct sock *sk) { struct net *net = sock_net(sk); struct ipv6_devconf *devconf; struct mr_table *mrt; int err = -EACCES; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return err; devconf = net->ipv6.devconf_all; if (!devconf || !atomic_read(&devconf->mc_forwarding)) return err; rtnl_lock(); 
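	/* Walk every table in this netns; if this socket is the registered
	 * mroute socket of a table, detach it, update the mc_forwarding
	 * accounting and flush the non-static vifs and cache entries.
	 */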
ip6mr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { spin_lock(&mrt_lock); RCU_INIT_POINTER(mrt->mroute_sk, NULL); /* Note that mroute_sk had SOCK_RCU_FREE set, * so the RCU grace period before sk freeing * is guaranteed by sk_destruct() */ atomic_dec(&devconf->mc_forwarding); spin_unlock(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC); err = 0; break; } } rtnl_unlock(); return err; } bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_oif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) return NULL; return rcu_access_pointer(mrt->mroute_sk); } EXPORT_SYMBOL(mroute6_is_socket); /* * Socket options and virtual interface manipulation. The whole * virtual interface system is a complete heap, but unfortunately * that's how BSD mrouted happens to think. Maybe one day with a proper * MOSPF/PIM router set up we can clean this up. */ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { int ret, parent = 0; struct mif6ctl vif; struct mf6cctl mfc; mifi_t mifi; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; if (optname != MRT6_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } switch (optname) { case MRT6_INIT: if (optlen < sizeof(int)) return -EINVAL; return ip6mr_sk_init(mrt, sk); case MRT6_DONE: return ip6mr_sk_done(sk); case MRT6_ADD_MIF: if (optlen < sizeof(vif)) return -EINVAL; if (copy_from_sockptr(&vif, optval, sizeof(vif))) return -EFAULT; if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; rtnl_lock(); ret = mif6_add(net, mrt, &vif, sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; case MRT6_DEL_MIF: if (optlen < sizeof(mifi_t)) return -EINVAL; if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t))) return -EFAULT; rtnl_lock(); ret = mif6_delete(mrt, mifi, 0, NULL); rtnl_unlock(); return ret; /* * Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. 
*/ case MRT6_ADD_MFC: case MRT6_DEL_MFC: parent = -1; fallthrough; case MRT6_ADD_MFC_PROXY: case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) return -EINVAL; if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) return -EFAULT; if (parent == 0) parent = mfc.mf6cc_parent; rtnl_lock(); if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY) ret = ip6mr_mfc_delete(mrt, &mfc, parent); else ret = ip6mr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); rtnl_unlock(); return ret; case MRT6_FLUSH: { int flags; if (optlen != sizeof(flags)) return -EINVAL; if (copy_from_sockptr(&flags, optval, sizeof(flags))) return -EFAULT; rtnl_lock(); mroute_clean_tables(mrt, flags); rtnl_unlock(); return 0; } /* * Control PIM assert (to activate pim will activate assert) */ case MRT6_ASSERT: { int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; mrt->mroute_do_assert = v; return 0; } #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: { bool do_wrmifwhole; int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE); v = !!v; rtnl_lock(); ret = 0; if (v != mrt->mroute_do_pim) { mrt->mroute_do_pim = v; mrt->mroute_do_assert = v; mrt->mroute_do_wrvifwhole = do_wrmifwhole; } rtnl_unlock(); return ret; } #endif #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES case MRT6_TABLE: { u32 v; if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) return -EINVAL; if (sk == rcu_access_pointer(mrt->mroute_sk)) return -EBUSY; rtnl_lock(); ret = 0; mrt = ip6mr_new_table(net, v); if (IS_ERR(mrt)) ret = PTR_ERR(mrt); else raw6_sk(sk)->ip6mr_table = v; rtnl_unlock(); return ret; } #endif /* * Spurious command, or MRT6_VERSION which you cannot * set. */ default: return -ENOPROTOOPT; } } /* * Getsock opt support for the multicast routing system. */ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) { int olr; int val; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (optname) { case MRT6_VERSION: val = 0x0305; break; #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: val = mrt->mroute_do_pim; break; #endif case MRT6_ASSERT: val = mrt->mroute_do_assert; break; default: return -ENOPROTOOPT; } if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; olr = min_t(int, olr, sizeof(int)); if (olr < 0) return -EINVAL; if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } /* * The IP multicast ioctl support routines. */ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { struct sioc_sg_req6 *sr; struct sioc_mif_req6 *vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: vr = (struct sioc_mif_req6 *)arg; if (vr->mifi >= mrt->maxvif) return -EINVAL; vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr->mifi]; if (VIF_EXISTS(mrt, vr->mifi)) { vr->icount = READ_ONCE(vif->pkt_in); vr->ocount = READ_ONCE(vif->pkt_out); vr->ibytes = READ_ONCE(vif->bytes_in); vr->obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: sr = (struct sioc_sg_req6 *)arg; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, &sr->grp.sin6_addr); if (c) { sr->pktcnt = c->_c.mfc_un.res.pkt; sr->bytecnt = c->_c.mfc_un.res.bytes; sr->wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #ifdef CONFIG_COMPAT struct compat_sioc_sg_req6 { struct sockaddr_in6 src; struct sockaddr_in6 grp; compat_ulong_t pktcnt; compat_ulong_t bytecnt; compat_ulong_t wrong_if; }; struct compat_sioc_mif_req6 { mifi_t mifi; compat_ulong_t icount; compat_ulong_t ocount; compat_ulong_t ibytes; compat_ulong_t obytes; }; int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { struct compat_sioc_sg_req6 sr; struct compat_sioc_mif_req6 vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; if (vr.mifi >= mrt->maxvif) return -EINVAL; vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = READ_ONCE(vif->pkt_in); vr.ocount = READ_ONCE(vif->pkt_out); vr.ibytes = READ_ONCE(vif->bytes_in); vr.obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { sr.pktcnt = c->_c.mfc_un.res.pkt; sr.bytecnt = c->_c.mfc_un.res.bytes; sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #endif static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); return dst_output(net, sk, skb); } /* * Processing handlers for ip6mr_forward */ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; struct ipv6hdr *ipv6h; struct dst_entry *dst; struct flowi6 fl6; vif_dev = vif_dev_read(vif); if (!vif_dev) goto out_free; #ifdef CONFIG_IPV6_PIMSM_V2 if (vif->flags & MIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); goto out_free; } #endif ipv6h = ipv6_hdr(skb); fl6 = (struct flowi6) { .flowi6_oif = vif->link, .daddr = ipv6h->daddr, }; dst = 
ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); goto out_free; } skb_dst_drop(skb); skb_dst_set(skb, dst); /* * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output * interfaces. It is clear, if mrouter runs a multicasting * program, it should receive packets not depending to what interface * program is joined. * If we will not make it, the program will have to join on all * interfaces. On the other hand, multihoming host (or router, but * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ skb->dev = vif_dev; WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); /* We are about to write */ /* XXX: extension headers? */ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev))) goto out_free; ipv6h = ipv6_hdr(skb); ipv6h->hop_limit--; IP6CB(skb)->flags |= IP6SKB_FORWARDED; return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, skb->dev, vif_dev, ip6mr_forward2_finish); out_free: kfree_skb(skb); return 0; } /* Called with rcu_read_lock() */ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) { int ct; /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) { if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev) break; } return ct; } /* Called under rcu_read_lock() */ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int vif, ct; int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; c->_c.mfc_un.res.pkt++; c->_c.mfc_un.res.bytes += skb->len; c->_c.mfc_un.res.lastuse = jiffies; if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; } /* * Wrong interface: drop packet and (maybe) send PIM assert. */ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); if (mrt->mroute_do_wrvifwhole) ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRMIFWHOLE); } goto dont_forward; } forward: WRITE_ONCE(mrt->vif_table[vif].pkt_in, mrt->vif_table[vif].pkt_in + 1); WRITE_ONCE(mrt->vif_table[vif].bytes_in, mrt->vif_table[vif].bytes_in + skb->len); /* * Forward the frame */ if (ipv6_addr_any(&c->mf6c_origin) && ipv6_addr_any(&c->mf6c_mcastgrp)) { if (true_vifi >= 0 && true_vifi != c->_c.mfc_parent && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. 
*/ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ip6mr_forward2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { ip6mr_forward2(net, mrt, skb, psend); return; } dont_forward: kfree_skb(skb); } /* * Multicast packets for forwarding arrive here */ int ip6_mr_input(struct sk_buff *skb) { struct mfc6_cache *cache; struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; int err; struct net_device *dev; /* skb->dev passed in is the master dev for vrfs. * Get the proper interface that does have a vif associated with it. */ dev = skb->dev; if (netif_is_l3_master(skb->dev)) { dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) { kfree_skb(skb); return -ENODEV; } } err = ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { kfree_skb(skb); return err; } cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { int vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &ipv6_hdr(skb)->daddr, vif); } /* * No usable cache entry */ if (!cache) { int vif; vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) { int err = ip6mr_cache_unresolved(mrt, vif, skb, dev); return err; } kfree_skb(skb); return -ENODEV; } ip6_mr_forward(net, mrt, dev, skb, cache); return 0; } int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { int err; struct mr_table *mrt; struct mfc6_cache *cache; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return -ENOENT; rcu_read_lock(); cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); if (!cache && skb->dev) { int vif = ip6mr_find_vif(mrt, skb->dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr, vif); } if (!cache) { struct sk_buff *skb2; struct ipv6hdr *iph; struct net_device *dev; int vif; dev = skb->dev; if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { rcu_read_unlock(); return -ENODEV; } /* really correct? 
*/ skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb2) { rcu_read_unlock(); return -ENOMEM; } NETLINK_CB(skb2).portid = portid; skb_reset_transport_header(skb2); skb_put(skb2, sizeof(struct ipv6hdr)); skb_reset_network_header(skb2); iph = ipv6_hdr(skb2); iph->version = 0; iph->priority = 0; iph->flow_lbl[0] = 0; iph->flow_lbl[1] = 0; iph->flow_lbl[2] = 0; iph->payload_len = 0; iph->nexthdr = IPPROTO_NONE; iph->hop_limit = 0; iph->saddr = rt->rt6i_src.addr; iph->daddr = rt->rt6i_dst.addr; err = ip6mr_cache_unresolved(mrt, vif, skb2, dev); rcu_read_unlock(); return err; } err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); rcu_read_unlock(); return err; } static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc6_cache *c, int cmd, int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = RTNL_FAMILY_IP6MR; rtm->rtm_dst_len = 128; rtm->rtm_src_len = 128; rtm->rtm_tos = 0; rtm->rtm_table = mrt->id; if (nla_put_u32(skb, RTA_TABLE, mrt->id)) goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) || nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp)) goto nla_put_failure; err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags) { return ip6mr_fill_mroute(mrt, skb, portid, seq, (struct mfc6_cache *)c, cmd, flags); } static int mr6_msgsize(bool unresolved, int maxvif) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_SRC */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_DST */ ; if (!unresolved) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; return len; } static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC); return; errout: kfree_skb(skb); if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); } static size_t mrt6msg_netlink_msgsize(size_t payloadlen) { size_t len = NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(1) /* IP6MRA_CREPORT_MSGTYPE */ + nla_total_size(4) /* IP6MRA_CREPORT_MIF_ID */ /* IP6MRA_CREPORT_SRC_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_DST_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; return len; } static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; struct rtgenmsg *rtgenm; struct 
mrt6msg *msg; struct sk_buff *skb; struct nlattr *nla; int payloadlen; payloadlen = pkt->len - sizeof(struct mrt6msg); msg = (struct mrt6msg *)skb_transport_header(pkt); skb = nlmsg_new(mrt6msg_netlink_msgsize(payloadlen), GFP_ATOMIC); if (!skb) goto errout; nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, sizeof(struct rtgenmsg), 0); if (!nlh) goto errout; rtgenm = nlmsg_data(nlh); rtgenm->rtgen_family = RTNL_FAMILY_IP6MR; if (nla_put_u8(skb, IP6MRA_CREPORT_MSGTYPE, msg->im6_msgtype) || nla_put_u32(skb, IP6MRA_CREPORT_MIF_ID, msg->im6_mif) || nla_put_in6_addr(skb, IP6MRA_CREPORT_SRC_ADDR, &msg->im6_src) || nla_put_in6_addr(skb, IP6MRA_CREPORT_DST_ADDR, &msg->im6_dst)) goto nla_put_failure; nla = nla_reserve(skb, IP6MRA_CREPORT_PKT, payloadlen); if (!nla || skb_copy_bits(pkt, sizeof(struct mrt6msg), nla_data(nla), payloadlen)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE_R, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_cancel(skb, nlh); errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE_R, -ENOBUFS); } static const struct nla_policy ip6mr_getroute_policy[RTA_MAX + 1] = { [RTA_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [RTA_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [RTA_TABLE] = { .type = NLA_U32 }, }; static int ip6mr_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, ip6mr_getroute_policy, extack); if (err) return err; rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for multicast route get request"); return -EINVAL; } if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); return -EINVAL; } return 0; } static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct in6_addr src = {}, grp = {}; struct nlattr *tb[RTA_MAX + 1]; struct mfc6_cache *cache; struct mr_table *mrt; struct sk_buff *skb; u32 tableid; int err; err = ip6mr_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; if (tb[RTA_SRC]) src = nla_get_in6_addr(tb[RTA_SRC]); if (tb[RTA_DST]) grp = nla_get_in6_addr(tb[RTA_DST]); tableid = tb[RTA_TABLE] ? 
nla_get_u32(tb[RTA_TABLE]) : 0; mrt = ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); if (!mrt) { NL_SET_ERR_MSG_MOD(extack, "MR table does not exist"); return -ENOENT; } /* entries are added/deleted only under RTNL */ rcu_read_lock(); cache = ip6mr_cache_find(mrt, &src, &grp); rcu_read_unlock(); if (!cache) { NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found"); return -ENOENT; } skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL); if (!skb) return -ENOBUFS; err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0); if (err < 0) { kfree_skb(skb); return err; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); } static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct fib_dump_filter filter = {}; int err; if (cb->strict_check) { err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh, &filter, cb); if (err < 0) return err; } if (filter.table_id) { struct mr_table *mrt; mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) return skb->len; NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist"); return -ENOENT; } err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); return skb->len ? : err; } return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); }
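The RTM_GETROUTE handler above is strict about the request layout, so a rough userspace sketch (not part of ip6mr.c; the (S,G) addresses, buffer sizes and helper name below are made up for illustration) may help show what ip6mr_rtm_valid_getroute_req() expects: rtm_family RTNL_FAMILY_IP6MR, rtm_src_len/rtm_dst_len of 128, in6_addr-sized RTA_SRC/RTA_DST, and everything else zero.

/*
 * Editorial sketch, not from net/ipv6/ip6mr.c: build a minimal
 * RTM_GETROUTE request for the IP6MR family.  The addresses are
 * placeholders; the reply is the RTM_NEWROUTE message produced by
 * ip6mr_fill_mroute(), or a netlink error (e.g. -ENOENT) if no
 * cache entry exists for the (S,G) pair.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static void put_attr(struct nlmsghdr *nlh, unsigned short type,
		     const void *data, int len)
{
	/* append one rtattr at the current tail of the message */
	struct rtattr *rta = (struct rtattr *)((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
	char req[NLMSG_SPACE(sizeof(struct rtmsg)) + 2 * RTA_SPACE(sizeof(struct in6_addr))] = { 0 };
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct nlmsghdr *nlh = (struct nlmsghdr *)req;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct in6_addr src, grp;
	char reply[4096];
	int fd;

	inet_pton(AF_INET6, "2001:db8::1", &src);	/* example source */
	inet_pton(AF_INET6, "ff3e::4321", &grp);	/* example group  */

	nlh->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
	nlh->nlmsg_type = RTM_GETROUTE;
	nlh->nlmsg_flags = NLM_F_REQUEST;
	rtm->rtm_family = RTNL_FAMILY_IP6MR;
	rtm->rtm_src_len = 128;		/* must be 128 (or 0) per the strict check */
	rtm->rtm_dst_len = 128;
	put_attr(nlh, RTA_SRC, &src, sizeof(src));
	put_attr(nlh, RTA_DST, &grp, sizeof(grp));

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0 ||
	    sendto(fd, nlh, nlh->nlmsg_len, 0,
		   (struct sockaddr *)&kernel, sizeof(kernel)) < 0 ||
	    recv(fd, reply, sizeof(reply), 0) < 0)
		perror("ip6mr RTM_GETROUTE");
	if (fd >= 0)
		close(fd);
	return 0;
}

Omitting RTA_TABLE, as the code above shows, makes the kernel fall back to RT_TABLE_DEFAULT for the multicast routing table lookup.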
/* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KVM_H #include <linux/tracepoint.h> #include <asm/vmx.h> #include <asm/svm.h> #include <asm/clocksource.h> #include <asm/pvclock-abi.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm /* * Tracepoint for guest mode entry. */ TRACE_EVENT(kvm_entry, TP_PROTO(struct kvm_vcpu *vcpu), TP_ARGS(vcpu), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) __field( unsigned long, rip ) ), TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; __entry->rip = kvm_rip_read(vcpu); ), TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip) ); /* * Tracepoint for hypercall. */ TRACE_EVENT(kvm_hypercall, TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3), TP_ARGS(nr, a0, a1, a2, a3), TP_STRUCT__entry( __field( unsigned long, nr ) __field( unsigned long, a0 ) __field( unsigned long, a1 ) __field( unsigned long, a2 ) __field( unsigned long, a3 ) ), TP_fast_assign( __entry->nr = nr; __entry->a0 = a0; __entry->a1 = a1; __entry->a2 = a2; __entry->a3 = a3; ), TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx", __entry->nr, __entry->a0, __entry->a1, __entry->a2, __entry->a3) ); /* * Tracepoint for hypercall. */ TRACE_EVENT(kvm_hv_hypercall, TP_PROTO(__u16 code, bool fast, __u16 var_cnt, __u16 rep_cnt, __u16 rep_idx, __u64 ingpa, __u64 outgpa), TP_ARGS(code, fast, var_cnt, rep_cnt, rep_idx, ingpa, outgpa), TP_STRUCT__entry( __field( __u16, rep_cnt ) __field( __u16, rep_idx ) __field( __u64, ingpa ) __field( __u64, outgpa ) __field( __u16, code ) __field( __u16, var_cnt ) __field( bool, fast ) ), TP_fast_assign( __entry->rep_cnt = rep_cnt; __entry->rep_idx = rep_idx; __entry->ingpa = ingpa; __entry->outgpa = outgpa; __entry->code = code; __entry->var_cnt = var_cnt; __entry->fast = fast; ), TP_printk("code 0x%x %s var_cnt 0x%x rep_cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", __entry->code, __entry->fast ? "fast" : "slow", __entry->var_cnt, __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, __entry->outgpa) ); TRACE_EVENT(kvm_hv_hypercall_done, TP_PROTO(u64 result), TP_ARGS(result), TP_STRUCT__entry( __field(__u64, result) ), TP_fast_assign( __entry->result = result; ), TP_printk("result 0x%llx", __entry->result) ); /* * Tracepoint for Xen hypercall.
*/ TRACE_EVENT(kvm_xen_hypercall, TP_PROTO(u8 cpl, unsigned long nr, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4, unsigned long a5), TP_ARGS(cpl, nr, a0, a1, a2, a3, a4, a5), TP_STRUCT__entry( __field(u8, cpl) __field(unsigned long, nr) __field(unsigned long, a0) __field(unsigned long, a1) __field(unsigned long, a2) __field(unsigned long, a3) __field(unsigned long, a4) __field(unsigned long, a5) ), TP_fast_assign( __entry->cpl = cpl; __entry->nr = nr; __entry->a0 = a0; __entry->a1 = a1; __entry->a2 = a2; __entry->a3 = a3; __entry->a4 = a4; __entry->a5 = a5; ), TP_printk("cpl %d nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx", __entry->cpl, __entry->nr, __entry->a0, __entry->a1, __entry->a2, __entry->a3, __entry->a4, __entry->a5) ); /* * Tracepoint for PIO. */ #define KVM_PIO_IN 0 #define KVM_PIO_OUT 1 TRACE_EVENT(kvm_pio, TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, unsigned int count, const void *data), TP_ARGS(rw, port, size, count, data), TP_STRUCT__entry( __field( unsigned int, rw ) __field( unsigned int, port ) __field( unsigned int, size ) __field( unsigned int, count ) __field( unsigned int, val ) ), TP_fast_assign( __entry->rw = rw; __entry->port = port; __entry->size = size; __entry->count = count; if (size == 1) __entry->val = *(unsigned char *)data; else if (size == 2) __entry->val = *(unsigned short *)data; else __entry->val = *(unsigned int *)data; ), TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s", __entry->rw ? "write" : "read", __entry->port, __entry->size, __entry->count, __entry->val, __entry->count > 1 ? "(...)" : "") ); /* * Tracepoint for fast mmio. */ TRACE_EVENT(kvm_fast_mmio, TP_PROTO(u64 gpa), TP_ARGS(gpa), TP_STRUCT__entry( __field(u64, gpa) ), TP_fast_assign( __entry->gpa = gpa; ), TP_printk("fast mmio at gpa 0x%llx", __entry->gpa) ); /* * Tracepoint for cpuid. */ TRACE_EVENT(kvm_cpuid, TP_PROTO(unsigned int function, unsigned int index, unsigned long rax, unsigned long rbx, unsigned long rcx, unsigned long rdx, bool found, bool used_max_basic), TP_ARGS(function, index, rax, rbx, rcx, rdx, found, used_max_basic), TP_STRUCT__entry( __field( unsigned int, function ) __field( unsigned int, index ) __field( unsigned long, rax ) __field( unsigned long, rbx ) __field( unsigned long, rcx ) __field( unsigned long, rdx ) __field( bool, found ) __field( bool, used_max_basic ) ), TP_fast_assign( __entry->function = function; __entry->index = index; __entry->rax = rax; __entry->rbx = rbx; __entry->rcx = rcx; __entry->rdx = rdx; __entry->found = found; __entry->used_max_basic = used_max_basic; ), TP_printk("func %x idx %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s%s", __entry->function, __entry->index, __entry->rax, __entry->rbx, __entry->rcx, __entry->rdx, __entry->found ? "found" : "not found", __entry->used_max_basic ? ", used max basic" : "") ); #define AREG(x) { APIC_##x, "APIC_" #x } #define kvm_trace_symbol_apic \ AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \ AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \ AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \ AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \ AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \ AREG(ECTRL) /* * Tracepoint for apic access.
*/ TRACE_EVENT(kvm_apic, TP_PROTO(unsigned int rw, unsigned int reg, u64 val), TP_ARGS(rw, reg, val), TP_STRUCT__entry( __field( unsigned int, rw ) __field( unsigned int, reg ) __field( u64, val ) ), TP_fast_assign( __entry->rw = rw; __entry->reg = reg; __entry->val = val; ), TP_printk("apic_%s %s = 0x%llx", __entry->rw ? "write" : "read", __print_symbolic(__entry->reg, kvm_trace_symbol_apic), __entry->val) ); #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) #define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) #define KVM_ISA_VMX 1 #define KVM_ISA_SVM 2 #define kvm_print_exit_reason(exit_reason, isa) \ (isa == KVM_ISA_VMX) ? \ __print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) : \ __print_symbolic(exit_reason, SVM_EXIT_REASONS), \ (isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "", \ (isa == KVM_ISA_VMX) ? \ __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" #define TRACE_EVENT_KVM_EXIT(name) \ TRACE_EVENT(name, \ TP_PROTO(struct kvm_vcpu *vcpu, u32 isa), \ TP_ARGS(vcpu, isa), \ \ TP_STRUCT__entry( \ __field( unsigned int, exit_reason ) \ __field( unsigned long, guest_rip ) \ __field( u32, isa ) \ __field( u64, info1 ) \ __field( u64, info2 ) \ __field( u32, intr_info ) \ __field( u32, error_code ) \ __field( unsigned int, vcpu_id ) \ ), \ \ TP_fast_assign( \ __entry->guest_rip = kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ static_call(kvm_x86_get_exit_info)(vcpu, \ &__entry->exit_reason, \ &__entry->info1, \ &__entry->info2, \ &__entry->intr_info, \ &__entry->error_code); \ ), \ \ TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \ "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", \ __entry->vcpu_id, \ kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \ __entry->guest_rip, __entry->info1, __entry->info2, \ __entry->intr_info, __entry->error_code) \ ) /* * Tracepoint for kvm guest exit: */ TRACE_EVENT_KVM_EXIT(kvm_exit); /* * Tracepoint for kvm interrupt injection: */ TRACE_EVENT(kvm_inj_virq, TP_PROTO(unsigned int vector, bool soft, bool reinjected), TP_ARGS(vector, soft, reinjected), TP_STRUCT__entry( __field( unsigned int, vector ) __field( bool, soft ) __field( bool, reinjected ) ), TP_fast_assign( __entry->vector = vector; __entry->soft = soft; __entry->reinjected = reinjected; ), TP_printk("%s 0x%x%s", __entry->soft ? "Soft/INTn" : "IRQ", __entry->vector, __entry->reinjected ? " [reinjected]" : "") ); #define EXS(x) { x##_VECTOR, "#" #x } #define kvm_trace_sym_exc \ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ EXS(MF), EXS(AC), EXS(MC) /* * Tracepoint for kvm interrupt injection: */ TRACE_EVENT(kvm_inj_exception, TP_PROTO(unsigned exception, bool has_error, unsigned error_code, bool reinjected), TP_ARGS(exception, has_error, error_code, reinjected), TP_STRUCT__entry( __field( u8, exception ) __field( u8, has_error ) __field( u32, error_code ) __field( bool, reinjected ) ), TP_fast_assign( __entry->exception = exception; __entry->has_error = has_error; __entry->error_code = error_code; __entry->reinjected = reinjected; ), TP_printk("%s%s%s%s%s", __print_symbolic(__entry->exception, kvm_trace_sym_exc), !__entry->has_error ? "" : " (", !__entry->has_error ? "" : __print_symbolic(__entry->error_code, { }), !__entry->has_error ? "" : ")", __entry->reinjected ? " [reinjected]" : "") ); /* * Tracepoint for page fault. 
*/ TRACE_EVENT(kvm_page_fault, TP_PROTO(struct kvm_vcpu *vcpu, u64 fault_address, u64 error_code), TP_ARGS(vcpu, fault_address, error_code), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) __field( unsigned long, guest_rip ) __field( u64, fault_address ) __field( u64, error_code ) ), TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; __entry->guest_rip = kvm_rip_read(vcpu); __entry->fault_address = fault_address; __entry->error_code = error_code; ), TP_printk("vcpu %u rip 0x%lx address 0x%016llx error_code 0x%llx", __entry->vcpu_id, __entry->guest_rip, __entry->fault_address, __entry->error_code) ); /* * Tracepoint for guest MSR access. */ TRACE_EVENT(kvm_msr, TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), TP_ARGS(write, ecx, data, exception), TP_STRUCT__entry( __field( unsigned, write ) __field( u32, ecx ) __field( u64, data ) __field( u8, exception ) ), TP_fast_assign( __entry->write = write; __entry->ecx = ecx; __entry->data = data; __entry->exception = exception; ), TP_printk("msr_%s %x = 0x%llx%s", __entry->write ? "write" : "read", __entry->ecx, __entry->data, __entry->exception ? " (#GP)" : "") ); #define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false) #define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false) #define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) #define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) /* * Tracepoint for guest CR access. */ TRACE_EVENT(kvm_cr, TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val), TP_ARGS(rw, cr, val), TP_STRUCT__entry( __field( unsigned int, rw ) __field( unsigned int, cr ) __field( unsigned long, val ) ), TP_fast_assign( __entry->rw = rw; __entry->cr = cr; __entry->val = val; ), TP_printk("cr_%s %x = 0x%lx", __entry->rw ? "write" : "read", __entry->cr, __entry->val) ); #define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val) #define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val) TRACE_EVENT(kvm_pic_set_irq, TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced), TP_ARGS(chip, pin, elcr, imr, coalesced), TP_STRUCT__entry( __field( __u8, chip ) __field( __u8, pin ) __field( __u8, elcr ) __field( __u8, imr ) __field( bool, coalesced ) ), TP_fast_assign( __entry->chip = chip; __entry->pin = pin; __entry->elcr = elcr; __entry->imr = imr; __entry->coalesced = coalesced; ), TP_printk("chip %u pin %u (%s%s)%s", __entry->chip, __entry->pin, (__entry->elcr & (1 << __entry->pin)) ? "level":"edge", (__entry->imr & (1 << __entry->pin)) ? "|masked":"", __entry->coalesced ? " (coalesced)" : "") ); #define kvm_apic_dst_shorthand \ {0x0, "dst"}, \ {0x1, "self"}, \ {0x2, "all"}, \ {0x3, "all-but-self"} TRACE_EVENT(kvm_apic_ipi, TP_PROTO(__u32 icr_low, __u32 dest_id), TP_ARGS(icr_low, dest_id), TP_STRUCT__entry( __field( __u32, icr_low ) __field( __u32, dest_id ) ), TP_fast_assign( __entry->icr_low = icr_low; __entry->dest_id = dest_id; ), TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)", __entry->dest_id, (u8)__entry->icr_low, __print_symbolic((__entry->icr_low >> 8 & 0x7), kvm_deliver_mode), (__entry->icr_low & (1<<11)) ? "logical" : "physical", (__entry->icr_low & (1<<14)) ? "assert" : "de-assert", (__entry->icr_low & (1<<15)) ? 
"level" : "edge", __print_symbolic((__entry->icr_low >> 18 & 0x3), kvm_apic_dst_shorthand)) ); TRACE_EVENT(kvm_apic_accept_irq, TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec), TP_ARGS(apicid, dm, tm, vec), TP_STRUCT__entry( __field( __u32, apicid ) __field( __u16, dm ) __field( __u16, tm ) __field( __u8, vec ) ), TP_fast_assign( __entry->apicid = apicid; __entry->dm = dm; __entry->tm = tm; __entry->vec = vec; ), TP_printk("apicid %x vec %u (%s|%s)", __entry->apicid, __entry->vec, __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), __entry->tm ? "level" : "edge") ); TRACE_EVENT(kvm_eoi, TP_PROTO(struct kvm_lapic *apic, int vector), TP_ARGS(apic, vector), TP_STRUCT__entry( __field( __u32, apicid ) __field( int, vector ) ), TP_fast_assign( __entry->apicid = apic->vcpu->vcpu_id; __entry->vector = vector; ), TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) ); TRACE_EVENT(kvm_pv_eoi, TP_PROTO(struct kvm_lapic *apic, int vector), TP_ARGS(apic, vector), TP_STRUCT__entry( __field( __u32, apicid ) __field( int, vector ) ), TP_fast_assign( __entry->apicid = apic->vcpu->vcpu_id; __entry->vector = vector; ), TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) ); /* * Tracepoint for nested VMRUN */ TRACE_EVENT(kvm_nested_vmenter, TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl, __u32 event_inj, bool tdp_enabled, __u64 guest_tdp_pgd, __u64 guest_cr3, __u32 isa), TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, tdp_enabled, guest_tdp_pgd, guest_cr3, isa), TP_STRUCT__entry( __field( __u64, rip ) __field( __u64, vmcb ) __field( __u64, nested_rip ) __field( __u32, int_ctl ) __field( __u32, event_inj ) __field( bool, tdp_enabled ) __field( __u64, guest_pgd ) __field( __u32, isa ) ), TP_fast_assign( __entry->rip = rip; __entry->vmcb = vmcb; __entry->nested_rip = nested_rip; __entry->int_ctl = int_ctl; __entry->event_inj = event_inj; __entry->tdp_enabled = tdp_enabled; __entry->guest_pgd = tdp_enabled ? guest_tdp_pgd : guest_cr3; __entry->isa = isa; ), TP_printk("rip: 0x%016llx %s: 0x%016llx nested_rip: 0x%016llx " "int_ctl: 0x%08x event_inj: 0x%08x nested_%s=%s %s: 0x%016llx", __entry->rip, __entry->isa == KVM_ISA_VMX ? "vmcs" : "vmcb", __entry->vmcb, __entry->nested_rip, __entry->int_ctl, __entry->event_inj, __entry->isa == KVM_ISA_VMX ? "ept" : "npt", __entry->tdp_enabled ? "y" : "n", !__entry->tdp_enabled ? "guest_cr3" : __entry->isa == KVM_ISA_VMX ? 
"nested_eptp" : "nested_cr3", __entry->guest_pgd) ); TRACE_EVENT(kvm_nested_intercepts, TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u32 intercept1, __u32 intercept2, __u32 intercept3), TP_ARGS(cr_read, cr_write, exceptions, intercept1, intercept2, intercept3), TP_STRUCT__entry( __field( __u16, cr_read ) __field( __u16, cr_write ) __field( __u32, exceptions ) __field( __u32, intercept1 ) __field( __u32, intercept2 ) __field( __u32, intercept3 ) ), TP_fast_assign( __entry->cr_read = cr_read; __entry->cr_write = cr_write; __entry->exceptions = exceptions; __entry->intercept1 = intercept1; __entry->intercept2 = intercept2; __entry->intercept3 = intercept3; ), TP_printk("cr_read: %04x cr_write: %04x excp: %08x " "intercepts: %08x %08x %08x", __entry->cr_read, __entry->cr_write, __entry->exceptions, __entry->intercept1, __entry->intercept2, __entry->intercept3) ); /* * Tracepoint for #VMEXIT while nested */ TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit); /* * Tracepoint for #VMEXIT reinjected to the guest */ TRACE_EVENT(kvm_nested_vmexit_inject, TP_PROTO(__u32 exit_code, __u64 exit_info1, __u64 exit_info2, __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), TP_ARGS(exit_code, exit_info1, exit_info2, exit_int_info, exit_int_info_err, isa), TP_STRUCT__entry( __field( __u32, exit_code ) __field( __u64, exit_info1 ) __field( __u64, exit_info2 ) __field( __u32, exit_int_info ) __field( __u32, exit_int_info_err ) __field( __u32, isa ) ), TP_fast_assign( __entry->exit_code = exit_code; __entry->exit_info1 = exit_info1; __entry->exit_info2 = exit_info2; __entry->exit_int_info = exit_int_info; __entry->exit_int_info_err = exit_int_info_err; __entry->isa = isa; ), TP_printk("reason: %s%s%s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", kvm_print_exit_reason(__entry->exit_code, __entry->isa), __entry->exit_info1, __entry->exit_info2, __entry->exit_int_info, __entry->exit_int_info_err) ); /* * Tracepoint for nested #vmexit because of interrupt pending */ TRACE_EVENT(kvm_nested_intr_vmexit, TP_PROTO(__u64 rip), TP_ARGS(rip), TP_STRUCT__entry( __field( __u64, rip ) ), TP_fast_assign( __entry->rip = rip ), TP_printk("rip: 0x%016llx", __entry->rip) ); /* * Tracepoint for nested #vmexit because of interrupt pending */ TRACE_EVENT(kvm_invlpga, TP_PROTO(__u64 rip, int asid, u64 address), TP_ARGS(rip, asid, address), TP_STRUCT__entry( __field( __u64, rip ) __field( int, asid ) __field( __u64, address ) ), TP_fast_assign( __entry->rip = rip; __entry->asid = asid; __entry->address = address; ), TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx", __entry->rip, __entry->asid, __entry->address) ); /* * Tracepoint for nested #vmexit because of interrupt pending */ TRACE_EVENT(kvm_skinit, TP_PROTO(__u64 rip, __u32 slb), TP_ARGS(rip, slb), TP_STRUCT__entry( __field( __u64, rip ) __field( __u32, slb ) ), TP_fast_assign( __entry->rip = rip; __entry->slb = slb; ), TP_printk("rip: 0x%016llx slb: 0x%08x", __entry->rip, __entry->slb) ); #define KVM_EMUL_INSN_F_CR0_PE (1 << 0) #define KVM_EMUL_INSN_F_EFL_VM (1 << 1) #define KVM_EMUL_INSN_F_CS_D (1 << 2) #define KVM_EMUL_INSN_F_CS_L (1 << 3) #define kvm_trace_symbol_emul_flags \ { 0, "real" }, \ { KVM_EMUL_INSN_F_CR0_PE \ | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \ { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \ { KVM_EMUL_INSN_F_CR0_PE \ | KVM_EMUL_INSN_F_CS_D, "prot32" }, \ { KVM_EMUL_INSN_F_CR0_PE \ | KVM_EMUL_INSN_F_CS_L, "prot64" } #define kei_decode_mode(mode) ({ \ u8 flags = 0xff; \ switch (mode) { \ case X86EMUL_MODE_REAL: \ flags = 
0; \ break; \ case X86EMUL_MODE_VM86: \ flags = KVM_EMUL_INSN_F_EFL_VM; \ break; \ case X86EMUL_MODE_PROT16: \ flags = KVM_EMUL_INSN_F_CR0_PE; \ break; \ case X86EMUL_MODE_PROT32: \ flags = KVM_EMUL_INSN_F_CR0_PE \ | KVM_EMUL_INSN_F_CS_D; \ break; \ case X86EMUL_MODE_PROT64: \ flags = KVM_EMUL_INSN_F_CR0_PE \ | KVM_EMUL_INSN_F_CS_L; \ break; \ } \ flags; \ }) TRACE_EVENT(kvm_emulate_insn, TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed), TP_ARGS(vcpu, failed), TP_STRUCT__entry( __field( __u64, rip ) __field( __u32, csbase ) __field( __u8, len ) __array( __u8, insn, 15 ) __field( __u8, flags ) __field( __u8, failed ) ), TP_fast_assign( __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS); __entry->len = vcpu->arch.emulate_ctxt->fetch.ptr - vcpu->arch.emulate_ctxt->fetch.data; __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len; memcpy(__entry->insn, vcpu->arch.emulate_ctxt->fetch.data, 15); __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt->mode); __entry->failed = failed; ), TP_printk("%x:%llx:%s (%s)%s", __entry->csbase, __entry->rip, __print_hex(__entry->insn, __entry->len), __print_symbolic(__entry->flags, kvm_trace_symbol_emul_flags), __entry->failed ? " failed" : "" ) ); #define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) #define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) TRACE_EVENT( vcpu_match_mmio, TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match), TP_ARGS(gva, gpa, write, gpa_match), TP_STRUCT__entry( __field(gva_t, gva) __field(gpa_t, gpa) __field(bool, write) __field(bool, gpa_match) ), TP_fast_assign( __entry->gva = gva; __entry->gpa = gpa; __entry->write = write; __entry->gpa_match = gpa_match ), TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa, __entry->write ? "Write" : "Read", __entry->gpa_match ? 
"GPA" : "GVA") ); TRACE_EVENT(kvm_write_tsc_offset, TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset, __u64 next_tsc_offset), TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) __field( __u64, previous_tsc_offset ) __field( __u64, next_tsc_offset ) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->previous_tsc_offset = previous_tsc_offset; __entry->next_tsc_offset = next_tsc_offset; ), TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id, __entry->previous_tsc_offset, __entry->next_tsc_offset) ); #ifdef CONFIG_X86_64 #define host_clocks \ {VDSO_CLOCKMODE_NONE, "none"}, \ {VDSO_CLOCKMODE_TSC, "tsc"} \ TRACE_EVENT(kvm_update_master_clock, TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), TP_ARGS(use_master_clock, host_clock, offset_matched), TP_STRUCT__entry( __field( bool, use_master_clock ) __field( unsigned int, host_clock ) __field( bool, offset_matched ) ), TP_fast_assign( __entry->use_master_clock = use_master_clock; __entry->host_clock = host_clock; __entry->offset_matched = offset_matched; ), TP_printk("masterclock %d hostclock %s offsetmatched %u", __entry->use_master_clock, __print_symbolic(__entry->host_clock, host_clocks), __entry->offset_matched) ); TRACE_EVENT(kvm_track_tsc, TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched, unsigned int online_vcpus, bool use_master_clock, unsigned int host_clock), TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock, host_clock), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) __field( unsigned int, nr_vcpus_matched_tsc ) __field( unsigned int, online_vcpus ) __field( bool, use_master_clock ) __field( unsigned int, host_clock ) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->nr_vcpus_matched_tsc = nr_matched; __entry->online_vcpus = online_vcpus; __entry->use_master_clock = use_master_clock; __entry->host_clock = host_clock; ), TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u" " hostclock %s", __entry->vcpu_id, __entry->use_master_clock, __entry->nr_vcpus_matched_tsc, __entry->online_vcpus, __print_symbolic(__entry->host_clock, host_clocks)) ); #endif /* CONFIG_X86_64 */ /* * Tracepoint for PML full VMEXIT. */ TRACE_EVENT(kvm_pml_full, TP_PROTO(unsigned int vcpu_id), TP_ARGS(vcpu_id), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; ), TP_printk("vcpu %d: PML full", __entry->vcpu_id) ); TRACE_EVENT(kvm_ple_window_update, TP_PROTO(unsigned int vcpu_id, unsigned int new, unsigned int old), TP_ARGS(vcpu_id, new, old), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) __field( unsigned int, new ) __field( unsigned int, old ) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->new = new; __entry->old = old; ), TP_printk("vcpu %u old %u new %u (%s)", __entry->vcpu_id, __entry->old, __entry->new, __entry->old < __entry->new ? 
"growed" : "shrinked") ); TRACE_EVENT(kvm_pvclock_update, TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock), TP_ARGS(vcpu_id, pvclock), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) __field( __u32, version ) __field( __u64, tsc_timestamp ) __field( __u64, system_time ) __field( __u32, tsc_to_system_mul ) __field( __s8, tsc_shift ) __field( __u8, flags ) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->version = pvclock->version; __entry->tsc_timestamp = pvclock->tsc_timestamp; __entry->system_time = pvclock->system_time; __entry->tsc_to_system_mul = pvclock->tsc_to_system_mul; __entry->tsc_shift = pvclock->tsc_shift; __entry->flags = pvclock->flags; ), TP_printk("vcpu_id %u, pvclock { version %u, tsc_timestamp 0x%llx, " "system_time 0x%llx, tsc_to_system_mul 0x%x, tsc_shift %d, " "flags 0x%x }", __entry->vcpu_id, __entry->version, __entry->tsc_timestamp, __entry->system_time, __entry->tsc_to_system_mul, __entry->tsc_shift, __entry->flags) ); TRACE_EVENT(kvm_wait_lapic_expire, TP_PROTO(unsigned int vcpu_id, s64 delta), TP_ARGS(vcpu_id, delta), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) __field( s64, delta ) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->delta = delta; ), TP_printk("vcpu %u: delta %lld (%s)", __entry->vcpu_id, __entry->delta, __entry->delta < 0 ? "early" : "late") ); TRACE_EVENT(kvm_smm_transition, TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering), TP_ARGS(vcpu_id, smbase, entering), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) __field( u64, smbase ) __field( bool, entering ) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->smbase = smbase; __entry->entering = entering; ), TP_printk("vcpu %u: %s SMM, smbase 0x%llx", __entry->vcpu_id, __entry->entering ? "entering" : "leaving", __entry->smbase) ); /* * Tracepoint for VT-d posted-interrupts. */ TRACE_EVENT(kvm_pi_irte_update, TP_PROTO(unsigned int host_irq, unsigned int vcpu_id, unsigned int gsi, unsigned int gvec, u64 pi_desc_addr, bool set), TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set), TP_STRUCT__entry( __field( unsigned int, host_irq ) __field( unsigned int, vcpu_id ) __field( unsigned int, gsi ) __field( unsigned int, gvec ) __field( u64, pi_desc_addr ) __field( bool, set ) ), TP_fast_assign( __entry->host_irq = host_irq; __entry->vcpu_id = vcpu_id; __entry->gsi = gsi; __entry->gvec = gvec; __entry->pi_desc_addr = pi_desc_addr; __entry->set = set; ), TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, " "gvec: 0x%x, pi_desc_addr: 0x%llx", __entry->set ? "enabled and being updated" : "disabled", __entry->host_irq, __entry->vcpu_id, __entry->gsi, __entry->gvec, __entry->pi_desc_addr) ); /* * Tracepoint for kvm_hv_notify_acked_sint. */ TRACE_EVENT(kvm_hv_notify_acked_sint, TP_PROTO(int vcpu_id, u32 sint), TP_ARGS(vcpu_id, sint), TP_STRUCT__entry( __field(int, vcpu_id) __field(u32, sint) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->sint = sint; ), TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint) ); /* * Tracepoint for synic_set_irq. 
*/ TRACE_EVENT(kvm_hv_synic_set_irq, TP_PROTO(int vcpu_id, u32 sint, int vector, int ret), TP_ARGS(vcpu_id, sint, vector, ret), TP_STRUCT__entry( __field(int, vcpu_id) __field(u32, sint) __field(int, vector) __field(int, ret) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->sint = sint; __entry->vector = vector; __entry->ret = ret; ), TP_printk("vcpu_id %d sint %u vector %d ret %d", __entry->vcpu_id, __entry->sint, __entry->vector, __entry->ret) ); /* * Tracepoint for kvm_hv_synic_send_eoi. */ TRACE_EVENT(kvm_hv_synic_send_eoi, TP_PROTO(int vcpu_id, int vector), TP_ARGS(vcpu_id, vector), TP_STRUCT__entry( __field(int, vcpu_id) __field(u32, sint) __field(int, vector) __field(int, ret) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->vector = vector; ), TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector) ); /* * Tracepoint for synic_set_msr. */ TRACE_EVENT(kvm_hv_synic_set_msr, TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host), TP_ARGS(vcpu_id, msr, data, host), TP_STRUCT__entry( __field(int, vcpu_id) __field(u32, msr) __field(u64, data) __field(bool, host) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->msr = msr; __entry->data = data; __entry->host = host ), TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d", __entry->vcpu_id, __entry->msr, __entry->data, __entry->host) ); /* * Tracepoint for stimer_set_config. */ TRACE_EVENT(kvm_hv_stimer_set_config, TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host), TP_ARGS(vcpu_id, timer_index, config, host), TP_STRUCT__entry( __field(int, vcpu_id) __field(int, timer_index) __field(u64, config) __field(bool, host) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->timer_index = timer_index; __entry->config = config; __entry->host = host; ), TP_printk("vcpu_id %d timer %d config 0x%llx host %d", __entry->vcpu_id, __entry->timer_index, __entry->config, __entry->host) ); /* * Tracepoint for stimer_set_count. */ TRACE_EVENT(kvm_hv_stimer_set_count, TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host), TP_ARGS(vcpu_id, timer_index, count, host), TP_STRUCT__entry( __field(int, vcpu_id) __field(int, timer_index) __field(u64, count) __field(bool, host) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->timer_index = timer_index; __entry->count = count; __entry->host = host; ), TP_printk("vcpu_id %d timer %d count %llu host %d", __entry->vcpu_id, __entry->timer_index, __entry->count, __entry->host) ); /* * Tracepoint for stimer_start(periodic timer case). */ TRACE_EVENT(kvm_hv_stimer_start_periodic, TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time), TP_ARGS(vcpu_id, timer_index, time_now, exp_time), TP_STRUCT__entry( __field(int, vcpu_id) __field(int, timer_index) __field(u64, time_now) __field(u64, exp_time) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->timer_index = timer_index; __entry->time_now = time_now; __entry->exp_time = exp_time; ), TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu", __entry->vcpu_id, __entry->timer_index, __entry->time_now, __entry->exp_time) ); /* * Tracepoint for stimer_start(one-shot timer case). 
*/ TRACE_EVENT(kvm_hv_stimer_start_one_shot, TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count), TP_ARGS(vcpu_id, timer_index, time_now, count), TP_STRUCT__entry( __field(int, vcpu_id) __field(int, timer_index) __field(u64, time_now) __field(u64, count) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->timer_index = timer_index; __entry->time_now = time_now; __entry->count = count; ), TP_printk("vcpu_id %d timer %d time_now %llu count %llu", __entry->vcpu_id, __entry->timer_index, __entry->time_now, __entry->count) ); /* * Tracepoint for stimer_timer_callback. */ TRACE_EVENT(kvm_hv_stimer_callback, TP_PROTO(int vcpu_id, int timer_index), TP_ARGS(vcpu_id, timer_index), TP_STRUCT__entry( __field(int, vcpu_id) __field(int, timer_index) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->timer_index = timer_index; ), TP_printk("vcpu_id %d timer %d", __entry->vcpu_id, __entry->timer_index) ); /* * Tracepoint for stimer_expiration. */ TRACE_EVENT(kvm_hv_stimer_expiration, TP_PROTO(int vcpu_id, int timer_index, int direct, int msg_send_result), TP_ARGS(vcpu_id, timer_index, direct, msg_send_result), TP_STRUCT__entry( __field(int, vcpu_id) __field(int, timer_index) __field(int, direct) __field(int, msg_send_result) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->timer_index = timer_index; __entry->direct = direct; __entry->msg_send_result = msg_send_result; ), TP_printk("vcpu_id %d timer %d direct %d send result %d", __entry->vcpu_id, __entry->timer_index, __entry->direct, __entry->msg_send_result) ); /* * Tracepoint for stimer_cleanup. */ TRACE_EVENT(kvm_hv_stimer_cleanup, TP_PROTO(int vcpu_id, int timer_index), TP_ARGS(vcpu_id, timer_index), TP_STRUCT__entry( __field(int, vcpu_id) __field(int, timer_index) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->timer_index = timer_index; ), TP_printk("vcpu_id %d timer %d", __entry->vcpu_id, __entry->timer_index) ); TRACE_EVENT(kvm_apicv_inhibit_changed, TP_PROTO(int reason, bool set, unsigned long inhibits), TP_ARGS(reason, set, inhibits), TP_STRUCT__entry( __field(int, reason) __field(bool, set) __field(unsigned long, inhibits) ), TP_fast_assign( __entry->reason = reason; __entry->set = set; __entry->inhibits = inhibits; ), TP_printk("%s reason=%u, inhibits=0x%lx", __entry->set ? "set" : "cleared", __entry->reason, __entry->inhibits) ); TRACE_EVENT(kvm_apicv_accept_irq, TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec), TP_ARGS(apicid, dm, tm, vec), TP_STRUCT__entry( __field( __u32, apicid ) __field( __u16, dm ) __field( __u16, tm ) __field( __u8, vec ) ), TP_fast_assign( __entry->apicid = apicid; __entry->dm = dm; __entry->tm = tm; __entry->vec = vec; ), TP_printk("apicid %x vec %u (%s|%s)", __entry->apicid, __entry->vec, __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), __entry->tm ? 
"level" : "edge") ); /* * Tracepoint for AMD AVIC */ TRACE_EVENT(kvm_avic_incomplete_ipi, TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index), TP_ARGS(vcpu, icrh, icrl, id, index), TP_STRUCT__entry( __field(u32, vcpu) __field(u32, icrh) __field(u32, icrl) __field(u32, id) __field(u32, index) ), TP_fast_assign( __entry->vcpu = vcpu; __entry->icrh = icrh; __entry->icrl = icrl; __entry->id = id; __entry->index = index; ), TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u", __entry->vcpu, __entry->icrh, __entry->icrl, __entry->id, __entry->index) ); TRACE_EVENT(kvm_avic_unaccelerated_access, TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec), TP_ARGS(vcpu, offset, ft, rw, vec), TP_STRUCT__entry( __field(u32, vcpu) __field(u32, offset) __field(bool, ft) __field(bool, rw) __field(u32, vec) ), TP_fast_assign( __entry->vcpu = vcpu; __entry->offset = offset; __entry->ft = ft; __entry->rw = rw; __entry->vec = vec; ), TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x", __entry->vcpu, __entry->offset, __print_symbolic(__entry->offset, kvm_trace_symbol_apic), __entry->ft ? "trap" : "fault", __entry->rw ? "write" : "read", __entry->vec) ); TRACE_EVENT(kvm_avic_ga_log, TP_PROTO(u32 vmid, u32 vcpuid), TP_ARGS(vmid, vcpuid), TP_STRUCT__entry( __field(u32, vmid) __field(u32, vcpuid) ), TP_fast_assign( __entry->vmid = vmid; __entry->vcpuid = vcpuid; ), TP_printk("vmid=%u, vcpuid=%u", __entry->vmid, __entry->vcpuid) ); TRACE_EVENT(kvm_avic_kick_vcpu_slowpath, TP_PROTO(u32 icrh, u32 icrl, u32 index), TP_ARGS(icrh, icrl, index), TP_STRUCT__entry( __field(u32, icrh) __field(u32, icrl) __field(u32, index) ), TP_fast_assign( __entry->icrh = icrh; __entry->icrl = icrl; __entry->index = index; ), TP_printk("icrh:icrl=%#08x:%08x, index=%u", __entry->icrh, __entry->icrl, __entry->index) ); TRACE_EVENT(kvm_avic_doorbell, TP_PROTO(u32 vcpuid, u32 apicid), TP_ARGS(vcpuid, apicid), TP_STRUCT__entry( __field(u32, vcpuid) __field(u32, apicid) ), TP_fast_assign( __entry->vcpuid = vcpuid; __entry->apicid = apicid; ), TP_printk("vcpuid=%u, apicid=%u", __entry->vcpuid, __entry->apicid) ); TRACE_EVENT(kvm_hv_timer_state, TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), TP_ARGS(vcpu_id, hv_timer_in_use), TP_STRUCT__entry( __field(unsigned int, vcpu_id) __field(unsigned int, hv_timer_in_use) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->hv_timer_in_use = hv_timer_in_use; ), TP_printk("vcpu_id %x hv_timer %x", __entry->vcpu_id, __entry->hv_timer_in_use) ); /* * Tracepoint for kvm_hv_flush_tlb. */ TRACE_EVENT(kvm_hv_flush_tlb, TP_PROTO(u64 processor_mask, u64 address_space, u64 flags, bool guest_mode), TP_ARGS(processor_mask, address_space, flags, guest_mode), TP_STRUCT__entry( __field(u64, processor_mask) __field(u64, address_space) __field(u64, flags) __field(bool, guest_mode) ), TP_fast_assign( __entry->processor_mask = processor_mask; __entry->address_space = address_space; __entry->flags = flags; __entry->guest_mode = guest_mode; ), TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx %s", __entry->processor_mask, __entry->address_space, __entry->flags, __entry->guest_mode ? "(L2)" : "") ); /* * Tracepoint for kvm_hv_flush_tlb_ex. 
*/ TRACE_EVENT(kvm_hv_flush_tlb_ex, TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags, bool guest_mode), TP_ARGS(valid_bank_mask, format, address_space, flags, guest_mode), TP_STRUCT__entry( __field(u64, valid_bank_mask) __field(u64, format) __field(u64, address_space) __field(u64, flags) __field(bool, guest_mode) ), TP_fast_assign( __entry->valid_bank_mask = valid_bank_mask; __entry->format = format; __entry->address_space = address_space; __entry->flags = flags; __entry->guest_mode = guest_mode; ), TP_printk("valid_bank_mask 0x%llx format 0x%llx " "address_space 0x%llx flags 0x%llx %s", __entry->valid_bank_mask, __entry->format, __entry->address_space, __entry->flags, __entry->guest_mode ? "(L2)" : "") ); /* * Tracepoints for kvm_hv_send_ipi. */ TRACE_EVENT(kvm_hv_send_ipi, TP_PROTO(u32 vector, u64 processor_mask), TP_ARGS(vector, processor_mask), TP_STRUCT__entry( __field(u32, vector) __field(u64, processor_mask) ), TP_fast_assign( __entry->vector = vector; __entry->processor_mask = processor_mask; ), TP_printk("vector %x processor_mask 0x%llx", __entry->vector, __entry->processor_mask) ); TRACE_EVENT(kvm_hv_send_ipi_ex, TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask), TP_ARGS(vector, format, valid_bank_mask), TP_STRUCT__entry( __field(u32, vector) __field(u64, format) __field(u64, valid_bank_mask) ), TP_fast_assign( __entry->vector = vector; __entry->format = format; __entry->valid_bank_mask = valid_bank_mask; ), TP_printk("vector %x format %llx valid_bank_mask 0x%llx", __entry->vector, __entry->format, __entry->valid_bank_mask) ); TRACE_EVENT(kvm_pv_tlb_flush, TP_PROTO(unsigned int vcpu_id, bool need_flush_tlb), TP_ARGS(vcpu_id, need_flush_tlb), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) __field( bool, need_flush_tlb ) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->need_flush_tlb = need_flush_tlb; ), TP_printk("vcpu %u need_flush_tlb %s", __entry->vcpu_id, __entry->need_flush_tlb ? "true" : "false") ); /* * Tracepoint for failed nested VMX VM-Enter. */ TRACE_EVENT(kvm_nested_vmenter_failed, TP_PROTO(const char *msg, u32 err), TP_ARGS(msg, err), TP_STRUCT__entry( __string(msg, msg) __field(u32, err) ), TP_fast_assign( __assign_str(msg, msg); __entry->err = err; ), TP_printk("%s%s", __get_str(msg), !__entry->err ? "" : __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) ); /* * Tracepoint for syndbg_set_msr. */ TRACE_EVENT(kvm_hv_syndbg_set_msr, TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data), TP_ARGS(vcpu_id, vp_index, msr, data), TP_STRUCT__entry( __field(int, vcpu_id) __field(u32, vp_index) __field(u32, msr) __field(u64, data) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->vp_index = vp_index; __entry->msr = msr; __entry->data = data; ), TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx", __entry->vcpu_id, __entry->vp_index, __entry->msr, __entry->data) ); /* * Tracepoint for syndbg_get_msr. 
*/ TRACE_EVENT(kvm_hv_syndbg_get_msr, TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data), TP_ARGS(vcpu_id, vp_index, msr, data), TP_STRUCT__entry( __field(int, vcpu_id) __field(u32, vp_index) __field(u32, msr) __field(u64, data) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->vp_index = vp_index; __entry->msr = msr; __entry->data = data; ), TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx", __entry->vcpu_id, __entry->vp_index, __entry->msr, __entry->data) ); /* * Tracepoint for the start of VMGEXIT processing */ TRACE_EVENT(kvm_vmgexit_enter, TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb), TP_ARGS(vcpu_id, ghcb), TP_STRUCT__entry( __field(unsigned int, vcpu_id) __field(u64, exit_reason) __field(u64, info1) __field(u64, info2) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->exit_reason = ghcb->save.sw_exit_code; __entry->info1 = ghcb->save.sw_exit_info_1; __entry->info2 = ghcb->save.sw_exit_info_2; ), TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx", __entry->vcpu_id, __entry->exit_reason, __entry->info1, __entry->info2) ); /* * Tracepoint for the end of VMGEXIT processing */ TRACE_EVENT(kvm_vmgexit_exit, TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb), TP_ARGS(vcpu_id, ghcb), TP_STRUCT__entry( __field(unsigned int, vcpu_id) __field(u64, exit_reason) __field(u64, info1) __field(u64, info2) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->exit_reason = ghcb->save.sw_exit_code; __entry->info1 = ghcb->save.sw_exit_info_1; __entry->info2 = ghcb->save.sw_exit_info_2; ), TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx", __entry->vcpu_id, __entry->exit_reason, __entry->info1, __entry->info2) ); /* * Tracepoint for the start of VMGEXIT MSR protocol processing */ TRACE_EVENT(kvm_vmgexit_msr_protocol_enter, TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa), TP_ARGS(vcpu_id, ghcb_gpa), TP_STRUCT__entry( __field(unsigned int, vcpu_id) __field(u64, ghcb_gpa) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->ghcb_gpa = ghcb_gpa; ), TP_printk("vcpu %u, ghcb_gpa %016llx", __entry->vcpu_id, __entry->ghcb_gpa) ); /* * Tracepoint for the end of VMGEXIT MSR protocol processing */ TRACE_EVENT(kvm_vmgexit_msr_protocol_exit, TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa, int result), TP_ARGS(vcpu_id, ghcb_gpa, result), TP_STRUCT__entry( __field(unsigned int, vcpu_id) __field(u64, ghcb_gpa) __field(int, result) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->ghcb_gpa = ghcb_gpa; __entry->result = result; ), TP_printk("vcpu %u, ghcb_gpa %016llx, result %d", __entry->vcpu_id, __entry->ghcb_gpa, __entry->result) ); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH ../../arch/x86/kvm #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace /* This part must be outside protection */ #include <trace/define_trace.h>
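Each TRACE_EVENT() in this header ends up as a tracefs event under events/kvm/ whose text output follows its TP_printk() format. As a rough, editorial illustration (not part of trace.h), a minimal userspace reader could enable kvm_exit and stream it; the tracefs mount point /sys/kernel/tracing and the need to run as root are assumptions about the target system.

/*
 * Editorial sketch, not from arch/x86/kvm/trace.h: enable the
 * kvm/kvm_exit event defined above and stream its formatted output.
 * Assumes tracefs is mounted at /sys/kernel/tracing and the process
 * has permission to write there.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *enable = "/sys/kernel/tracing/events/kvm/kvm_exit/enable";
	const char *pipe = "/sys/kernel/tracing/trace_pipe";
	char buf[4096];
	ssize_t n;
	int fd;

	fd = open(enable, O_WRONLY);
	if (fd < 0 || write(fd, "1", 1) != 1) {
		perror("enable kvm/kvm_exit");
		return 1;
	}
	close(fd);

	fd = open(pipe, O_RDONLY);	/* blocks until events are produced */
	if (fd < 0) {
		perror("trace_pipe");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);	/* lines follow the TP_printk() formats above */
	}
	close(fd);
	return 0;
}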
// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/file_table.c * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> #include <linux/rcupdate.h> #include <linux/mount.h> #include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> #include <linux/sysctl.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/task_work.h> #include <linux/ima.h> #include <linux/swap.h> #include <linux/kmemleak.h> #include <linux/atomic.h> #include "internal.h" /* sysctl tunables...
*/ static struct files_stat_struct files_stat = { .max_files = NR_FILE }; /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __ro_after_init; static struct percpu_counter nr_files __cacheline_aligned_in_smp; /* Container for backing file with optional user path */ struct backing_file { struct file file; struct path user_path; }; static inline struct backing_file *backing_file(struct file *f) { return container_of(f, struct backing_file, file); } struct path *backing_file_user_path(struct file *f) { return &backing_file(f)->user_path; } EXPORT_SYMBOL_GPL(backing_file_user_path); static inline void file_free(struct file *f) { security_file_free(f); if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); put_cred(f->f_cred); if (unlikely(f->f_mode & FMODE_BACKING)) { path_put(backing_file_user_path(f)); kfree(backing_file(f)); } else { kmem_cache_free(filp_cachep, f); } } /* * Return the total number of open files in the system */ static long get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } /* * Return the maximum number of open files in the system */ unsigned long get_max_files(void) { return files_stat.max_files; } EXPORT_SYMBOL_GPL(get_max_files); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* * Handle nr_files sysctl */ static int proc_nr_files(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static struct ctl_table fs_stat_sysctls[] = { { .procname = "file-nr", .data = &files_stat, .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = SYSCTL_LONG_ZERO, .extra2 = SYSCTL_LONG_MAX, }, { .procname = "nr_open", .data = &sysctl_nr_open, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, }; static int __init init_fs_stat_sysctls(void) { register_sysctl_init("fs", fs_stat_sysctls); if (IS_ENABLED(CONFIG_BINFMT_MISC)) { struct ctl_table_header *hdr; hdr = register_sysctl_mount_point("fs/binfmt_misc"); kmemleak_not_leak(hdr); } return 0; } fs_initcall(init_fs_stat_sysctls); #endif static int init_file(struct file *f, int flags, const struct cred *cred) { int error; f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { put_cred(f->f_cred); return error; } rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); f->f_flags = flags; f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ /* * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While * fget-rcu pattern users need to be able to handle spurious * refcount bumps, we should reinitialize the reused file first. */ atomic_long_set(&f->f_count, 1); return 0; } /* Find an unused file structure and return a pointer to it. * Returns an error pointer if some error happened, e.g. we are over the * file structures limit, ran out of memory or the operation is not permitted. * * Be very careful using this. You are responsible for * getting write access to any mount that you might assign * to this filp, if it is opened for write. If this is not * done, you will imbalance the mount's writer count * and get a warning at __fput() time.
*/ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; int error; /* * Privileged users can go above max_files */ if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. */ if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) goto over; } f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); error = init_file(f, flags, cred); if (unlikely(error)) { kmem_cache_free(filp_cachep, f); return ERR_PTR(error); } percpu_counter_inc(&nr_files); return f; over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } return ERR_PTR(-ENFILE); } /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * * This is only for kernel internal use, and the allocate file must not be * installed into file tables or such. */ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) { struct file *f; int error; f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); error = init_file(f, flags, cred); if (unlikely(error)) { kmem_cache_free(filp_cachep, f); return ERR_PTR(error); } f->f_mode |= FMODE_NOACCOUNT; return f; } /* * Variant of alloc_empty_file() that allocates a backing_file container * and doesn't check and modify nr_files. * * This is only for kernel internal use, and the allocate file must not be * installed into file tables or such. */ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) { struct backing_file *ff; int error; ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL); if (unlikely(!ff)) return ERR_PTR(-ENOMEM); error = init_file(&ff->file, flags, cred); if (unlikely(error)) { kfree(ff); return ERR_PTR(error); } ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; return &ff->file; } /** * alloc_file - allocate and initialize a 'struct file' * * @path: the (dentry, vfsmount) pair for the new file * @flags: O_... 
flags with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ static struct file *alloc_file(const struct path *path, int flags, const struct file_operations *fop) { struct file *file; file = alloc_empty_file(flags, current_cred()); if (IS_ERR(file)) return file; file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); file->f_sb_err = file_sample_sb_err(file); if (fop->llseek) file->f_mode |= FMODE_LSEEK; if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) file->f_mode |= FMODE_CAN_WRITE; file->f_iocb_flags = iocb_flags(file); file->f_mode |= FMODE_OPENED; file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); return file; } struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, const char *name, int flags, const struct file_operations *fops) { struct qstr this = QSTR_INIT(name, strlen(name)); struct path path; struct file *file; path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); if (!path.dentry) return ERR_PTR(-ENOMEM); path.mnt = mntget(mnt); d_instantiate(path.dentry, inode); file = alloc_file(&path, flags, fops); if (IS_ERR(file)) { ihold(inode); path_put(&path); } return file; } EXPORT_SYMBOL(alloc_file_pseudo); struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { struct file *f = alloc_file(&base->f_path, flags, fops); if (!IS_ERR(f)) { path_get(&f->f_path); f->f_mapping = base->f_mapping; } return f; } /* the real guts of fput() - releasing the last reference to file */ static void __fput(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; struct inode *inode = file->f_inode; fmode_t mode = file->f_mode; if (unlikely(!(file->f_mode & FMODE_OPENED))) goto out; might_sleep(); fsnotify_close(file); /* * The function eventpoll_release() should be the first called * in the file cleanup chain. */ eventpoll_release(file); locks_remove_file(file); ima_file_free(file); if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) file->f_op->fasync(-1, file, 0); } if (file->f_op->release) file->f_op->release(inode, file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && !(mode & FMODE_PATH))) { cdev_put(inode->i_cdev); } fops_put(file->f_op); put_pid(file->f_owner.pid); put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) dissolve_on_fput(mnt); mntput(mnt); out: file_free(file); } static LLIST_HEAD(delayed_fput_list); static void delayed_fput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_fput_list); struct file *f, *t; llist_for_each_entry_safe(f, t, node, f_llist) __fput(f); } static void ____fput(struct callback_head *work) { __fput(container_of(work, struct file, f_task_work)); } /* * If kernel thread really needs to have the final fput() it has done * to complete, call this. The only user right now is the boot - we * *do* need to make sure our writes to binaries on initramfs has * not left us with opened struct file waiting for __fput() - execve() * won't work without that. 
Please, don't add more callers without * very good reasons; in particular, never call that with locks * held and never call that from a thread that might need to do * some work on any kind of umount. */ void flush_delayed_fput(void) { delayed_fput(NULL); } EXPORT_SYMBOL_GPL(flush_delayed_fput); static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); void fput(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { file_free(file); return; } if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_task_work, ____fput); if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; /* * After this task has run exit_task_work(), * task_work_add() will fail. Fall through to delayed * fput to avoid leaking *file. */ } if (llist_add(&file->f_llist, &delayed_fput_list)) schedule_delayed_work(&delayed_fput_work, 1); } } /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without * risking deadlocks), need to wait for completion of __fput() and know * for this specific struct file it won't involve anything that would * need them. Use only if you really need it - at the very least, * don't blindly convert fput() by kernel thread to that. */ void __fput_sync(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) __fput(file); } EXPORT_SYMBOL(fput); EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } /* * One file with associated inode and dcache is very roughly 1K. Per default * do not use more than 10% of our memory for files. */ void __init files_maxfiles_init(void) { unsigned long n; unsigned long nr_pages = totalram_pages(); unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; memreserve = min(memreserve, nr_pages - 1); n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); }
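/*
 * Illustrative aside (not part of file_table.c): the nr_files percpu counter
 * and files_stat.max_files maintained above are exposed to userspace through
 * the "file-nr" and "file-max" sysctls registered in fs_stat_sysctls[].  A
 * minimal userspace sketch that inspects them could look like the following;
 * the three-value layout of /proc/sys/fs/file-nr (allocated, unused, limit)
 * is an assumption based on files_stat, not something defined in this file.
 */
#include <stdio.h>

int main(void)
{
        unsigned long nr, unused, max;
        FILE *f = fopen("/proc/sys/fs/file-nr", "r");

        if (!f)
                return 1;
        /* file-nr reports: allocated file objects, unused (historically 0), limit */
        if (fscanf(f, "%lu %lu %lu", &nr, &unused, &max) == 3)
                printf("open files: %lu of %lu (max)\n", nr, max);
        fclose(f);
        return 0;
}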
// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * * Changes by Acxiom Corporation to add protocol version to kernel * communication, Copyright Acxiom Corporation, 2005. * * See COPYING in top-level directory.
*/ #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-dev-proto.h" #include "orangefs-bufmap.h" #include "orangefs-debugfs.h" #include <linux/debugfs.h> #include <linux/slab.h> /* this file implements the /dev/pvfs2-req device node */ uint32_t orangefs_userspace_version; static int open_access_count; static DEFINE_MUTEX(devreq_mutex); #define DUMP_DEVICE_ERROR() \ do { \ gossip_err("*****************************************************\n");\ gossip_err("ORANGEFS Device Error: You cannot open the device file "); \ gossip_err("\n/dev/%s more than once. Please make sure that\nthere " \ "are no ", ORANGEFS_REQDEVICE_NAME); \ gossip_err("instances of a program using this device\ncurrently " \ "running. (You must verify this!)\n"); \ gossip_err("For example, you can use the lsof program as follows:\n");\ gossip_err("'lsof | grep %s' (run this as root)\n", \ ORANGEFS_REQDEVICE_NAME); \ gossip_err(" open_access_count = %d\n", open_access_count); \ gossip_err("*****************************************************\n");\ } while (0) static int hash_func(__u64 tag, int table_size) { return do_div(tag, (unsigned int)table_size); } static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op) { int index = hash_func(op->tag, hash_table_size); list_add_tail(&op->list, &orangefs_htable_ops_in_progress[index]); } /* * find the op with this tag and remove it from the in progress * hash table. */ static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag) { struct orangefs_kernel_op_s *op, *next; int index; index = hash_func(tag, hash_table_size); spin_lock(&orangefs_htable_ops_in_progress_lock); list_for_each_entry_safe(op, next, &orangefs_htable_ops_in_progress[index], list) { if (op->tag == tag && !op_state_purged(op) && !op_state_given_up(op)) { list_del_init(&op->list); spin_unlock(&orangefs_htable_ops_in_progress_lock); return op; } } spin_unlock(&orangefs_htable_ops_in_progress_lock); return NULL; } /* Returns whether any FS are still pending remounted */ static int mark_all_pending_mounts(void) { int unmounted = 1; struct orangefs_sb_info_s *orangefs_sb = NULL; spin_lock(&orangefs_superblocks_lock); list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) { /* All of these file system require a remount */ orangefs_sb->mount_pending = 1; unmounted = 0; } spin_unlock(&orangefs_superblocks_lock); return unmounted; } /* * Determine if a given file system needs to be remounted or not * Returns -1 on error * 0 if already mounted * 1 if needs remount */ static int fs_mount_pending(__s32 fsid) { int mount_pending = -1; struct orangefs_sb_info_s *orangefs_sb = NULL; spin_lock(&orangefs_superblocks_lock); list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) { if (orangefs_sb->fs_id == fsid) { mount_pending = orangefs_sb->mount_pending; break; } } spin_unlock(&orangefs_superblocks_lock); return mount_pending; } static int orangefs_devreq_open(struct inode *inode, struct file *file) { int ret = -EINVAL; /* in order to ensure that the filesystem driver sees correct UIDs */ if (file->f_cred->user_ns != &init_user_ns) { gossip_err("%s: device cannot be opened outside init_user_ns\n", __func__); goto out; } if (!(file->f_flags & O_NONBLOCK)) { gossip_err("%s: device cannot be opened in blocking mode\n", __func__); goto out; } ret = -EACCES; gossip_debug(GOSSIP_DEV_DEBUG, "client-core: opening device\n"); mutex_lock(&devreq_mutex); if (open_access_count == 0) { open_access_count = 1; ret = 0; } else { DUMP_DEVICE_ERROR(); } mutex_unlock(&devreq_mutex); out: 
gossip_debug(GOSSIP_DEV_DEBUG, "pvfs2-client-core: open device complete (ret = %d)\n", ret); return ret; } /* Function for read() callers into the device */ static ssize_t orangefs_devreq_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { struct orangefs_kernel_op_s *op, *temp; __s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION; static __s32 magic = ORANGEFS_DEVREQ_MAGIC; struct orangefs_kernel_op_s *cur_op; unsigned long ret; /* We do not support blocking IO. */ if (!(file->f_flags & O_NONBLOCK)) { gossip_err("%s: blocking read from client-core.\n", __func__); return -EINVAL; } /* * The client will do an ioctl to find MAX_DEV_REQ_UPSIZE, then * always read with that size buffer. */ if (count != MAX_DEV_REQ_UPSIZE) { gossip_err("orangefs: client-core tried to read wrong size\n"); return -EINVAL; } /* Check for an empty list before locking. */ if (list_empty(&orangefs_request_list)) return -EAGAIN; restart: cur_op = NULL; /* Get next op (if any) from top of list. */ spin_lock(&orangefs_request_list_lock); list_for_each_entry_safe(op, temp, &orangefs_request_list, list) { __s32 fsid; /* This lock is held past the end of the loop when we break. */ spin_lock(&op->lock); if (unlikely(op_state_purged(op) || op_state_given_up(op))) { spin_unlock(&op->lock); continue; } fsid = fsid_of_op(op); if (fsid != ORANGEFS_FS_ID_NULL) { int ret; /* Skip ops whose filesystem needs to be mounted. */ ret = fs_mount_pending(fsid); if (ret == 1) { gossip_debug(GOSSIP_DEV_DEBUG, "%s: mount pending, skipping op tag " "%llu %s\n", __func__, llu(op->tag), get_opname_string(op)); spin_unlock(&op->lock); continue; /* * Skip ops whose filesystem we don't know about unless * it is being mounted or unmounted. It is possible for * a filesystem we don't know about to be unmounted if * it fails to mount in the kernel after userspace has * been sent the mount request. */ /* XXX: is there a better way to detect this? */ } else if (ret == -1 && !(op->upcall.type == ORANGEFS_VFS_OP_FS_MOUNT || op->upcall.type == ORANGEFS_VFS_OP_GETATTR || op->upcall.type == ORANGEFS_VFS_OP_FS_UMOUNT)) { gossip_debug(GOSSIP_DEV_DEBUG, "orangefs: skipping op tag %llu %s\n", llu(op->tag), get_opname_string(op)); gossip_err( "orangefs: ERROR: fs_mount_pending %d\n", fsid); spin_unlock(&op->lock); continue; } } /* * Either this op does not pertain to a filesystem, is mounting * a filesystem, or pertains to a mounted filesystem. Let it * through. */ cur_op = op; break; } /* * At this point we either have a valid op and can continue or have not * found an op and must ask the client to try again later. */ if (!cur_op) { spin_unlock(&orangefs_request_list_lock); return -EAGAIN; } gossip_debug(GOSSIP_DEV_DEBUG, "%s: reading op tag %llu %s\n", __func__, llu(cur_op->tag), get_opname_string(cur_op)); /* * Such an op should never be on the list in the first place. If so, we * will abort. */ if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) { gossip_err("orangefs: ERROR: Current op already queued.\n"); list_del_init(&cur_op->list); spin_unlock(&cur_op->lock); spin_unlock(&orangefs_request_list_lock); return -EAGAIN; } list_del_init(&cur_op->list); spin_unlock(&orangefs_request_list_lock); spin_unlock(&cur_op->lock); /* Push the upcall out. 
*/ ret = copy_to_user(buf, &proto_ver, sizeof(__s32)); if (ret != 0) goto error; ret = copy_to_user(buf + sizeof(__s32), &magic, sizeof(__s32)); if (ret != 0) goto error; ret = copy_to_user(buf + 2 * sizeof(__s32), &cur_op->tag, sizeof(__u64)); if (ret != 0) goto error; ret = copy_to_user(buf + 2 * sizeof(__s32) + sizeof(__u64), &cur_op->upcall, sizeof(struct orangefs_upcall_s)); if (ret != 0) goto error; spin_lock(&orangefs_htable_ops_in_progress_lock); spin_lock(&cur_op->lock); if (unlikely(op_state_given_up(cur_op))) { spin_unlock(&cur_op->lock); spin_unlock(&orangefs_htable_ops_in_progress_lock); complete(&cur_op->waitq); goto restart; } /* * Set the operation to be in progress and move it between lists since * it has been sent to the client. */ set_op_state_inprogress(cur_op); gossip_debug(GOSSIP_DEV_DEBUG, "%s: 1 op:%s: op_state:%d: process:%s:\n", __func__, get_opname_string(cur_op), cur_op->op_state, current->comm); orangefs_devreq_add_op(cur_op); spin_unlock(&cur_op->lock); spin_unlock(&orangefs_htable_ops_in_progress_lock); /* The client only asks to read one size buffer. */ return MAX_DEV_REQ_UPSIZE; error: /* * We were unable to copy the op data to the client. Put the op back in * list. If client has crashed, the op will be purged later when the * device is released. */ gossip_err("orangefs: Failed to copy data to user space\n"); spin_lock(&orangefs_request_list_lock); spin_lock(&cur_op->lock); if (likely(!op_state_given_up(cur_op))) { set_op_state_waiting(cur_op); gossip_debug(GOSSIP_DEV_DEBUG, "%s: 2 op:%s: op_state:%d: process:%s:\n", __func__, get_opname_string(cur_op), cur_op->op_state, current->comm); list_add(&cur_op->list, &orangefs_request_list); spin_unlock(&cur_op->lock); } else { spin_unlock(&cur_op->lock); complete(&cur_op->waitq); } spin_unlock(&orangefs_request_list_lock); return -EFAULT; } /* * Function for writev() callers into the device. 
* * Userspace should have written: * - __u32 version * - __u32 magic * - __u64 tag * - struct orangefs_downcall_s * - trailer buffer (in the case of READDIR operations) */ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, struct iov_iter *iter) { ssize_t ret; struct orangefs_kernel_op_s *op = NULL; struct { __u32 version; __u32 magic; __u64 tag; } head; int total = ret = iov_iter_count(iter); int downcall_size = sizeof(struct orangefs_downcall_s); int head_size = sizeof(head); gossip_debug(GOSSIP_DEV_DEBUG, "%s: total:%d: ret:%zd:\n", __func__, total, ret); if (total < MAX_DEV_REQ_DOWNSIZE) { gossip_err("%s: total:%d: must be at least:%u:\n", __func__, total, (unsigned int) MAX_DEV_REQ_DOWNSIZE); return -EFAULT; } if (!copy_from_iter_full(&head, head_size, iter)) { gossip_err("%s: failed to copy head.\n", __func__); return -EFAULT; } if (head.version < ORANGEFS_MINIMUM_USERSPACE_VERSION) { gossip_err("%s: userspace claims version" "%d, minimum version required: %d.\n", __func__, head.version, ORANGEFS_MINIMUM_USERSPACE_VERSION); return -EPROTO; } if (head.magic != ORANGEFS_DEVREQ_MAGIC) { gossip_err("Error: Device magic number does not match.\n"); return -EPROTO; } if (!orangefs_userspace_version) { orangefs_userspace_version = head.version; } else if (orangefs_userspace_version != head.version) { gossip_err("Error: userspace version changes\n"); return -EPROTO; } /* remove the op from the in progress hash table */ op = orangefs_devreq_remove_op(head.tag); if (!op) { gossip_debug(GOSSIP_DEV_DEBUG, "%s: No one's waiting for tag %llu\n", __func__, llu(head.tag)); return ret; } if (!copy_from_iter_full(&op->downcall, downcall_size, iter)) { gossip_err("%s: failed to copy downcall.\n", __func__); goto Efault; } if (op->downcall.status) goto wakeup; /* * We've successfully peeled off the head and the downcall. * Something has gone awry if total doesn't equal the * sum of head_size, downcall_size and trailer_size. */ if ((head_size + downcall_size + op->downcall.trailer_size) != total) { gossip_err("%s: funky write, head_size:%d" ": downcall_size:%d: trailer_size:%lld" ": total size:%d:\n", __func__, head_size, downcall_size, op->downcall.trailer_size, total); goto Efault; } /* Only READDIR operations should have trailers. */ if ((op->downcall.type != ORANGEFS_VFS_OP_READDIR) && (op->downcall.trailer_size != 0)) { gossip_err("%s: %x operation with trailer.", __func__, op->downcall.type); goto Efault; } /* READDIR operations should always have trailers. */ if ((op->downcall.type == ORANGEFS_VFS_OP_READDIR) && (op->downcall.trailer_size == 0)) { gossip_err("%s: %x operation with no trailer.", __func__, op->downcall.type); goto Efault; } if (op->downcall.type != ORANGEFS_VFS_OP_READDIR) goto wakeup; op->downcall.trailer_buf = vzalloc(op->downcall.trailer_size); if (!op->downcall.trailer_buf) goto Enomem; if (!copy_from_iter_full(op->downcall.trailer_buf, op->downcall.trailer_size, iter)) { gossip_err("%s: failed to copy trailer.\n", __func__); vfree(op->downcall.trailer_buf); goto Efault; } wakeup: /* * Return to vfs waitqueue, and back to service_operation * through wait_for_matching_downcall. 
*/ spin_lock(&op->lock); if (unlikely(op_is_cancel(op))) { spin_unlock(&op->lock); put_cancel(op); } else if (unlikely(op_state_given_up(op))) { spin_unlock(&op->lock); complete(&op->waitq); } else { set_op_state_serviced(op); gossip_debug(GOSSIP_DEV_DEBUG, "%s: op:%s: op_state:%d: process:%s:\n", __func__, get_opname_string(op), op->op_state, current->comm); spin_unlock(&op->lock); } return ret; Efault: op->downcall.status = -(ORANGEFS_ERROR_BIT | 9); ret = -EFAULT; goto wakeup; Enomem: op->downcall.status = -(ORANGEFS_ERROR_BIT | 8); ret = -ENOMEM; goto wakeup; } /* * NOTE: gets called when the last reference to this device is dropped. * Using the open_access_count variable, we enforce a reference count * on this file so that it can be opened by only one process at a time. * the devreq_mutex is used to make sure all i/o has completed * before we call orangefs_bufmap_finalize, and similar such tricky * situations */ static int orangefs_devreq_release(struct inode *inode, struct file *file) { int unmounted = 0; gossip_debug(GOSSIP_DEV_DEBUG, "%s:pvfs2-client-core: exiting, closing device\n", __func__); mutex_lock(&devreq_mutex); orangefs_bufmap_finalize(); open_access_count = -1; unmounted = mark_all_pending_mounts(); gossip_debug(GOSSIP_DEV_DEBUG, "ORANGEFS Device Close: Filesystem(s) %s\n", (unmounted ? "UNMOUNTED" : "MOUNTED")); purge_waiting_ops(); purge_inprogress_ops(); orangefs_bufmap_run_down(); gossip_debug(GOSSIP_DEV_DEBUG, "pvfs2-client-core: device close complete\n"); open_access_count = 0; orangefs_userspace_version = 0; mutex_unlock(&devreq_mutex); return 0; } int is_daemon_in_service(void) { int in_service; /* * What this function does is checks if client-core is alive * based on the access count we maintain on the device. */ mutex_lock(&devreq_mutex); in_service = open_access_count == 1 ? 0 : -EIO; mutex_unlock(&devreq_mutex); return in_service; } bool __is_daemon_in_service(void) { return open_access_count == 1; } static inline long check_ioctl_command(unsigned int command) { /* Check for valid ioctl codes */ if (_IOC_TYPE(command) != ORANGEFS_DEV_MAGIC) { gossip_err("device ioctl magic numbers don't match! Did you rebuild pvfs2-client-core/libpvfs2? [cmd %x, magic %x != %x]\n", command, _IOC_TYPE(command), ORANGEFS_DEV_MAGIC); return -EINVAL; } /* and valid ioctl commands */ if (_IOC_NR(command) >= ORANGEFS_DEV_MAXNR || _IOC_NR(command) <= 0) { gossip_err("Invalid ioctl command number [%d >= %d]\n", _IOC_NR(command), ORANGEFS_DEV_MAXNR); return -ENOIOCTLCMD; } return 0; } static long dispatch_ioctl_command(unsigned int command, unsigned long arg) { static __s32 magic = ORANGEFS_DEVREQ_MAGIC; static __s32 max_up_size = MAX_DEV_REQ_UPSIZE; static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE; struct ORANGEFS_dev_map_desc user_desc; int ret = 0; int upstream_kmod = 1; struct orangefs_sb_info_s *orangefs_sb; /* mtmoore: add locking here */ switch (command) { case ORANGEFS_DEV_GET_MAGIC: return ((put_user(magic, (__s32 __user *) arg) == -EFAULT) ? -EIO : 0); case ORANGEFS_DEV_GET_MAX_UPSIZE: return ((put_user(max_up_size, (__s32 __user *) arg) == -EFAULT) ? -EIO : 0); case ORANGEFS_DEV_GET_MAX_DOWNSIZE: return ((put_user(max_down_size, (__s32 __user *) arg) == -EFAULT) ? -EIO : 0); case ORANGEFS_DEV_MAP: ret = copy_from_user(&user_desc, (struct ORANGEFS_dev_map_desc __user *) arg, sizeof(struct ORANGEFS_dev_map_desc)); /* WTF -EIO and not -EFAULT? */ return ret ? 
-EIO : orangefs_bufmap_initialize(&user_desc); case ORANGEFS_DEV_REMOUNT_ALL: gossip_debug(GOSSIP_DEV_DEBUG, "%s: got ORANGEFS_DEV_REMOUNT_ALL\n", __func__); /* * remount all mounted orangefs volumes to regain the lost * dynamic mount tables (if any) -- NOTE: this is done * without keeping the superblock list locked due to the * upcall/downcall waiting. also, the request mutex is * used to ensure that no operations will be serviced until * all of the remounts are serviced (to avoid ops between * mounts to fail) */ ret = mutex_lock_interruptible(&orangefs_request_mutex); if (ret < 0) return ret; gossip_debug(GOSSIP_DEV_DEBUG, "%s: priority remount in progress\n", __func__); spin_lock(&orangefs_superblocks_lock); list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) { /* * We have to drop the spinlock, so entries can be * removed. They can't be freed, though, so we just * keep the forward pointers and zero the back ones - * that way we can get to the rest of the list. */ if (!orangefs_sb->list.prev) continue; gossip_debug(GOSSIP_DEV_DEBUG, "%s: Remounting SB %p\n", __func__, orangefs_sb); spin_unlock(&orangefs_superblocks_lock); ret = orangefs_remount(orangefs_sb); spin_lock(&orangefs_superblocks_lock); if (ret) { gossip_debug(GOSSIP_DEV_DEBUG, "SB %p remount failed\n", orangefs_sb); break; } } spin_unlock(&orangefs_superblocks_lock); gossip_debug(GOSSIP_DEV_DEBUG, "%s: priority remount complete\n", __func__); mutex_unlock(&orangefs_request_mutex); return ret; case ORANGEFS_DEV_UPSTREAM: ret = copy_to_user((void __user *)arg, &upstream_kmod, sizeof(upstream_kmod)); if (ret != 0) return -EIO; else return ret; case ORANGEFS_DEV_CLIENT_MASK: return orangefs_debugfs_new_client_mask((void __user *)arg); case ORANGEFS_DEV_CLIENT_STRING: return orangefs_debugfs_new_client_string((void __user *)arg); case ORANGEFS_DEV_DEBUG: return orangefs_debugfs_new_debug((void __user *)arg); default: return -ENOIOCTLCMD; } return -ENOIOCTLCMD; } static long orangefs_devreq_ioctl(struct file *file, unsigned int command, unsigned long arg) { long ret; /* Check for properly constructed commands */ ret = check_ioctl_command(command); if (ret < 0) return (int)ret; return (int)dispatch_ioctl_command(command, arg); } #ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */ /* Compat structure for the ORANGEFS_DEV_MAP ioctl */ struct ORANGEFS_dev_map_desc32 { compat_uptr_t ptr; __s32 total_size; __s32 size; __s32 count; }; /* * 32 bit user-space apps' ioctl handlers when kernel modules * is compiled as a 64 bit one */ static long orangefs_devreq_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long args) { long ret; /* Check for properly constructed commands */ ret = check_ioctl_command(cmd); if (ret < 0) return ret; if (cmd == ORANGEFS_DEV_MAP) { struct ORANGEFS_dev_map_desc desc; struct ORANGEFS_dev_map_desc32 d32; if (copy_from_user(&d32, (void __user *)args, sizeof(d32))) return -EFAULT; desc.ptr = compat_ptr(d32.ptr); desc.total_size = d32.total_size; desc.size = d32.size; desc.count = d32.count; return orangefs_bufmap_initialize(&desc); } /* no other ioctl requires translation */ return dispatch_ioctl_command(cmd, args); } #endif /* CONFIG_COMPAT is in .config */ static __poll_t orangefs_devreq_poll(struct file *file, struct poll_table_struct *poll_table) { __poll_t poll_revent_mask = 0; poll_wait(file, &orangefs_request_list_waitq, poll_table); if (!list_empty(&orangefs_request_list)) poll_revent_mask |= EPOLLIN; return poll_revent_mask; } /* the assigned character device major number */ static 
int orangefs_dev_major; static const struct file_operations orangefs_devreq_file_operations = { .owner = THIS_MODULE, .read = orangefs_devreq_read, .write_iter = orangefs_devreq_write_iter, .open = orangefs_devreq_open, .release = orangefs_devreq_release, .unlocked_ioctl = orangefs_devreq_ioctl, #ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */ .compat_ioctl = orangefs_devreq_compat_ioctl, #endif .poll = orangefs_devreq_poll }; /* * Initialize orangefs device specific state: * Must be called at module load time only */ int orangefs_dev_init(void) { /* register orangefs-req device */ orangefs_dev_major = register_chrdev(0, ORANGEFS_REQDEVICE_NAME, &orangefs_devreq_file_operations); if (orangefs_dev_major < 0) { gossip_debug(GOSSIP_DEV_DEBUG, "Failed to register /dev/%s (error %d)\n", ORANGEFS_REQDEVICE_NAME, orangefs_dev_major); return orangefs_dev_major; } gossip_debug(GOSSIP_DEV_DEBUG, "*** /dev/%s character device registered ***\n", ORANGEFS_REQDEVICE_NAME); gossip_debug(GOSSIP_DEV_DEBUG, "'mknod /dev/%s c %d 0'.\n", ORANGEFS_REQDEVICE_NAME, orangefs_dev_major); return 0; } void orangefs_dev_cleanup(void) { unregister_chrdev(orangefs_dev_major, ORANGEFS_REQDEVICE_NAME); gossip_debug(GOSSIP_DEV_DEBUG, "*** /dev/%s character device unregistered ***\n", ORANGEFS_REQDEVICE_NAME); }
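/*
 * Illustrative aside (not part of the orangefs sources): the record that
 * orangefs_devreq_read() hands to the userspace client-core is laid out as a
 * __s32 protocol version, a __s32 magic, a __u64 tag and then the
 * orangefs_upcall_s payload.  A hedged sketch of the daemon side follows;
 * DEVICE_PATH and BUF_SIZE are placeholders - the real client learns the
 * buffer size via the ORANGEFS_DEV_GET_MAX_UPSIZE ioctl and must read with
 * exactly that size and with O_NONBLOCK set, as enforced above.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define DEVICE_PATH "/dev/pvfs2-req"    /* ORANGEFS_REQDEVICE_NAME */
#define BUF_SIZE 65536                  /* placeholder for MAX_DEV_REQ_UPSIZE */

int main(void)
{
        unsigned char buf[BUF_SIZE];
        int32_t proto_ver, magic;
        uint64_t tag;
        ssize_t n;
        int fd;

        fd = open(DEVICE_PATH, O_RDWR | O_NONBLOCK);
        if (fd < 0)
                return 1;
        /* read() fails with EAGAIN when no upcall is pending */
        n = read(fd, buf, sizeof(buf));
        if (n > 0) {
                /* same offsets as the copy_to_user() calls in the kernel */
                memcpy(&proto_ver, buf, sizeof(proto_ver));
                memcpy(&magic, buf + 4, sizeof(magic));
                memcpy(&tag, buf + 8, sizeof(tag));
                printf("upcall: proto %d magic %#x tag %llu\n",
                       proto_ver, magic, (unsigned long long)tag);
        }
        close(fd);
        return 0;
}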
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <linux/rculist.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/bpf_local_storage.h> #include <net/bpf_sk_storage.h> #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(sk_cache); static struct bpf_local_storage_data * bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *sk_storage; struct bpf_local_storage_map *smap; sk_storage = rcu_dereference_check(sk->sk_bpf_storage, bpf_rcu_lock_held()); if (!sk_storage) return NULL; smap = (struct bpf_local_storage_map *)map; return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit); } static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) { struct bpf_local_storage_data *sdata; sdata = bpf_sk_storage_lookup(sk, map, false); if (!sdata) return -ENOENT; bpf_selem_unlink(SELEM(sdata), false); return 0; } /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage) { rcu_read_unlock(); return; } bpf_local_storage_destroy(sk_storage); rcu_read_unlock(); } static void bpf_sk_storage_map_free(struct bpf_map *map) { bpf_local_storage_map_free(map, &sk_cache, NULL); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) { return bpf_local_storage_map_alloc(attr, &sk_cache, false); } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) { struct bpf_local_storage_data *sdata; struct socket *sock; int fd, err; fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { sdata = bpf_sk_storage_lookup(sock->sk, map, true); sockfd_put(sock); return sdata ?
sdata->data : NULL; } return ERR_PTR(err); } static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct socket *sock; int fd, err; fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { sdata = bpf_local_storage_update( sock->sk, (struct bpf_local_storage_map *)map, value, map_flags, GFP_ATOMIC); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } return err; } static long bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) { struct socket *sock; int fd, err; fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { err = bpf_sk_storage_del(sock->sk, map); sockfd_put(sock); return err; } return err; } static struct bpf_local_storage_elem * bpf_sk_storage_clone_elem(struct sock *newsk, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_elem *copy_selem; copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC); if (!copy_selem) return NULL; if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)) copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, SDATA(selem)->data, true); else copy_map_value(&smap->map, SDATA(copy_selem)->data, SDATA(selem)->data); return copy_selem; } int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) { struct bpf_local_storage *new_sk_storage = NULL; struct bpf_local_storage *sk_storage; struct bpf_local_storage_elem *selem; int ret = 0; RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage || hlist_empty(&sk_storage->list)) goto out; hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { struct bpf_local_storage_elem *copy_selem; struct bpf_local_storage_map *smap; struct bpf_map *map; smap = rcu_dereference(SDATA(selem)->smap); if (!(smap->map.map_flags & BPF_F_CLONE)) continue; /* Note that for lockless listeners adding new element * here can race with cleanup in bpf_local_storage_map_free. * Try to grab map refcnt to make sure that it's still * alive and prevent concurrent removal. */ map = bpf_map_inc_not_zero(&smap->map); if (IS_ERR(map)) continue; copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem); if (!copy_selem) { ret = -ENOMEM; bpf_map_put(map); goto out; } if (new_sk_storage) { bpf_selem_link_map(smap, copy_selem); bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); if (ret) { bpf_selem_free(copy_selem, smap, true); atomic_sub(smap->elem_size, &newsk->sk_omem_alloc); bpf_map_put(map); goto out; } new_sk_storage = rcu_dereference(copy_selem->local_storage); } bpf_map_put(map); } out: rcu_read_unlock(); /* In case of an error, don't free anything explicitly here, the * caller is responsible to call bpf_sk_storage_free. */ return ret; } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; sdata = bpf_sk_storage_lookup(sk, map, true); if (sdata) return (unsigned long)sdata->data; if (flags == BPF_SK_STORAGE_GET_F_CREATE && /* Cannot add new elem to a going away sk. * Otherwise, the new elem may become a leak * (and also other memory issues during map * destruction). 
*/ refcount_inc_not_zero(&sk->sk_refcnt)) { sdata = bpf_local_storage_update( sk, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, gfp_flags); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. */ sock_put(sk); return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } return (unsigned long)NULL; } BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) { WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!sk || !sk_fullsock(sk)) return -EINVAL; if (refcount_inc_not_zero(&sk->sk_refcnt)) { int err; err = bpf_sk_storage_del(sk, map); sock_put(sk); return err; } return -ENOENT; } static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { struct sock *sk = (struct sock *)owner; int optmem_max; optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); /* same check as in sock_kmalloc() */ if (size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { atomic_add(size, &sk->sk_omem_alloc); return 0; } return -ENOMEM; } static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap, void *owner, u32 size) { struct sock *sk = owner; atomic_sub(size, &sk->sk_omem_alloc); } static struct bpf_local_storage __rcu ** bpf_sk_storage_ptr(void *owner) { struct sock *sk = owner; return &sk->sk_bpf_storage; } const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, .map_alloc = bpf_sk_storage_map_alloc, .map_free = bpf_sk_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_local_storage_charge = bpf_sk_storage_charge, .map_local_storage_uncharge = bpf_sk_storage_uncharge, .map_owner_storage_ptr = bpf_sk_storage_ptr, .map_mem_usage = bpf_local_storage_map_mem_usage, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { .func = bpf_sk_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto = { .func = bpf_sk_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_CTX, /* context is 'struct sock' */ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_sk_storage_delete_proto = { .func = bpf_sk_storage_delete, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) { const struct btf *btf_vmlinux; const struct btf_type *t; const char *tname; u32 btf_id; if (prog->aux->dst_prog) return false; /* Ensure the tracing program is not tracing * any bpf_sk_storage*() function and also * use the bpf_sk_storage_(get|delete) helper. 
*/ switch (prog->expected_attach_type) { case BPF_TRACE_ITER: case BPF_TRACE_RAW_TP: /* bpf_sk_storage has no trace point */ return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: btf_vmlinux = bpf_get_btf_vmlinux(); if (IS_ERR_OR_NULL(btf_vmlinux)) return false; btf_id = prog->aux->attach_btf_id; t = btf_type_by_id(btf_vmlinux, btf_id); tname = btf_name_by_offset(btf_vmlinux, t->name_off); return !!strncmp(tname, "bpf_sk_storage", strlen("bpf_sk_storage")); default: return false; } return false; } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags, gfp_t, gfp_flags) { WARN_ON_ONCE(!bpf_rcu_lock_held()); if (in_hardirq() || in_nmi()) return (unsigned long)NULL; return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags, gfp_flags); } BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, struct sock *, sk) { WARN_ON_ONCE(!bpf_rcu_lock_held()); if (in_hardirq() || in_nmi()) return -EPERM; return ____bpf_sk_storage_delete(map, sk); } const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { .func = bpf_sk_storage_get_tracing, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, .allowed = bpf_sk_storage_tracing_allowed, }; const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { .func = bpf_sk_storage_delete_tracing, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .allowed = bpf_sk_storage_tracing_allowed, }; struct bpf_sk_storage_diag { u32 nr_maps; struct bpf_map *maps[]; }; /* The reply will be like: * INET_DIAG_BPF_SK_STORAGES (nla_nest) * SK_DIAG_BPF_STORAGE (nla_nest) * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) * SK_DIAG_BPF_STORAGE (nla_nest) * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) * .... */ static int nla_value_size(u32 value_size) { /* SK_DIAG_BPF_STORAGE (nla_nest) * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) */ return nla_total_size(0) + nla_total_size(sizeof(u32)) + nla_total_size_64bit(value_size); } void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) { u32 i; if (!diag) return; for (i = 0; i < diag->nr_maps; i++) bpf_map_put(diag->maps[i]); kfree(diag); } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free); static bool diag_check_dup(const struct bpf_sk_storage_diag *diag, const struct bpf_map *map) { u32 i; for (i = 0; i < diag->nr_maps; i++) { if (diag->maps[i] == map) return true; } return false; } struct bpf_sk_storage_diag * bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) { struct bpf_sk_storage_diag *diag; struct nlattr *nla; u32 nr_maps = 0; int rem, err; /* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. 
*/ if (!bpf_capable()) return ERR_PTR(-EPERM); nla_for_each_nested(nla, nla_stgs, rem) { if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) { if (nla_len(nla) != sizeof(u32)) return ERR_PTR(-EINVAL); nr_maps++; } } diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); if (!diag) return ERR_PTR(-ENOMEM); nla_for_each_nested(nla, nla_stgs, rem) { struct bpf_map *map; int map_fd; if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) continue; map_fd = nla_get_u32(nla); map = bpf_map_get(map_fd); if (IS_ERR(map)) { err = PTR_ERR(map); goto err_free; } if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) { bpf_map_put(map); err = -EINVAL; goto err_free; } if (diag_check_dup(diag, map)) { bpf_map_put(map); err = -EEXIST; goto err_free; } diag->maps[diag->nr_maps++] = map; } return diag; err_free: bpf_sk_storage_diag_free(diag); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) { struct nlattr *nla_stg, *nla_value; struct bpf_local_storage_map *smap; /* It cannot exceed max nlattr's payload */ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE); nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE); if (!nla_stg) return -EMSGSIZE; smap = rcu_dereference(sdata->smap); if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) goto errout; nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE, smap->map.value_size, SK_DIAG_BPF_STORAGE_PAD); if (!nla_value) goto errout; if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)) copy_map_value_locked(&smap->map, nla_data(nla_value), sdata->data, true); else copy_map_value(&smap->map, nla_data(nla_value), sdata->data); nla_nest_end(skb, nla_stg); return 0; errout: nla_nest_cancel(skb, nla_stg); return -EMSGSIZE; } static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, int stg_array_type, unsigned int *res_diag_size) { /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ unsigned int diag_size = nla_total_size(0); struct bpf_local_storage *sk_storage; struct bpf_local_storage_elem *selem; struct bpf_local_storage_map *smap; struct nlattr *nla_stgs; unsigned int saved_len; int err = 0; rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage || hlist_empty(&sk_storage->list)) { rcu_read_unlock(); return 0; } nla_stgs = nla_nest_start(skb, stg_array_type); if (!nla_stgs) /* Continue to learn diag_size */ err = -EMSGSIZE; saved_len = skb->len; hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { smap = rcu_dereference(SDATA(selem)->smap); diag_size += nla_value_size(smap->map.value_size); if (nla_stgs && diag_get(SDATA(selem), skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } rcu_read_unlock(); if (nla_stgs) { if (saved_len == skb->len) nla_nest_cancel(skb, nla_stgs); else nla_nest_end(skb, nla_stgs); } if (diag_size == nla_total_size(0)) { *res_diag_size = 0; return 0; } *res_diag_size = diag_size; return err; } int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, struct sock *sk, struct sk_buff *skb, int stg_array_type, unsigned int *res_diag_size) { /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ unsigned int diag_size = nla_total_size(0); struct bpf_local_storage *sk_storage; struct bpf_local_storage_data *sdata; struct nlattr *nla_stgs; unsigned int saved_len; int err = 0; u32 i; *res_diag_size = 0; /* No map has been specified. Dump all. 
*/ if (!diag->nr_maps) return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type, res_diag_size); rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage || hlist_empty(&sk_storage->list)) { rcu_read_unlock(); return 0; } nla_stgs = nla_nest_start(skb, stg_array_type); if (!nla_stgs) /* Continue to learn diag_size */ err = -EMSGSIZE; saved_len = skb->len; for (i = 0; i < diag->nr_maps; i++) { sdata = bpf_local_storage_lookup(sk_storage, (struct bpf_local_storage_map *)diag->maps[i], false); if (!sdata) continue; diag_size += nla_value_size(diag->maps[i]->value_size); if (nla_stgs && diag_get(sdata, skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } rcu_read_unlock(); if (nla_stgs) { if (saved_len == skb->len) nla_nest_cancel(skb, nla_stgs); else nla_nest_end(skb, nla_stgs); } if (diag_size == nla_total_size(0)) { *res_diag_size = 0; return 0; } *res_diag_size = diag_size; return err; } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put); struct bpf_iter_seq_sk_storage_map_info { struct bpf_map *map; unsigned int bucket_id; unsigned skip_elems; }; static struct bpf_local_storage_elem * bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, struct bpf_local_storage_elem *prev_selem) __acquires(RCU) __releases(RCU) { struct bpf_local_storage *sk_storage; struct bpf_local_storage_elem *selem; u32 skip_elems = info->skip_elems; struct bpf_local_storage_map *smap; u32 bucket_id = info->bucket_id; u32 i, count, n_buckets; struct bpf_local_storage_map_bucket *b; smap = (struct bpf_local_storage_map *)info->map; n_buckets = 1U << smap->bucket_log; if (bucket_id >= n_buckets) return NULL; /* try to find next selem in the same bucket */ selem = prev_selem; count = 0; while (selem) { selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)), struct bpf_local_storage_elem, map_node); if (!selem) { /* not found, unlock and go to the next bucket */ b = &smap->buckets[bucket_id++]; rcu_read_unlock(); skip_elems = 0; break; } sk_storage = rcu_dereference(selem->local_storage); if (sk_storage) { info->skip_elems = skip_elems + count; return selem; } count++; } for (i = bucket_id; i < (1U << smap->bucket_log); i++) { b = &smap->buckets[i]; rcu_read_lock(); count = 0; hlist_for_each_entry_rcu(selem, &b->list, map_node) { sk_storage = rcu_dereference(selem->local_storage); if (sk_storage && count >= skip_elems) { info->bucket_id = i; info->skip_elems = count; return selem; } count++; } rcu_read_unlock(); skip_elems = 0; } info->bucket_id = i; info->skip_elems = 0; return NULL; } static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos) { struct bpf_local_storage_elem *selem; selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL); if (!selem) return NULL; if (*pos == 0) ++*pos; return selem; } static void *bpf_sk_storage_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_sk_storage_map_info *info = seq->private; ++*pos; ++info->skip_elems; return bpf_sk_storage_map_seq_find_next(seq->private, v); } struct bpf_iter__bpf_sk_storage_map { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(struct sock *, sk); __bpf_md_ptr(void *, value); }; DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta, struct bpf_map *map, struct sock *sk, void *value) static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, struct bpf_local_storage_elem *selem) { struct bpf_iter_seq_sk_storage_map_info *info = seq->private; struct 
bpf_iter__bpf_sk_storage_map ctx = {}; struct bpf_local_storage *sk_storage; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret = 0; meta.seq = seq; prog = bpf_iter_get_info(&meta, selem == NULL); if (prog) { ctx.meta = &meta; ctx.map = info->map; if (selem) { sk_storage = rcu_dereference(selem->local_storage); ctx.sk = sk_storage->owner; ctx.value = SDATA(selem)->data; } ret = bpf_iter_run_prog(prog, &ctx); } return ret; } static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v) { return __bpf_sk_storage_map_seq_show(seq, v); } static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { if (!v) (void)__bpf_sk_storage_map_seq_show(seq, v); else rcu_read_unlock(); } static int bpf_iter_init_sk_storage_map(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; bpf_map_inc_with_uref(aux->map); seq_info->map = aux->map; return 0; } static void bpf_iter_fini_sk_storage_map(void *priv_data) { struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; bpf_map_put_with_uref(seq_info->map); } static int bpf_iter_attach_map(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) { struct bpf_map *map; int err = -EINVAL; if (!linfo->map.map_fd) return -EBADF; map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) goto put_map; if (prog->aux->max_rdwr_access > map->value_size) { err = -EACCES; goto put_map; } aux->map = map; return 0; put_map: bpf_map_put_with_uref(map); return err; } static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) { bpf_map_put_with_uref(aux->map); } static const struct seq_operations bpf_sk_storage_map_seq_ops = { .start = bpf_sk_storage_map_seq_start, .next = bpf_sk_storage_map_seq_next, .stop = bpf_sk_storage_map_seq_stop, .show = bpf_sk_storage_map_seq_show, }; static const struct bpf_iter_seq_info iter_seq_info = { .seq_ops = &bpf_sk_storage_map_seq_ops, .init_seq_private = bpf_iter_init_sk_storage_map, .fini_seq_private = bpf_iter_fini_sk_storage_map, .seq_priv_size = sizeof(struct bpf_iter_seq_sk_storage_map_info), }; static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { .target = "bpf_sk_storage_map", .attach_target = bpf_iter_attach_map, .detach_target = bpf_iter_detach_map, .show_fdinfo = bpf_iter_map_show_fdinfo, .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), PTR_TO_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; static int __init bpf_sk_storage_map_iter_init(void) { bpf_sk_storage_map_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK]; return bpf_iter_reg_target(&bpf_sk_storage_map_reg_info); } late_initcall(bpf_sk_storage_map_iter_init);
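/*
 * Illustrative aside (not part of bpf_sk_storage.c): a minimal BPF-side user
 * of the sk_storage map type and the bpf_sk_storage_get() helper implemented
 * above.  This sketch assumes the usual libbpf build setup (vmlinux.h and
 * bpf_helpers.h); the map name, section and counter semantics are made up
 * for the example.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_SK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);   /* required for local storage maps */
        __type(key, int);
        __type(value, __u64);
} pkt_count SEC(".maps");

SEC("sockops")
int count_sockops(struct bpf_sock_ops *ctx)
{
        struct bpf_sock *sk = ctx->sk;
        __u64 *cnt;

        if (!sk)
                return 1;
        /* creates the per-socket slot on first use, then bumps the counter */
        cnt = bpf_sk_storage_get(&pkt_count, sk, 0, BPF_SK_STORAGE_GET_F_CREATE);
        if (cnt)
                __sync_fetch_and_add(cnt, 1);
        return 1;
}

char _license[] SEC("license") = "GPL";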
// SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Source Hashing scheduling module * * Authors: Wensong Zhang <wensong@gnuchina.org> * * Changes: */ /* * The sh algorithm is to select server by the hash key of source IP * address. The pseudo code is as follows: * * n <- servernode[src_ip]; * if (n is dead) OR * (n is overloaded) or (n.weight <= 0) then * return NULL; * * return n; * * Notes that servernode is a 256-bucket hash table that maps the hash * index derived from packet source IP address to the current server * array. If the sh scheduler is used in cache cluster, it is good to * combine it with cache_bypass feature. When the statically assigned * server is dead or overloaded, the load balancer can bypass the cache * server and send requests to the original server directly. * * The weight destination attribute can be used to control the * distribution of connections to the destinations in servernode. The * greater the weight, the more connections the destination * will receive.
* */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/ip.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <net/ip_vs.h> #include <net/tcp.h> #include <linux/udp.h> #include <linux/sctp.h> /* * IPVS SH bucket */ struct ip_vs_sh_bucket { struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* * for IPVS SH entry hash table */ #ifndef CONFIG_IP_VS_SH_TAB_BITS #define CONFIG_IP_VS_SH_TAB_BITS 8 #endif #define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS #define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) #define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) struct ip_vs_sh_state { struct rcu_head rcu_head; struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; }; /* Helper function to determine if server is unavailable */ static inline bool is_unavailable(struct ip_vs_dest *dest) { return atomic_read(&dest->weight) <= 0 || dest->flags & IP_VS_DEST_F_OVERLOAD; } /* * Returns hash value for IPVS SH entry */ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, __be16 port, unsigned int offset) { __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return (offset + hash_32(ntohs(port) + ntohl(addr_fold), IP_VS_SH_TAB_BITS)) & IP_VS_SH_TAB_MASK; } /* * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s, const union nf_inet_addr *addr, __be16 port) { unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0); struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest); return (!dest || is_unavailable(dest)) ? NULL : dest; } /* As ip_vs_sh_get, but with fallback if selected server is unavailable * * The fallback strategy loops around the table starting from a "random" * point (in fact, it is chosen to be the original hash value to make the * algorithm deterministic) to find a new server. */ static inline struct ip_vs_dest * ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, const union nf_inet_addr *addr, __be16 port) { unsigned int offset, roffset; unsigned int hash, ihash; struct ip_vs_dest *dest; /* first try the dest it's supposed to go to */ ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0); dest = rcu_dereference(s->buckets[ihash].dest); if (!dest) return NULL; if (!is_unavailable(dest)) return dest; IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); /* if the original dest is unavailable, loop around the table * starting from ihash to find a new dest */ for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE; hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset); dest = rcu_dereference(s->buckets[hash].dest); if (!dest) break; if (!is_unavailable(dest)) return dest; IP_VS_DBG_BUF(6, "SH: selected unavailable " "server %s:%d (offset %d), reselecting", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), roffset); } return NULL; } /* * Assign all the hash buckets of the specified table with the service. 
*/ static int ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_sh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; int d_count; bool empty; b = &s->buckets[0]; p = &svc->destinations; empty = list_empty(p); d_count = 0; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { dest = rcu_dereference_protected(b->dest, 1); if (dest) ip_vs_dest_put(dest); if (empty) RCU_INIT_POINTER(b->dest, NULL); else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); ip_vs_dest_hold(dest); RCU_INIT_POINTER(b->dest, dest); IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", i, IP_VS_DBG_ADDR(dest->af, &dest->addr), atomic_read(&dest->weight)); /* Don't move to next dest until filling weight */ if (++d_count >= atomic_read(&dest->weight)) { p = p->next; d_count = 0; } } b++; } return 0; } /* * Flush all the hash buckets of the specified table. */ static void ip_vs_sh_flush(struct ip_vs_sh_state *s) { int i; struct ip_vs_sh_bucket *b; struct ip_vs_dest *dest; b = &s->buckets[0]; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { dest = rcu_dereference_protected(b->dest, 1); if (dest) { ip_vs_dest_put(dest); RCU_INIT_POINTER(b->dest, NULL); } b++; } } static int ip_vs_sh_init_svc(struct ip_vs_service *svc) { struct ip_vs_sh_state *s; /* allocate the SH table for this service */ s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL); if (s == NULL) return -ENOMEM; svc->sched_data = s; IP_VS_DBG(6, "SH hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); /* assign the hash buckets with current dests */ ip_vs_sh_reassign(s, svc); return 0; } static void ip_vs_sh_done_svc(struct ip_vs_service *svc) { struct ip_vs_sh_state *s = svc->sched_data; /* got to clean up hash buckets here */ ip_vs_sh_flush(s); /* release the table itself */ kfree_rcu(s, rcu_head); IP_VS_DBG(6, "SH hash table (memory=%zdbytes) released\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); } static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, struct ip_vs_dest *dest) { struct ip_vs_sh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ ip_vs_sh_reassign(s, svc); return 0; } /* Helper function to get port number */ static inline __be16 ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { __be16 _ports[2], *ports; /* At this point we know that we have a valid packet of some kind. * Because ICMP packets are only guaranteed to have the first 8 * bytes, let's just grab the ports. Fortunately they're in the * same position for all three of the protocols we care about. */ switch (iph->protocol) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_SCTP: ports = skb_header_pointer(skb, iph->len, sizeof(_ports), &_ports); if (unlikely(!ports)) return 0; if (likely(!ip_vs_iph_inverse(iph))) return ports[0]; else return ports[1]; default: return 0; } } /* * Source Hashing scheduling */ static struct ip_vs_dest * ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; struct ip_vs_sh_state *s; __be16 port = 0; const union nf_inet_addr *hash_addr; hash_addr = ip_vs_iph_inverse(iph) ? 
&iph->daddr : &iph->saddr; IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT) port = ip_vs_sh_get_port(skb, iph); s = (struct ip_vs_sh_state *) svc->sched_data; if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) dest = ip_vs_sh_get_fallback(svc, s, hash_addr, port); else dest = ip_vs_sh_get(svc, s, hash_addr, port); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, hash_addr), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } /* * IPVS SH Scheduler structure */ static struct ip_vs_scheduler ip_vs_sh_scheduler = { .name = "sh", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), .init_service = ip_vs_sh_init_svc, .done_service = ip_vs_sh_done_svc, .add_dest = ip_vs_sh_dest_changed, .del_dest = ip_vs_sh_dest_changed, .upd_dest = ip_vs_sh_dest_changed, .schedule = ip_vs_sh_schedule, }; static int __init ip_vs_sh_init(void) { return register_ip_vs_scheduler(&ip_vs_sh_scheduler); } static void __exit ip_vs_sh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); synchronize_rcu(); } module_init(ip_vs_sh_init); module_exit(ip_vs_sh_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs source hashing scheduler");
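ip_vs_sh_hashkey() above folds the source address and port into a bucket index, and ip_vs_sh_get_fallback() probes the rest of the table deterministically when the chosen destination is unavailable. The following self-contained sketch shows only that selection logic: toy_hash_32() is a stand-in for the kernel's hash_32(), the availability table is invented for the demo, and byte-order handling is omitted.

/* Illustrative sketch (not kernel code) of source-hash bucket selection
 * with a deterministic fallback probe, modelled on ip_vs_sh above. */
#include <stdio.h>
#include <stdint.h>

#define SH_TAB_BITS 8
#define SH_TAB_SIZE (1 << SH_TAB_BITS)
#define SH_TAB_MASK (SH_TAB_SIZE - 1)

/* Stand-in for hash_32(): a 32-bit multiplicative hash keeping the top
 * SH_TAB_BITS bits; the multiplier is the usual golden-ratio constant. */
static unsigned int toy_hash_32(uint32_t val, unsigned int bits)
{
	return (uint32_t)(val * 0x61C88647u) >> (32 - bits);
}

static unsigned int sh_hashkey(uint32_t saddr, uint16_t sport, unsigned int offset)
{
	return (offset + toy_hash_32(sport + saddr, SH_TAB_BITS)) & SH_TAB_MASK;
}

/* 0 = available, 1 = dead/overloaded; filled in by the demo */
static int unavailable[SH_TAB_SIZE];

/* Mirror ip_vs_sh_get_fallback(): try the primary bucket first, then walk
 * the table starting from a point derived from the original hash. */
static int sh_pick_bucket(uint32_t saddr, uint16_t sport)
{
	unsigned int ihash = sh_hashkey(saddr, sport, 0);
	unsigned int offset, roffset, hash;

	if (!unavailable[ihash])
		return ihash;

	for (offset = 0; offset < SH_TAB_SIZE; offset++) {
		roffset = (offset + ihash) % SH_TAB_SIZE;
		hash = sh_hashkey(saddr, sport, roffset);
		if (!unavailable[hash])
			return hash;
	}
	return -1;	/* every bucket unavailable */
}

int main(void)
{
	uint32_t saddr = 0xC0A80001;	/* 192.168.0.1 */

	unavailable[sh_hashkey(saddr, 40000, 0)] = 1;	/* knock out the primary */
	printf("fallback bucket: %d\n", sh_pick_bucket(saddr, 40000));
	return 0;
}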
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/bitmap.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) */ #include <linux/buffer_head.h> #include "ext4.h" unsigned int ext4_count_free(char *bitmap, unsigned int numchars) { return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); } int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz) { __u32 hi; __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); if (!ext4_has_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); provided |= (hi << 16); } else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz) { __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); if (!ext4_has_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); } int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { __u32 hi; __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; if (!ext4_has_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); provided |= (hi << 16); } else calculated &= 0xFFFF; return provided == calculated; } void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); if (!ext4_has_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); }
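The ext4 helpers above store a 32-bit bitmap checksum as a mandatory low 16-bit field plus an optional high 16-bit field, and mask the calculated value when only the low half exists. The user-space sketch below shows just that split; the checksum function and descriptor struct are toy stand-ins for ext4_chksum() and struct ext4_group_desc.

/* Illustrative sketch (not kernel code) of the lo/hi checksum split used by
 * the ext4 bitmap checksum helpers above. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Toy descriptor: only the lo/hi checksum fields are modelled */
struct toy_gd {
	uint16_t csum_lo;
	uint16_t csum_hi;
};

/* Stand-in for ext4_chksum(); any 32-bit checksum works for the demo */
static uint32_t toy_csum(const uint8_t *buf, size_t len)
{
	uint32_t c = 0x12345678;

	while (len--)
		c = (c << 5) + c + *buf++;
	return c;
}

static void csum_set(struct toy_gd *gd, const uint8_t *bitmap, size_t sz,
		     int has_hi_field)
{
	uint32_t csum = toy_csum(bitmap, sz);

	gd->csum_lo = csum & 0xFFFF;
	if (has_hi_field)
		gd->csum_hi = csum >> 16;
}

static int csum_verify(const struct toy_gd *gd, const uint8_t *bitmap,
		       size_t sz, int has_hi_field)
{
	uint32_t provided = gd->csum_lo;
	uint32_t calculated = toy_csum(bitmap, sz);

	if (has_hi_field)
		provided |= (uint32_t)gd->csum_hi << 16;
	else
		calculated &= 0xFFFF;	/* only 16 bits were ever stored */
	return provided == calculated;
}

int main(void)
{
	uint8_t bitmap[64] = { 0xAA, 0x55 };
	struct toy_gd gd = { 0, 0 };

	csum_set(&gd, bitmap, sizeof(bitmap), 1);
	printf("verify: %d\n", csum_verify(&gd, bitmap, sizeof(bitmap), 1));
	bitmap[3] ^= 1;		/* corrupt a bit */
	printf("verify after corruption: %d\n",
	       csum_verify(&gd, bitmap, sizeof(bitmap), 1));
	return 0;
}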
// SPDX-License-Identifier: GPL-2.0-or-later /* * Create default crypto algorithm instances. * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <linux/completion.h> #include <linux/ctype.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string.h> #include "internal.h" struct cryptomgr_param { struct rtattr *tb[CRYPTO_MAX_ATTRS + 2]; struct { struct rtattr attr; struct crypto_attr_type data; } type; struct { struct rtattr attr; struct crypto_attr_alg data; } attrs[CRYPTO_MAX_ATTRS]; char template[CRYPTO_MAX_ALG_NAME]; struct crypto_larval *larval; u32 otype; u32 omask; }; struct crypto_test_param { char driver[CRYPTO_MAX_ALG_NAME]; char alg[CRYPTO_MAX_ALG_NAME]; u32 type; }; static int cryptomgr_probe(void *data) { struct cryptomgr_param *param = data; struct crypto_template *tmpl; int err; tmpl = crypto_lookup_template(param->template); if (!tmpl) goto out; do { err = tmpl->create(tmpl, param->tb); } while (err == -EAGAIN && !signal_pending(current)); crypto_tmpl_put(tmpl); out: complete_all(&param->larval->completion); crypto_alg_put(&param->larval->alg); kfree(param); module_put_and_kthread_exit(0); } static int cryptomgr_schedule_probe(struct crypto_larval *larval) { struct task_struct *thread; struct cryptomgr_param *param; const char *name = larval->alg.cra_name; const char *p; unsigned int len; int i; if (!try_module_get(THIS_MODULE)) goto err; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) goto err_put_module; for (p = name; isalnum(*p) || *p == '-' || *p == '_'; p++) ; len = p - name; if (!len || *p != '(') goto err_free_param; memcpy(param->template, name, len); i = 0; for (;;) { name = ++p; for (; isalnum(*p) || *p == '-' || *p == '_'; p++) ; if (*p == '(') { int recursion = 0; for (;;) { if (!*++p) goto err_free_param; if (*p == '(') recursion++; else if (*p == ')' && !recursion--) break; } p++; } len = p - name; if (!len) goto err_free_param; param->attrs[i].attr.rta_len = sizeof(param->attrs[i]); param->attrs[i].attr.rta_type = CRYPTOA_ALG; memcpy(param->attrs[i].data.name, name, len); param->tb[i + 1] = &param->attrs[i].attr; i++; if (i >= CRYPTO_MAX_ATTRS) goto err_free_param; if (*p == ')') break; if (*p != ',') goto err_free_param; } if (!i) goto err_free_param; param->tb[i + 1] = NULL; param->type.attr.rta_len = sizeof(param->type);
param->type.attr.rta_type = CRYPTOA_TYPE; param->type.data.type = larval->alg.cra_flags & ~CRYPTO_ALG_TESTED; param->type.data.mask = larval->mask & ~CRYPTO_ALG_TESTED; param->tb[0] = &param->type.attr; param->otype = larval->alg.cra_flags; param->omask = larval->mask; crypto_alg_get(&larval->alg); param->larval = larval; thread = kthread_run(cryptomgr_probe, param, "cryptomgr_probe"); if (IS_ERR(thread)) goto err_put_larval; return NOTIFY_STOP; err_put_larval: crypto_alg_put(&larval->alg); err_free_param: kfree(param); err_put_module: module_put(THIS_MODULE); err: return NOTIFY_OK; } static int cryptomgr_test(void *data) { struct crypto_test_param *param = data; u32 type = param->type; int err; err = alg_test(param->driver, param->alg, type, CRYPTO_ALG_TESTED); crypto_alg_tested(param->driver, err); kfree(param); module_put_and_kthread_exit(0); } static int cryptomgr_schedule_test(struct crypto_alg *alg) { struct task_struct *thread; struct crypto_test_param *param; if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) return NOTIFY_DONE; if (!try_module_get(THIS_MODULE)) goto err; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) goto err_put_module; memcpy(param->driver, alg->cra_driver_name, sizeof(param->driver)); memcpy(param->alg, alg->cra_name, sizeof(param->alg)); param->type = alg->cra_flags; thread = kthread_run(cryptomgr_test, param, "cryptomgr_test"); if (IS_ERR(thread)) goto err_free_param; return NOTIFY_STOP; err_free_param: kfree(param); err_put_module: module_put(THIS_MODULE); err: return NOTIFY_OK; } static int cryptomgr_notify(struct notifier_block *this, unsigned long msg, void *data) { switch (msg) { case CRYPTO_MSG_ALG_REQUEST: return cryptomgr_schedule_probe(data); case CRYPTO_MSG_ALG_REGISTER: return cryptomgr_schedule_test(data); case CRYPTO_MSG_ALG_LOADED: break; } return NOTIFY_DONE; } static struct notifier_block cryptomgr_notifier = { .notifier_call = cryptomgr_notify, }; static int __init cryptomgr_init(void) { return crypto_register_notifier(&cryptomgr_notifier); } static void __exit cryptomgr_exit(void) { int err = crypto_unregister_notifier(&cryptomgr_notifier); BUG_ON(err); } /* * This is arch_initcall() so that the crypto self-tests are run on algorithms * registered early by subsys_initcall(). subsys_initcall() is needed for * generic implementations so that they're available for comparison tests when * other implementations are registered later by module_init(). */ arch_initcall(cryptomgr_init); module_exit(cryptomgr_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Crypto Algorithm Manager");
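cryptomgr_schedule_probe() above splits an algorithm name such as "authenc(hmac(sha256),cbc(aes))" into a template name and its top-level arguments, using a depth counter to skip over nested parentheses. Below is a stand-alone sketch of the same parsing; buffer sizes, error handling, and the example names are illustrative and simplified, and none of the kernel's rtattr packing is reproduced.

/* Illustrative sketch (not kernel code) of parsing "template(arg1,arg2,...)"
 * names the way cryptomgr_schedule_probe() does above. */
#include <ctype.h>
#include <stdio.h>
#include <string.h>

#define MAX_ATTRS 4
#define MAX_NAME  64

/* Returns the number of top-level arguments parsed, or -1 on malformed input. */
static int parse_alg_name(const char *name, char *tmpl, char attrs[][MAX_NAME])
{
	const char *p = name, *start;
	size_t len;
	int i = 0;

	while (isalnum((unsigned char)*p) || *p == '-' || *p == '_')
		p++;
	len = p - name;
	if (!len || *p != '(' || len >= MAX_NAME)
		return -1;
	memcpy(tmpl, name, len);
	tmpl[len] = '\0';

	for (;;) {
		start = ++p;		/* skip '(' or ',' */
		while (isalnum((unsigned char)*p) || *p == '-' || *p == '_')
			p++;
		if (*p == '(') {	/* nested template: skip to matching ')' */
			int depth = 0;

			for (;;) {
				if (!*++p)
					return -1;
				if (*p == '(')
					depth++;
				else if (*p == ')' && !depth--)
					break;
			}
			p++;
		}
		len = p - start;
		if (!len || len >= MAX_NAME || i >= MAX_ATTRS)
			return -1;
		memcpy(attrs[i], start, len);
		attrs[i][len] = '\0';
		i++;
		if (*p == ')')
			break;
		if (*p != ',')
			return -1;
	}
	return i;
}

int main(void)
{
	char tmpl[MAX_NAME], attrs[MAX_ATTRS][MAX_NAME];
	int n = parse_alg_name("authenc(hmac(sha256),cbc(aes))", tmpl, attrs);

	printf("template=%s, %d args\n", tmpl, n);
	for (int i = 0; i < n; i++)
		printf("  arg[%d]=%s\n", i, attrs[i]);
	return 0;
}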
// SPDX-License-Identifier: GPL-2.0-or-later /* Structure dynamic extension infrastructure * Copyright (C) 2004 Rusty Russell IBM Corporation * Copyright (C) 2007 Netfilter Core Team <coreteam@netfilter.org> * Copyright (C) 2007 USAGI/WIDE Project <http://www.linux-ipv6.org> */ #include <linux/kernel.h> #include <linux/kmemleak.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_conntrack_act_ct.h> #include <net/netfilter/nf_nat.h> #define NF_CT_EXT_PREALLOC 128u /* conntrack events are on by default */ atomic_t nf_conntrack_ext_genid __read_mostly = ATOMIC_INIT(1); static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = { [NF_CT_EXT_HELPER] = sizeof(struct nf_conn_help), #if IS_ENABLED(CONFIG_NF_NAT) [NF_CT_EXT_NAT] = sizeof(struct nf_conn_nat), #endif [NF_CT_EXT_SEQADJ] = sizeof(struct nf_conn_seqadj), [NF_CT_EXT_ACCT] = sizeof(struct nf_conn_acct), #ifdef CONFIG_NF_CONNTRACK_EVENTS [NF_CT_EXT_ECACHE] = sizeof(struct nf_conntrack_ecache), #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_tstamp), #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_timeout), #endif #ifdef CONFIG_NF_CONNTRACK_LABELS [NF_CT_EXT_LABELS] = sizeof(struct nf_conn_labels), #endif #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) [NF_CT_EXT_SYNPROXY] = sizeof(struct nf_conn_synproxy), #endif #if IS_ENABLED(CONFIG_NET_ACT_CT) [NF_CT_EXT_ACT_CT] = sizeof(struct nf_conn_act_ct_ext), #endif }; static __always_inline unsigned int total_extension_size(void) { /* remember to add new extensions below */ BUILD_BUG_ON(NF_CT_EXT_NUM > 10); return sizeof(struct nf_ct_ext) + sizeof(struct nf_conn_help) #if IS_ENABLED(CONFIG_NF_NAT) + sizeof(struct nf_conn_nat) #endif + sizeof(struct nf_conn_seqadj) + sizeof(struct nf_conn_acct) #ifdef CONFIG_NF_CONNTRACK_EVENTS + sizeof(struct nf_conntrack_ecache) #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + sizeof(struct nf_conn_tstamp) #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT + sizeof(struct nf_conn_timeout) #endif #ifdef CONFIG_NF_CONNTRACK_LABELS + sizeof(struct nf_conn_labels) #endif #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + sizeof(struct nf_conn_synproxy) #endif #if IS_ENABLED(CONFIG_NET_ACT_CT) + sizeof(struct nf_conn_act_ct_ext) #endif ; } void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { unsigned int newlen, newoff, oldlen, alloc; struct nf_ct_ext *new; /* Conntrack
must not be confirmed to avoid races on reallocation. */ WARN_ON(nf_ct_is_confirmed(ct)); /* struct nf_ct_ext uses u8 to store offsets/size */ BUILD_BUG_ON(total_extension_size() > 255u); if (ct->ext) { const struct nf_ct_ext *old = ct->ext; if (__nf_ct_ext_exist(old, id)) return NULL; oldlen = old->len; } else { oldlen = sizeof(*new); } newoff = ALIGN(oldlen, __alignof__(struct nf_ct_ext)); newlen = newoff + nf_ct_ext_type_len[id]; alloc = max(newlen, NF_CT_EXT_PREALLOC); new = krealloc(ct->ext, alloc, gfp); if (!new) return NULL; if (!ct->ext) { memset(new->offset, 0, sizeof(new->offset)); new->gen_id = atomic_read(&nf_conntrack_ext_genid); } new->offset[id] = newoff; new->len = newlen; memset((void *)new + newoff, 0, newlen - newoff); ct->ext = new; return (void *)new + newoff; } EXPORT_SYMBOL(nf_ct_ext_add); /* Use nf_ct_ext_find wrapper. This is only useful for unconfirmed entries. */ void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id) { unsigned int gen_id = atomic_read(&nf_conntrack_ext_genid); unsigned int this_id = READ_ONCE(ext->gen_id); if (!__nf_ct_ext_exist(ext, id)) return NULL; if (this_id == 0 || ext->gen_id == gen_id) return (void *)ext + ext->offset[id]; return NULL; } EXPORT_SYMBOL(__nf_ct_ext_find); void nf_ct_ext_bump_genid(void) { unsigned int value = atomic_inc_return(&nf_conntrack_ext_genid); if (value == UINT_MAX) atomic_set(&nf_conntrack_ext_genid, 1); msleep(HZ); }
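nf_ct_ext_add() above grows a single allocation that holds every extension of a conntrack entry, aligning each new extension's offset and recording it in a small per-id table. The sketch below shows only that bookkeeping, with realloc() standing in for krealloc(); the structure, sizes, and alignment are invented, and the kernel's preallocation and generation-id handling are left out.

/* Illustrative sketch (not kernel code) of growing a packed extension area
 * with a per-id offset table, loosely modelled on nf_ct_ext_add() above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TOY_EXT_NUM 3
#define TOY_ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

/* Toy extension header: an offset per extension id plus the used length */
struct toy_ext {
	unsigned char offset[TOY_EXT_NUM];
	unsigned char len;
	char data[];
};

static const unsigned char toy_ext_len[TOY_EXT_NUM] = { 16, 8, 24 };

/* Grow (or create) the extension area and return a pointer to the new,
 * zeroed extension. Returns NULL if the id is already present or on OOM. */
static void *toy_ext_add(struct toy_ext **pext, int id)
{
	struct toy_ext *ext = *pext;
	unsigned int oldlen = ext ? ext->len : sizeof(struct toy_ext);
	unsigned int newoff, newlen;

	if (ext && ext->offset[id])
		return NULL;			/* already added */

	newoff = TOY_ALIGN(oldlen, 8);		/* keep extensions aligned */
	newlen = newoff + toy_ext_len[id];

	ext = realloc(ext, newlen);
	if (!ext)
		return NULL;
	if (!*pext)				/* first extension: init header */
		memset(ext->offset, 0, sizeof(ext->offset));
	ext->offset[id] = newoff;
	ext->len = newlen;
	memset((char *)ext + newoff, 0, newlen - newoff);
	*pext = ext;
	return (char *)ext + newoff;
}

int main(void)
{
	struct toy_ext *ext = NULL;

	toy_ext_add(&ext, 0);
	toy_ext_add(&ext, 2);
	printf("ext 0 at offset %d, ext 2 at offset %d, total %d bytes\n",
	       ext->offset[0], ext->offset[2], ext->len);
	free(ext);
	return 0;
}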
// SPDX-License-Identifier: GPL-2.0-only /* fs/fat/nfs.c */ #include <linux/exportfs.h> #include "fat.h" struct fat_fid { u32 i_gen; u32 i_pos_low; u16 i_pos_hi; u16 parent_i_pos_hi; u32 parent_i_pos_low; u32 parent_i_gen; }; #define FAT_FID_SIZE_WITHOUT_PARENT 3 #define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32)) /* * Look up a directory inode given its starting cluster. */ static struct inode *fat_dget(struct super_block *sb, int i_logstart) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct hlist_head *head; struct msdos_inode_info *i; struct inode *inode = NULL; head = sbi->dir_hashtable + fat_dir_hash(i_logstart); spin_lock(&sbi->dir_hash_lock); hlist_for_each_entry(i, head, i_dir_hash) { BUG_ON(i->vfs_inode.i_sb != sb); if (i->i_logstart != i_logstart) continue; inode = igrab(&i->vfs_inode); if (inode) break; } spin_unlock(&sbi->dir_hash_lock); return inode; } static struct inode *fat_ilookup(struct super_block *sb, u64 ino, loff_t i_pos) { if (MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) return fat_iget(sb, i_pos); else { if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO)) return NULL; return ilookup(sb, ino); } } static struct inode *__fat_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation, loff_t i_pos) { struct inode *inode = fat_ilookup(sb, ino, i_pos); if (inode && generation && (inode->i_generation != generation)) { iput(inode); inode = NULL; } if (inode == NULL && MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) { struct buffer_head *bh = NULL; struct msdos_dir_entry *de ; sector_t blocknr; int offset; fat_get_blknr_offset(MSDOS_SB(sb), i_pos, &blocknr, &offset); bh = sb_bread(sb, blocknr); if (!bh) { fat_msg(sb, KERN_ERR, "unable to read block(%llu) for building NFS inode", (llu)blocknr); return inode; } de = (struct msdos_dir_entry *)bh->b_data; /* If a file is deleted on server and client is not updated * yet, we must not build the inode upon a lookup call.
*/ if (IS_FREE(de[offset].name)) inode = NULL; else inode = fat_build_inode(sb, &de[offset], i_pos); brelse(bh); } return inode; } static struct inode *fat_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { return __fat_nfs_get_inode(sb, ino, generation, 0); } static int fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent) { int len = *lenp; struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); struct fat_fid *fid = (struct fat_fid *) fh; loff_t i_pos; int type = FILEID_FAT_WITHOUT_PARENT; if (parent) { if (len < FAT_FID_SIZE_WITH_PARENT) { *lenp = FAT_FID_SIZE_WITH_PARENT; return FILEID_INVALID; } } else { if (len < FAT_FID_SIZE_WITHOUT_PARENT) { *lenp = FAT_FID_SIZE_WITHOUT_PARENT; return FILEID_INVALID; } } i_pos = fat_i_pos_read(sbi, inode); *lenp = FAT_FID_SIZE_WITHOUT_PARENT; fid->i_gen = inode->i_generation; fid->i_pos_low = i_pos & 0xFFFFFFFF; fid->i_pos_hi = (i_pos >> 32) & 0xFFFF; if (parent) { i_pos = fat_i_pos_read(sbi, parent); fid->parent_i_pos_hi = (i_pos >> 32) & 0xFFFF; fid->parent_i_pos_low = i_pos & 0xFFFFFFFF; fid->parent_i_gen = parent->i_generation; type = FILEID_FAT_WITH_PARENT; *lenp = FAT_FID_SIZE_WITH_PARENT; } return type; } /* * Map a NFS file handle to a corresponding dentry. * The dentry may or may not be connected to the filesystem root. */ static struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, fat_nfs_get_inode); } static struct dentry *fat_fh_to_dentry_nostale(struct super_block *sb, struct fid *fh, int fh_len, int fh_type) { struct inode *inode = NULL; struct fat_fid *fid = (struct fat_fid *)fh; loff_t i_pos; switch (fh_type) { case FILEID_FAT_WITHOUT_PARENT: if (fh_len < FAT_FID_SIZE_WITHOUT_PARENT) return NULL; break; case FILEID_FAT_WITH_PARENT: if (fh_len < FAT_FID_SIZE_WITH_PARENT) return NULL; break; default: return NULL; } i_pos = fid->i_pos_hi; i_pos = (i_pos << 32) | (fid->i_pos_low); inode = __fat_nfs_get_inode(sb, 0, fid->i_gen, i_pos); return d_obtain_alias(inode); } /* * Find the parent for a file specified by NFS handle. * This requires that the handle contain the i_ino of the parent. 
*/ static struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, fat_nfs_get_inode); } static struct dentry *fat_fh_to_parent_nostale(struct super_block *sb, struct fid *fh, int fh_len, int fh_type) { struct inode *inode = NULL; struct fat_fid *fid = (struct fat_fid *)fh; loff_t i_pos; if (fh_len < FAT_FID_SIZE_WITH_PARENT) return NULL; switch (fh_type) { case FILEID_FAT_WITH_PARENT: i_pos = fid->parent_i_pos_hi; i_pos = (i_pos << 32) | (fid->parent_i_pos_low); inode = __fat_nfs_get_inode(sb, 0, fid->parent_i_gen, i_pos); break; } return d_obtain_alias(inode); } /* * Rebuild the parent for a directory that is not connected * to the filesystem root */ static struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart) { int search_clus, clus_to_match; struct msdos_dir_entry *de; struct inode *parent = NULL; struct inode *dummy_grand_parent = NULL; struct fat_slot_info sinfo; struct msdos_sb_info *sbi = MSDOS_SB(sb); sector_t blknr = fat_clus_to_blknr(sbi, parent_logstart); struct buffer_head *parent_bh = sb_bread(sb, blknr); if (!parent_bh) { fat_msg(sb, KERN_ERR, "unable to read cluster of parent directory"); return NULL; } de = (struct msdos_dir_entry *) parent_bh->b_data; clus_to_match = fat_get_start(sbi, &de[0]); search_clus = fat_get_start(sbi, &de[1]); dummy_grand_parent = fat_dget(sb, search_clus); if (!dummy_grand_parent) { dummy_grand_parent = new_inode(sb); if (!dummy_grand_parent) { brelse(parent_bh); return parent; } dummy_grand_parent->i_ino = iunique(sb, MSDOS_ROOT_INO); fat_fill_inode(dummy_grand_parent, &de[1]); MSDOS_I(dummy_grand_parent)->i_pos = -1; } if (!fat_scan_logstart(dummy_grand_parent, clus_to_match, &sinfo)) parent = fat_build_inode(sb, sinfo.de, sinfo.i_pos); brelse(parent_bh); iput(dummy_grand_parent); return parent; } /* * Find the parent for a directory that is not currently connected to * the filesystem root. * * On entry, the caller holds d_inode(child_dir)->i_mutex. */ static struct dentry *fat_get_parent(struct dentry *child_dir) { struct super_block *sb = child_dir->d_sb; struct buffer_head *bh = NULL; struct msdos_dir_entry *de; struct inode *parent_inode = NULL; struct msdos_sb_info *sbi = MSDOS_SB(sb); if (!fat_get_dotdot_entry(d_inode(child_dir), &bh, &de)) { int parent_logstart = fat_get_start(sbi, de); parent_inode = fat_dget(sb, parent_logstart); if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO) parent_inode = fat_rebuild_parent(sb, parent_logstart); } brelse(bh); return d_obtain_alias(parent_inode); } const struct export_operations fat_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = fat_fh_to_dentry, .fh_to_parent = fat_fh_to_parent, .get_parent = fat_get_parent, }; const struct export_operations fat_export_ops_nostale = { .encode_fh = fat_encode_fh_nostale, .fh_to_dentry = fat_fh_to_dentry_nostale, .fh_to_parent = fat_fh_to_parent_nostale, .get_parent = fat_get_parent, };
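The nostale export code above packs the wider-than-32-bit directory entry position into a 32-bit low part and a 16-bit high part of the file handle, and reassembles it on decode. Below is a minimal user-space sketch of that round trip; the handle struct is a toy stand-in for struct fat_fid and omits the parent fields.

/* Illustrative sketch (not kernel code) of the i_pos split/reassembly used
 * by the FAT nostale file handle code above. */
#include <stdio.h>
#include <stdint.h>

struct toy_fid {
	uint32_t i_gen;
	uint32_t i_pos_low;
	uint16_t i_pos_hi;
};

static void encode_fh(struct toy_fid *fid, uint64_t i_pos, uint32_t generation)
{
	fid->i_gen = generation;
	fid->i_pos_low = i_pos & 0xFFFFFFFF;
	fid->i_pos_hi = (i_pos >> 32) & 0xFFFF;
}

static uint64_t decode_fh(const struct toy_fid *fid)
{
	uint64_t i_pos = fid->i_pos_hi;

	return (i_pos << 32) | fid->i_pos_low;
}

int main(void)
{
	struct toy_fid fid;
	uint64_t i_pos = 0x1234ABCD5678ULL;	/* 48-bit position */

	encode_fh(&fid, i_pos, 42);
	printf("round-trip %s\n",
	       decode_fh(&fid) == i_pos ? "ok" : "mismatch");
	return 0;
}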
// SPDX-License-Identifier: GPL-2.0 /* * ACPI support * * Copyright (C) 2020, Intel Corporation * Author: Mika Westerberg <mika.westerberg@linux.intel.com> */ #include <linux/acpi.h> #include <linux/pm_runtime.h> #include "tb.h" static acpi_status tb_acpi_add_link(acpi_handle handle, u32 level, void *data, void **ret) { struct acpi_device *adev = acpi_fetch_acpi_dev(handle); struct fwnode_handle *fwnode; struct tb_nhi *nhi = data; struct pci_dev *pdev; struct device *dev; if (!adev) return AE_OK; fwnode = fwnode_find_reference(acpi_fwnode_handle(adev), "usb4-host-interface", 0); if (IS_ERR(fwnode)) return AE_OK; /* It needs to reference this NHI */ if (dev_fwnode(&nhi->pdev->dev) != fwnode) goto out_put; /* * Try to find physical device walking upwards to the hierarcy. * We need to do this because the xHCI driver might not yet be * bound so the USB3 SuperSpeed ports are not yet created. */ do { dev = acpi_get_first_physical_node(adev); if (dev) break; adev = acpi_dev_parent(adev); } while (adev); /* * Check that the device is PCIe. This is because USB3 * SuperSpeed ports have this property and they are not power * managed with the xHCI and the SuperSpeed hub so we create the * link from xHCI instead. */ while (dev && !dev_is_pci(dev)) dev = dev->parent; if (!dev) goto out_put; /* * Check that this actually matches the type of device we * expect. It should either be xHCI or PCIe root/downstream * port. */ pdev = to_pci_dev(dev); if (pdev->class == PCI_CLASS_SERIAL_USB_XHCI || (pci_is_pcie(pdev) && (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM))) { const struct device_link *link; /* * Make them both active first to make sure the NHI does * not runtime suspend before the consumer. The * pm_runtime_put() below then allows the consumer to * runtime suspend again (which then allows NHI runtime * suspend too now that the device link is established).
*/ pm_runtime_get_sync(&pdev->dev); link = device_link_add(&pdev->dev, &nhi->pdev->dev, DL_FLAG_AUTOREMOVE_SUPPLIER | DL_FLAG_RPM_ACTIVE | DL_FLAG_PM_RUNTIME); if (link) { dev_dbg(&nhi->pdev->dev, "created link from %s\n", dev_name(&pdev->dev)); *(bool *)ret = true; } else { dev_warn(&nhi->pdev->dev, "device link creation from %s failed\n", dev_name(&pdev->dev)); } pm_runtime_put(&pdev->dev); } out_put: fwnode_handle_put(fwnode); return AE_OK; } /** * tb_acpi_add_links() - Add device links based on ACPI description * @nhi: Pointer to NHI * * Goes over ACPI namespace finding tunneled ports that reference to * @nhi ACPI node. For each reference a device link is added. The link * is automatically removed by the driver core. * * Returns %true if at least one link was created. */ bool tb_acpi_add_links(struct tb_nhi *nhi) { acpi_status status; bool ret = false; if (!has_acpi_companion(&nhi->pdev->dev)) return false; /* * Find all devices that have usb4-host-controller interface * property that references to this NHI. */ status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, 32, tb_acpi_add_link, NULL, nhi, (void **)&ret); if (ACPI_FAILURE(status)) { dev_warn(&nhi->pdev->dev, "failed to enumerate tunneled ports\n"); return false; } return ret; } /** * tb_acpi_is_native() - Did the platform grant native TBT/USB4 control * * Returns %true if the platform granted OS native control over * TBT/USB4. In this case software based connection manager can be used, * otherwise there is firmware based connection manager running. */ bool tb_acpi_is_native(void) { return osc_sb_native_usb4_support_confirmed && osc_sb_native_usb4_control; } /** * tb_acpi_may_tunnel_usb3() - Is USB3 tunneling allowed by the platform * * When software based connection manager is used, this function * returns %true if platform allows native USB3 tunneling. */ bool tb_acpi_may_tunnel_usb3(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_USB3_TUNNELING; return true; } /** * tb_acpi_may_tunnel_dp() - Is DisplayPort tunneling allowed by the platform * * When software based connection manager is used, this function * returns %true if platform allows native DP tunneling. */ bool tb_acpi_may_tunnel_dp(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_DP_TUNNELING; return true; } /** * tb_acpi_may_tunnel_pcie() - Is PCIe tunneling allowed by the platform * * When software based connection manager is used, this function * returns %true if platform allows native PCIe tunneling. */ bool tb_acpi_may_tunnel_pcie(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_PCIE_TUNNELING; return true; } /** * tb_acpi_is_xdomain_allowed() - Are XDomain connections allowed * * When software based connection manager is used, this function * returns %true if platform allows XDomain connections. 
*/ bool tb_acpi_is_xdomain_allowed(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_XDOMAIN; return true; } /* UUID for retimer _DSM: e0053122-795b-4122-8a5e-57be1d26acb3 */ static const guid_t retimer_dsm_guid = GUID_INIT(0xe0053122, 0x795b, 0x4122, 0x8a, 0x5e, 0x57, 0xbe, 0x1d, 0x26, 0xac, 0xb3); #define RETIMER_DSM_QUERY_ONLINE_STATE 1 #define RETIMER_DSM_SET_ONLINE_STATE 2 static int tb_acpi_retimer_set_power(struct tb_port *port, bool power) { struct usb4_port *usb4 = port->usb4; union acpi_object argv4[2]; struct acpi_device *adev; union acpi_object *obj; int ret; if (!usb4->can_offline) return 0; adev = ACPI_COMPANION(&usb4->dev); if (WARN_ON(!adev)) return 0; /* Check if we are already powered on (and in correct mode) */ obj = acpi_evaluate_dsm_typed(adev->handle, &retimer_dsm_guid, 1, RETIMER_DSM_QUERY_ONLINE_STATE, NULL, ACPI_TYPE_INTEGER); if (!obj) { tb_port_warn(port, "ACPI: query online _DSM failed\n"); return -EIO; } ret = obj->integer.value; ACPI_FREE(obj); if (power == ret) return 0; tb_port_dbg(port, "ACPI: calling _DSM to power %s retimers\n", power ? "on" : "off"); argv4[0].type = ACPI_TYPE_PACKAGE; argv4[0].package.count = 1; argv4[0].package.elements = &argv4[1]; argv4[1].integer.type = ACPI_TYPE_INTEGER; argv4[1].integer.value = power; obj = acpi_evaluate_dsm_typed(adev->handle, &retimer_dsm_guid, 1, RETIMER_DSM_SET_ONLINE_STATE, argv4, ACPI_TYPE_INTEGER); if (!obj) { tb_port_warn(port, "ACPI: set online state _DSM evaluation failed\n"); return -EIO; } ret = obj->integer.value; ACPI_FREE(obj); if (ret >= 0) { if (power) return ret == 1 ? 0 : -EBUSY; return 0; } tb_port_warn(port, "ACPI: set online state _DSM failed with error %d\n", ret); return -EIO; } /** * tb_acpi_power_on_retimers() - Call platform to power on retimers * @port: USB4 port * * Calls platform to turn on power to all retimers behind this USB4 * port. After this function returns successfully the caller can * continue with the normal retimer flows (as specified in the USB4 * spec). Note if this returns %-EBUSY it means the type-C port is in * non-USB4/TBT mode (there is non-USB4/TBT device connected). * * This should only be called if the USB4/TBT link is not up. * * Returns %0 on success. */ int tb_acpi_power_on_retimers(struct tb_port *port) { return tb_acpi_retimer_set_power(port, true); } /** * tb_acpi_power_off_retimers() - Call platform to power off retimers * @port: USB4 port * * This is the opposite of tb_acpi_power_on_retimers(). After returning * successfully the normal operations with the @port can continue. * * Returns %0 on success. */ int tb_acpi_power_off_retimers(struct tb_port *port) { return tb_acpi_retimer_set_power(port, false); } static bool tb_acpi_bus_match(struct device *dev) { return tb_is_switch(dev) || tb_is_usb4_port_device(dev); } static struct acpi_device *tb_acpi_switch_find_companion(struct tb_switch *sw) { struct tb_switch *parent_sw = tb_switch_parent(sw); struct acpi_device *adev = NULL; /* * Device routers exists under the downstream facing USB4 port * of the parent router. Their _ADR is always 0. 
*/ if (parent_sw) { struct tb_port *port = tb_switch_downstream_port(sw); struct acpi_device *port_adev; port_adev = acpi_find_child_by_adr(ACPI_COMPANION(&parent_sw->dev), port->port); if (port_adev) adev = acpi_find_child_device(port_adev, 0, false); } else { struct tb_nhi *nhi = sw->tb->nhi; struct acpi_device *parent_adev; parent_adev = ACPI_COMPANION(&nhi->pdev->dev); if (parent_adev) adev = acpi_find_child_device(parent_adev, 0, false); } return adev; } static struct acpi_device *tb_acpi_find_companion(struct device *dev) { /* * The Thunderbolt/USB4 hierarchy looks like following: * * Device (NHI) * Device (HR) // Host router _ADR == 0 * Device (DFP0) // Downstream port _ADR == lane 0 adapter * Device (DR) // Device router _ADR == 0 * Device (UFP) // Upstream port _ADR == lane 0 adapter * Device (DFP1) // Downstream port _ADR == lane 0 adapter number * * At the moment we bind the host router to the corresponding * Linux device. */ if (tb_is_switch(dev)) return tb_acpi_switch_find_companion(tb_to_switch(dev)); if (tb_is_usb4_port_device(dev)) return acpi_find_child_by_adr(ACPI_COMPANION(dev->parent), tb_to_usb4_port_device(dev)->port->port); return NULL; } static void tb_acpi_setup(struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); struct usb4_port *usb4 = tb_to_usb4_port_device(dev); if (!adev || !usb4) return; if (acpi_check_dsm(adev->handle, &retimer_dsm_guid, 1, BIT(RETIMER_DSM_QUERY_ONLINE_STATE) | BIT(RETIMER_DSM_SET_ONLINE_STATE))) usb4->can_offline = true; } static struct acpi_bus_type tb_acpi_bus = { .name = "thunderbolt", .match = tb_acpi_bus_match, .find_companion = tb_acpi_find_companion, .setup = tb_acpi_setup, }; int tb_acpi_init(void) { return register_acpi_bus_type(&tb_acpi_bus); } void tb_acpi_exit(void) { unregister_acpi_bus_type(&tb_acpi_bus); }
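When the platform grants the OS native USB4 control, the tb_acpi_may_tunnel_*() helpers above gate each tunnel type on the _OSC control bitmask; otherwise a firmware connection manager is in charge and the helpers simply report the tunnel as allowed. The toy sketch below shows only that policy check; the bit constants are hypothetical stand-ins for the real _OSC definitions.

/* Illustrative sketch (not kernel code) of the tunneling policy checks made
 * by the tb_acpi_may_tunnel_*() helpers above. Constants are made up. */
#include <stdio.h>
#include <stdbool.h>

#define TOY_OSC_USB3_TUNNELING (1 << 0)
#define TOY_OSC_DP_TUNNELING   (1 << 1)
#define TOY_OSC_PCIE_TUNNELING (1 << 2)
#define TOY_OSC_XDOMAIN        (1 << 3)

/* Platform state: if native control was not granted, firmware decides and
 * the check always allows the tunnel from the OS point of view. */
static bool native_control;
static unsigned int osc_control_bits;

static bool may_tunnel(unsigned int bit)
{
	if (native_control)
		return osc_control_bits & bit;
	return true;
}

int main(void)
{
	native_control = true;
	osc_control_bits = TOY_OSC_USB3_TUNNELING | TOY_OSC_DP_TUNNELING;

	printf("USB3 tunneling: %d\n", may_tunnel(TOY_OSC_USB3_TUNNELING));
	printf("PCIe tunneling: %d\n", may_tunnel(TOY_OSC_PCIE_TUNNELING));
	return 0;
}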
// SPDX-License-Identifier: GPL-2.0-only /* * vhost transport for vsock * * Copyright (C) 2013-2015 Red Hat, Inc. * Author: Asias He <asias@redhat.com> * Stefan Hajnoczi <stefanha@redhat.com> */ #include <linux/miscdevice.h> #include <linux/atomic.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <net/sock.h> #include <linux/virtio_vsock.h> #include <linux/vhost.h> #include <linux/hashtable.h> #include <net/af_vsock.h> #include "vhost.h" #define VHOST_VSOCK_DEFAULT_HOST_CID 2 /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_VSOCK_WEIGHT 0x80000 /* Max number of packets transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others with * small pkts. */ #define VHOST_VSOCK_PKT_WEIGHT 256 enum { VHOST_VSOCK_FEATURES = VHOST_FEATURES | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_VSOCK_F_SEQPACKET) }; enum { VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) }; /* Used to track all the vhost_vsock instances on the system. */ static DEFINE_MUTEX(vhost_vsock_mutex); static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8); struct vhost_vsock { struct vhost_dev dev; struct vhost_virtqueue vqs[2]; /* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */ struct hlist_node hash; struct vhost_work send_pkt_work; struct sk_buff_head send_pkt_queue; /* host->guest pending packets */ atomic_t queued_replies; u32 guest_cid; bool seqpacket_allow; }; static u32 vhost_transport_get_local_cid(void) { return VHOST_VSOCK_DEFAULT_HOST_CID; } /* Callers that dereference the return value must hold vhost_vsock_mutex or the * RCU read lock.
*/ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid) { struct vhost_vsock *vsock; hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid) { u32 other_cid = vsock->guest_cid; /* Skip instances that have no CID yet */ if (other_cid == 0) continue; if (other_cid == guest_cid) return vsock; } return NULL; } static void vhost_transport_do_send_pkt(struct vhost_vsock *vsock, struct vhost_virtqueue *vq) { struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; int pkts = 0, total_len = 0; bool added = false; bool restart_tx = false; mutex_lock(&vq->mutex); if (!vhost_vq_get_backend(vq)) goto out; if (!vq_meta_prefetch(vq)) goto out; /* Avoid further vmexits, we're already processing the virtqueue */ vhost_disable_notify(&vsock->dev, vq); do { struct virtio_vsock_hdr *hdr; size_t iov_len, payload_len; struct iov_iter iov_iter; u32 flags_to_restore = 0; struct sk_buff *skb; unsigned out, in; size_t nbytes; u32 offset; int head; skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue); if (!skb) { vhost_enable_notify(&vsock->dev, vq); break; } head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, NULL); if (head < 0) { virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); break; } if (head == vq->num) { virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); /* We cannot finish yet if more buffers snuck in while * re-enabling notify. */ if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { vhost_disable_notify(&vsock->dev, vq); continue; } break; } if (out) { kfree_skb(skb); vq_err(vq, "Expected 0 output buffers, got %u\n", out); break; } iov_len = iov_length(&vq->iov[out], in); if (iov_len < sizeof(*hdr)) { kfree_skb(skb); vq_err(vq, "Buffer len [%zu] too small\n", iov_len); break; } iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len); offset = VIRTIO_VSOCK_SKB_CB(skb)->offset; payload_len = skb->len - offset; hdr = virtio_vsock_hdr(skb); /* If the packet is greater than the space available in the * buffer, we split it using multiple buffers. */ if (payload_len > iov_len - sizeof(*hdr)) { payload_len = iov_len - sizeof(*hdr); /* As we are copying pieces of large packet's buffer to * small rx buffers, headers of packets in rx queue are * created dynamically and are initialized with header * of current packet(except length). But in case of * SOCK_SEQPACKET, we also must clear message delimeter * bit (VIRTIO_VSOCK_SEQ_EOM) and MSG_EOR bit * (VIRTIO_VSOCK_SEQ_EOR) if set. Otherwise, * there will be sequence of packets with these * bits set. After initialized header will be copied to * rx buffer, these required bits will be restored. */ if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); flags_to_restore |= VIRTIO_VSOCK_SEQ_EOM; if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) { hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); flags_to_restore |= VIRTIO_VSOCK_SEQ_EOR; } } } /* Set the correct length in the header */ hdr->len = cpu_to_le32(payload_len); nbytes = copy_to_iter(hdr, sizeof(*hdr), &iov_iter); if (nbytes != sizeof(*hdr)) { kfree_skb(skb); vq_err(vq, "Faulted on copying pkt hdr\n"); break; } if (skb_copy_datagram_iter(skb, offset, &iov_iter, payload_len)) { kfree_skb(skb); vq_err(vq, "Faulted on copying pkt buf\n"); break; } /* Deliver to monitoring devices all packets that we * will transmit. 
		 */
		virtio_transport_deliver_tap_pkt(skb);

		vhost_add_used(vq, head, sizeof(*hdr) + payload_len);
		added = true;

		VIRTIO_VSOCK_SKB_CB(skb)->offset += payload_len;
		total_len += payload_len;

		/* If we didn't send all the payload we can requeue the packet
		 * to send it with the next available buffer.
		 */
		if (VIRTIO_VSOCK_SKB_CB(skb)->offset < skb->len) {
			hdr->flags |= cpu_to_le32(flags_to_restore);

			/* We are queueing the same skb to handle
			 * the remaining bytes, and we want to deliver it
			 * to monitoring devices in the next iteration.
			 */
			virtio_vsock_skb_clear_tap_delivered(skb);
			virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
		} else {
			if (virtio_vsock_skb_reply(skb)) {
				int val;

				val = atomic_dec_return(&vsock->queued_replies);

				/* Do we have resources to resume tx
				 * processing?
				 */
				if (val + 1 == tx_vq->num)
					restart_tx = true;
			}

			consume_skb(skb);
		}
	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
	if (added)
		vhost_signal(&vsock->dev, vq);

out:
	mutex_unlock(&vq->mutex);

	if (restart_tx)
		vhost_poll_queue(&tx_vq->poll);
}

static void vhost_transport_send_pkt_work(struct vhost_work *work)
{
	struct vhost_virtqueue *vq;
	struct vhost_vsock *vsock;

	vsock = container_of(work, struct vhost_vsock, send_pkt_work);
	vq = &vsock->vqs[VSOCK_VQ_RX];

	vhost_transport_do_send_pkt(vsock, vq);
}

static int
vhost_transport_send_pkt(struct sk_buff *skb)
{
	struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
	struct vhost_vsock *vsock;
	int len = skb->len;

	rcu_read_lock();

	/* Find the vhost_vsock according to guest context id */
	vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid));
	if (!vsock) {
		rcu_read_unlock();
		kfree_skb(skb);
		return -ENODEV;
	}

	if (virtio_vsock_skb_reply(skb))
		atomic_inc(&vsock->queued_replies);

	virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);

	rcu_read_unlock();
	return len;
}

static int
vhost_transport_cancel_pkt(struct vsock_sock *vsk)
{
	struct vhost_vsock *vsock;
	int cnt = 0;
	int ret = -ENODEV;

	rcu_read_lock();

	/* Find the vhost_vsock according to guest context id */
	vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
	if (!vsock)
		goto out;

	cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue);

	if (cnt) {
		struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
		int new_cnt;

		new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
		if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num)
			vhost_poll_queue(&tx_vq->poll);
	}

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

static struct sk_buff *
vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
		      unsigned int out, unsigned int in)
{
	struct virtio_vsock_hdr *hdr;
	struct iov_iter iov_iter;
	struct sk_buff *skb;
	size_t payload_len;
	size_t nbytes;
	size_t len;

	if (in != 0) {
		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
		return NULL;
	}

	len = iov_length(vq->iov, out);

	/* len contains both payload and hdr */
	skb = virtio_vsock_alloc_skb(len, GFP_KERNEL);
	if (!skb)
		return NULL;

	iov_iter_init(&iov_iter, ITER_SOURCE, vq->iov, out, len);

	hdr = virtio_vsock_hdr(skb);
	nbytes = copy_from_iter(hdr, sizeof(*hdr), &iov_iter);
	if (nbytes != sizeof(*hdr)) {
		vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n",
		       sizeof(*hdr), nbytes);
		kfree_skb(skb);
		return NULL;
	}

	payload_len = le32_to_cpu(hdr->len);

	/* No payload */
	if (!payload_len)
		return skb;

	/* The pkt is too big or the length in the header is invalid */
	if (payload_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
	    payload_len + sizeof(*hdr) > len) {
		kfree_skb(skb);
		return NULL;
	}

	virtio_vsock_skb_rx_put(skb);

	nbytes = copy_from_iter(skb->data, payload_len, &iov_iter);
	if (nbytes != payload_len) {
		vq_err(vq, "Expected %zu byte payload, got %zu bytes\n",
		       payload_len, nbytes);
		kfree_skb(skb);
		return NULL;
	}

	return skb;
}

/* Is there space left for replies to rx packets? */
static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
{
	struct vhost_virtqueue *vq = &vsock->vqs[VSOCK_VQ_TX];
	int val;

	smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */
	val = atomic_read(&vsock->queued_replies);

	return val < vq->num;
}

static bool vhost_transport_msgzerocopy_allow(void)
{
	return true;
}

static bool vhost_transport_seqpacket_allow(u32 remote_cid);

static struct virtio_transport vhost_transport = {
	.transport = {
		.module = THIS_MODULE,

		.get_local_cid = vhost_transport_get_local_cid,

		.init = virtio_transport_do_socket_init,
		.destruct = virtio_transport_destruct,
		.release = virtio_transport_release,
		.connect = virtio_transport_connect,
		.shutdown = virtio_transport_shutdown,
		.cancel_pkt = vhost_transport_cancel_pkt,

		.dgram_enqueue = virtio_transport_dgram_enqueue,
		.dgram_dequeue = virtio_transport_dgram_dequeue,
		.dgram_bind = virtio_transport_dgram_bind,
		.dgram_allow = virtio_transport_dgram_allow,

		.stream_enqueue = virtio_transport_stream_enqueue,
		.stream_dequeue = virtio_transport_stream_dequeue,
		.stream_has_data = virtio_transport_stream_has_data,
		.stream_has_space = virtio_transport_stream_has_space,
		.stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
		.stream_is_active = virtio_transport_stream_is_active,
		.stream_allow = virtio_transport_stream_allow,

		.seqpacket_dequeue = virtio_transport_seqpacket_dequeue,
		.seqpacket_enqueue = virtio_transport_seqpacket_enqueue,
		.seqpacket_allow = vhost_transport_seqpacket_allow,
		.seqpacket_has_data = virtio_transport_seqpacket_has_data,

		.msgzerocopy_allow = vhost_transport_msgzerocopy_allow,

		.notify_poll_in = virtio_transport_notify_poll_in,
		.notify_poll_out = virtio_transport_notify_poll_out,
		.notify_recv_init = virtio_transport_notify_recv_init,
		.notify_recv_pre_block = virtio_transport_notify_recv_pre_block,
		.notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue,
		.notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
		.notify_send_init = virtio_transport_notify_send_init,
		.notify_send_pre_block = virtio_transport_notify_send_pre_block,
		.notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue,
		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
		.notify_buffer_size = virtio_transport_notify_buffer_size,
		.notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat,

		.read_skb = virtio_transport_read_skb,
	},

	.send_pkt = vhost_transport_send_pkt,
};

static bool vhost_transport_seqpacket_allow(u32 remote_cid)
{
	struct vhost_vsock *vsock;
	bool seqpacket_allow = false;

	rcu_read_lock();
	vsock = vhost_vsock_get(remote_cid);

	if (vsock)
		seqpacket_allow = vsock->seqpacket_allow;

	rcu_read_unlock();

	return seqpacket_allow;
}

static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
						 dev);
	int head, pkts = 0, total_len = 0;
	unsigned int out, in;
	struct sk_buff *skb;
	bool added = false;

	mutex_lock(&vq->mutex);

	if (!vhost_vq_get_backend(vq))
		goto out;

	if (!vq_meta_prefetch(vq))
		goto out;

	vhost_disable_notify(&vsock->dev, vq);
	do {
		struct virtio_vsock_hdr *hdr;

		if (!vhost_vsock_more_replies(vsock)) {
			/* Stop tx until the device processes already
			 * pending replies.  Leave tx virtqueue
			 * callbacks disabled.
			 */
			goto no_more_replies;
		}

		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
					 &out, &in, NULL, NULL);
		if (head < 0)
			break;

		if (head == vq->num) {
			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
				vhost_disable_notify(&vsock->dev, vq);
				continue;
			}
			break;
		}

		skb = vhost_vsock_alloc_skb(vq, out, in);
		if (!skb) {
			vq_err(vq, "Faulted on pkt\n");
			continue;
		}

		total_len += sizeof(*hdr) + skb->len;

		/* Deliver to monitoring devices all received packets */
		virtio_transport_deliver_tap_pkt(skb);

		hdr = virtio_vsock_hdr(skb);

		/* Only accept correctly addressed packets */
		if (le64_to_cpu(hdr->src_cid) == vsock->guest_cid &&
		    le64_to_cpu(hdr->dst_cid) ==
		    vhost_transport_get_local_cid())
			virtio_transport_recv_pkt(&vhost_transport, skb);
		else
			kfree_skb(skb);

		vhost_add_used(vq, head, 0);
		added = true;
	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));

no_more_replies:
	if (added)
		vhost_signal(&vsock->dev, vq);

out:
	mutex_unlock(&vq->mutex);
}

static void vhost_vsock_handle_rx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
						 dev);

	vhost_transport_do_send_pkt(vsock, vq);
}

static int vhost_vsock_start(struct vhost_vsock *vsock)
{
	struct vhost_virtqueue *vq;
	size_t i;
	int ret;

	mutex_lock(&vsock->dev.mutex);

	ret = vhost_dev_check_owner(&vsock->dev);
	if (ret)
		goto err;

	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
		vq = &vsock->vqs[i];

		mutex_lock(&vq->mutex);

		if (!vhost_vq_access_ok(vq)) {
			ret = -EFAULT;
			goto err_vq;
		}

		if (!vhost_vq_get_backend(vq)) {
			vhost_vq_set_backend(vq, vsock);
			ret = vhost_vq_init_access(vq);
			if (ret)
				goto err_vq;
		}

		mutex_unlock(&vq->mutex);
	}

	/* Some packets may have been queued before the device was started,
	 * let's kick the send worker to send them.
	 */
	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);

	mutex_unlock(&vsock->dev.mutex);
	return 0;

err_vq:
	vhost_vq_set_backend(vq, NULL);
	mutex_unlock(&vq->mutex);

	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
		vq = &vsock->vqs[i];

		mutex_lock(&vq->mutex);
		vhost_vq_set_backend(vq, NULL);
		mutex_unlock(&vq->mutex);
	}
err:
	mutex_unlock(&vsock->dev.mutex);
	return ret;
}

static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
{
	size_t i;
	int ret = 0;

	mutex_lock(&vsock->dev.mutex);

	if (check_owner) {
		ret = vhost_dev_check_owner(&vsock->dev);
		if (ret)
			goto err;
	}

	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
		struct vhost_virtqueue *vq = &vsock->vqs[i];

		mutex_lock(&vq->mutex);
		vhost_vq_set_backend(vq, NULL);
		mutex_unlock(&vq->mutex);
	}

err:
	mutex_unlock(&vsock->dev.mutex);
	return ret;
}

static void vhost_vsock_free(struct vhost_vsock *vsock)
{
	kvfree(vsock);
}

static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
{
	struct vhost_virtqueue **vqs;
	struct vhost_vsock *vsock;
	int ret;

	/* This struct is large and allocation could fail, fall back to vmalloc
	 * if there is no other way.
	 */
	vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!vsock)
		return -ENOMEM;

	vqs = kmalloc_array(ARRAY_SIZE(vsock->vqs), sizeof(*vqs), GFP_KERNEL);
	if (!vqs) {
		ret = -ENOMEM;
		goto out;
	}

	vsock->guest_cid = 0; /* no CID assigned yet */

	atomic_set(&vsock->queued_replies, 0);

	vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX];
	vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX];
	vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick;
	vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick;

	vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
		       UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT,
		       VHOST_VSOCK_WEIGHT, true, NULL);

	file->private_data = vsock;
	skb_queue_head_init(&vsock->send_pkt_queue);
	vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work);
	return 0;

out:
	vhost_vsock_free(vsock);
	return ret;
}

static void vhost_vsock_flush(struct vhost_vsock *vsock)
{
	vhost_dev_flush(&vsock->dev);
}

static void vhost_vsock_reset_orphans(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* vmci_transport.c doesn't take sk_lock here either.  At least we're
	 * under vsock_table_lock so the sock cannot disappear while we're
	 * executing.
	 */

	/* If the peer is still valid, no need to reset connection */
	if (vhost_vsock_get(vsk->remote_addr.svm_cid))
		return;

	/* If the close timeout is pending, let it expire.  This avoids races
	 * with the timeout callback.
	 */
	if (vsk->close_work_scheduled)
		return;

	sock_set_flag(sk, SOCK_DONE);
	vsk->peer_shutdown = SHUTDOWN_MASK;
	sk->sk_state = SS_UNCONNECTED;
	sk->sk_err = ECONNRESET;
	sk_error_report(sk);
}

static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
{
	struct vhost_vsock *vsock = file->private_data;

	mutex_lock(&vhost_vsock_mutex);
	if (vsock->guest_cid)
		hash_del_rcu(&vsock->hash);
	mutex_unlock(&vhost_vsock_mutex);

	/* Wait for other CPUs to finish using vsock */
	synchronize_rcu();

	/* Iterating over all connections for all CIDs to find orphans is
	 * inefficient.  Room for improvement here. */
	vsock_for_each_connected_socket(&vhost_transport.transport,
					vhost_vsock_reset_orphans);

	/* Don't check the owner, because we are in the release path, so we
	 * need to stop the vsock device in any case.
	 * vhost_vsock_stop() can not fail in this case, so we don't need
	 * to check the return code.
	 */
	vhost_vsock_stop(vsock, false);
	vhost_vsock_flush(vsock);
	vhost_dev_stop(&vsock->dev);

	virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);

	vhost_dev_cleanup(&vsock->dev);
	kfree(vsock->dev.vqs);
	vhost_vsock_free(vsock);
	return 0;
}

static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid)
{
	struct vhost_vsock *other;

	/* Refuse reserved CIDs */
	if (guest_cid <= VMADDR_CID_HOST ||
	    guest_cid == U32_MAX)
		return -EINVAL;

	/* 64-bit CIDs are not yet supported */
	if (guest_cid > U32_MAX)
		return -EINVAL;

	/* Refuse if CID is assigned to the guest->host transport (i.e. nested
	 * VM), to make the loopback work.
	 */
	if (vsock_find_cid(guest_cid))
		return -EADDRINUSE;

	/* Refuse if CID is already in use */
	mutex_lock(&vhost_vsock_mutex);
	other = vhost_vsock_get(guest_cid);
	if (other && other != vsock) {
		mutex_unlock(&vhost_vsock_mutex);
		return -EADDRINUSE;
	}

	if (vsock->guest_cid)
		hash_del_rcu(&vsock->hash);

	vsock->guest_cid = guest_cid;
	hash_add_rcu(vhost_vsock_hash, &vsock->hash, vsock->guest_cid);
	mutex_unlock(&vhost_vsock_mutex);

	return 0;
}

static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
{
	struct vhost_virtqueue *vq;
	int i;

	if (features & ~VHOST_VSOCK_FEATURES)
		return -EOPNOTSUPP;

	mutex_lock(&vsock->dev.mutex);
	if ((features & (1 << VHOST_F_LOG_ALL)) &&
	    !vhost_log_access_ok(&vsock->dev)) {
		goto err;
	}

	if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
		if (vhost_init_device_iotlb(&vsock->dev))
			goto err;
	}

	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
		vsock->seqpacket_allow = true;

	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
		vq = &vsock->vqs[i];
		mutex_lock(&vq->mutex);
		vq->acked_features = features;
		mutex_unlock(&vq->mutex);
	}
	mutex_unlock(&vsock->dev.mutex);
	return 0;

err:
	mutex_unlock(&vsock->dev.mutex);
	return -EFAULT;
}

static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
				  unsigned long arg)
{
	struct vhost_vsock *vsock = f->private_data;
	void __user *argp = (void __user *)arg;
	u64 guest_cid;
	u64 features;
	int start;
	int r;

	switch (ioctl) {
	case VHOST_VSOCK_SET_GUEST_CID:
		if (copy_from_user(&guest_cid, argp, sizeof(guest_cid)))
			return -EFAULT;
		return vhost_vsock_set_cid(vsock, guest_cid);
	case VHOST_VSOCK_SET_RUNNING:
		if (copy_from_user(&start, argp, sizeof(start)))
			return -EFAULT;
		if (start)
			return vhost_vsock_start(vsock);
		else
			return vhost_vsock_stop(vsock, true);
	case VHOST_GET_FEATURES:
		features = VHOST_VSOCK_FEATURES;
		if (copy_to_user(argp, &features, sizeof(features)))
			return -EFAULT;
		return 0;
	case VHOST_SET_FEATURES:
		if (copy_from_user(&features, argp, sizeof(features)))
			return -EFAULT;
		return vhost_vsock_set_features(vsock, features);
	case VHOST_GET_BACKEND_FEATURES:
		features = VHOST_VSOCK_BACKEND_FEATURES;
		if (copy_to_user(argp, &features, sizeof(features)))
			return -EFAULT;
		return 0;
	case VHOST_SET_BACKEND_FEATURES:
		if (copy_from_user(&features, argp, sizeof(features)))
			return -EFAULT;
		if (features & ~VHOST_VSOCK_BACKEND_FEATURES)
			return -EOPNOTSUPP;
		vhost_set_backend_features(&vsock->dev, features);
		return 0;
	default:
		mutex_lock(&vsock->dev.mutex);
		r = vhost_dev_ioctl(&vsock->dev, ioctl, argp);
		if (r == -ENOIOCTLCMD)
			r = vhost_vring_ioctl(&vsock->dev, ioctl, argp);
		else
			vhost_vsock_flush(vsock);
		mutex_unlock(&vsock->dev.mutex);
		return r;
	}
}

static ssize_t vhost_vsock_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct vhost_vsock *vsock = file->private_data;
	struct vhost_dev *dev = &vsock->dev;
	int noblock = file->f_flags & O_NONBLOCK;

	return vhost_chr_read_iter(dev, to, noblock);
}

static ssize_t vhost_vsock_chr_write_iter(struct kiocb *iocb,
					  struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct vhost_vsock *vsock = file->private_data;
	struct vhost_dev *dev = &vsock->dev;

	return vhost_chr_write_iter(dev, from);
}

static __poll_t vhost_vsock_chr_poll(struct file *file, poll_table *wait)
{
	struct vhost_vsock *vsock = file->private_data;
	struct vhost_dev *dev = &vsock->dev;

	return vhost_chr_poll(file, dev, wait);
}

static const struct file_operations vhost_vsock_fops = {
	.owner = THIS_MODULE,
	.open = vhost_vsock_dev_open,
	.release = vhost_vsock_dev_release,
	.llseek = noop_llseek,
	.unlocked_ioctl = vhost_vsock_dev_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.read_iter = vhost_vsock_chr_read_iter,
	.write_iter = vhost_vsock_chr_write_iter,
	.poll = vhost_vsock_chr_poll,
};

static struct miscdevice vhost_vsock_misc = {
	.minor = VHOST_VSOCK_MINOR,
	.name = "vhost-vsock",
	.fops = &vhost_vsock_fops,
};

static int __init vhost_vsock_init(void)
{
	int ret;

	ret = vsock_core_register(&vhost_transport.transport,
				  VSOCK_TRANSPORT_F_H2G);
	if (ret < 0)
		return ret;

	ret = misc_register(&vhost_vsock_misc);
	if (ret) {
		vsock_core_unregister(&vhost_transport.transport);
		return ret;
	}

	return 0;
};

static void __exit vhost_vsock_exit(void)
{
	misc_deregister(&vhost_vsock_misc);
	vsock_core_unregister(&vhost_transport.transport);
};

module_init(vhost_vsock_init);
module_exit(vhost_vsock_exit);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Asias He");
MODULE_DESCRIPTION("vhost transport for vsock ");
MODULE_ALIAS_MISCDEV(VHOST_VSOCK_MINOR);
MODULE_ALIAS("devname:vhost-vsock");
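/*
 * Illustrative userspace sketch (not part of the driver above): roughly how
 * a VMM would drive the ioctl interface implemented by vhost_vsock_dev_ioctl().
 * This is a minimal sketch assuming the device node is /dev/vhost-vsock and
 * that CID 52 is free (both are assumptions for illustration). A real VMM
 * must additionally set up the TX/RX virtqueues (VHOST_SET_VRING_NUM/ADDR/BASE
 * plus the kick/call eventfds) before VHOST_VSOCK_SET_RUNNING can succeed;
 * that part is omitted here.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vhost.h>

int main(void)
{
	uint64_t guest_cid = 52;	/* hypothetical guest CID, must not collide */
	uint64_t features;
	int running = 1;
	int fd;

	fd = open("/dev/vhost-vsock", O_RDWR);
	if (fd < 0) {
		perror("open /dev/vhost-vsock");
		return 1;
	}

	/* Become the owner of this vhost device (handled by vhost_dev_ioctl()). */
	if (ioctl(fd, VHOST_SET_OWNER, NULL) < 0)
		goto err;

	/* Negotiate features: accept everything the host offers in this sketch. */
	if (ioctl(fd, VHOST_GET_FEATURES, &features) < 0 ||
	    ioctl(fd, VHOST_SET_FEATURES, &features) < 0)
		goto err;

	/* Assign the guest CID; vhost_vsock_set_cid() rejects reserved or in-use CIDs. */
	if (ioctl(fd, VHOST_VSOCK_SET_GUEST_CID, &guest_cid) < 0)
		goto err;

	/* ... virtqueue setup would go here ... */

	/* Start the device; this ends up in vhost_vsock_start(). */
	if (ioctl(fd, VHOST_VSOCK_SET_RUNNING, &running) < 0)
		goto err;

	close(fd);
	return 0;
err:
	perror("vhost-vsock ioctl");
	close(fd);
	return 1;
}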
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CGROUP_H
#define _LINUX_CGROUP_H
/*
 *  cgroup interface
 *
 *  Copyright (C) 2003 BULL SA
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/jump_label.h>
#include <linux/types.h>
#include <linux/ns_common.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/kernel_stat.h>

#include <linux/cgroup-defs.h>

struct kernel_clone_args;

#ifdef CONFIG_CGROUPS

/*
 * All weight knobs on the default hierarchy should use the following min,
 * default and max values.  The default value is the logarithmic center of
 * MIN and MAX and allows 100x to be expressed in both directions.
 */
#define CGROUP_WEIGHT_MIN	1
#define CGROUP_WEIGHT_DFL	100
#define CGROUP_WEIGHT_MAX	10000

enum {
	CSS_TASK_ITER_PROCS    = (1U << 0),  /* walk only threadgroup leaders */
	CSS_TASK_ITER_THREADED = (1U << 1),  /* walk all threaded css_sets in the domain */
	CSS_TASK_ITER_SKIPPED  = (1U << 16), /* internal flags */
};

/* a css_task_iter should be treated as an opaque object */
struct css_task_iter {
	struct cgroup_subsys	*ss;
	unsigned int		flags;

	struct list_head	*cset_pos;
	struct list_head	*cset_head;

	struct list_head	*tcset_pos;
	struct list_head	*tcset_head;

	struct list_head	*task_pos;

	struct list_head	*cur_tasks_head;
	struct css_set		*cur_cset;
	struct css_set		*cur_dcset;
	struct task_struct	*cur_task;
	struct list_head	iters_node;	/* css_set->task_iters */
};

extern struct file_system_type cgroup_fs_type;
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;
extern spinlock_t css_set_lock;

#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x)							\
	extern struct static_key_true _x ## _cgrp_subsys_enabled_key;	\
	extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/**
 * cgroup_subsys_enabled - fast test on whether a subsys is enabled
 * @ss: subsystem in question
 */
#define cgroup_subsys_enabled(ss)					\
	static_branch_likely(&ss ## _enabled_key)

/**
 * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy
 * @ss: subsystem in question
 */
#define cgroup_subsys_on_dfl(ss)					\
	static_branch_likely(&ss ## _on_dfl_key)

bool css_has_online_children(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
					 struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
					     struct cgroup_subsys *ss);
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
						       struct cgroup_subsys *ss);

struct cgroup *cgroup_get_from_path(const char *path);
struct cgroup *cgroup_get_from_fd(int fd);
struct cgroup *cgroup_v1v2_get_from_fd(int fd);

int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);

int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
void cgroup_file_notify(struct cgroup_file *cfile);
void cgroup_file_show(struct cgroup_file *cfile, bool show);

int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk);

void cgroup_fork(struct task_struct *p);
extern int cgroup_can_fork(struct task_struct *p,
			   struct kernel_clone_args *kargs);
extern void cgroup_cancel_fork(struct task_struct *p,
			       struct kernel_clone_args *kargs);
extern void cgroup_post_fork(struct task_struct *p,
			     struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);

int cgroup_init_early(void);
int cgroup_init(void);

int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v);

/*
 * Iteration helpers and macros.
 */

struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
					   struct cgroup_subsys_state *parent);
struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos,
						    struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos);
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
						     struct cgroup_subsys_state *css);

struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp);

void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
			 struct css_task_iter *it);
struct task_struct *css_task_iter_next(struct css_task_iter *it);
void css_task_iter_end(struct css_task_iter *it);

/**
 * css_for_each_child - iterate through children of a css
 * @pos: the css * to use as the loop cursor
 * @parent: css whose children to walk
 *
 * Walk @parent's children.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 */
#define css_for_each_child(pos, parent)					\
	for ((pos) = css_next_child(NULL, (parent)); (pos);		\
	     (pos) = css_next_child((pos), (parent)))

/**
 * css_for_each_descendant_pre - pre-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @root: css whose descendants to walk
 *
 * Walk @root's descendants.  @root is included in the iteration and the
 * first node to be visited.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * For example, the following guarantees that a descendant can't escape
 * state updates of its ancestors.
 *
 * my_online(@css)
 * {
 *	Lock @css's parent and @css;
 *	Inherit state from the parent;
 *	Unlock both.
 * }
 *
 * my_update_state(@css)
 * {
 *	css_for_each_descendant_pre(@pos, @css) {
 *		Lock @pos;
 *		if (@pos == @css)
 *			Update @css's state;
 *		else
 *			Verify @pos is alive and inherit state from its parent;
 *		Unlock @pos;
 *	}
 * }
 *
 * As long as the inheriting step, including checking the parent state, is
 * enclosed inside @pos locking, double-locking the parent isn't necessary
 * while inheriting.  The state update to the parent is guaranteed to be
 * visible by walking order and, as long as inheriting operations to the
 * same @pos are atomic to each other, multiple updates racing each other
 * still result in the correct state.  It's guaranteed that at least one
 * inheritance happens for any css after the latest update to its parent.
 *
 * If checking parent's state requires locking the parent, each inheriting
 * iteration should lock and unlock both @pos->parent and @pos.
 *
 * Alternatively, a subsystem may choose to use a single global lock to
 * synchronize ->css_online() and ->css_offline() against tree-walking
 * operations.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 */
#define css_for_each_descendant_pre(pos, css)				\
	for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);	\
	     (pos) = css_next_descendant_pre((pos), (css)))

/**
 * css_for_each_descendant_post - post-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Similar to css_for_each_descendant_pre() but performs post-order
 * traversal instead.  @root is included in the iteration and the last
 * node to be visited.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Note that the walk visibility guarantee example described in pre-order
 * walk doesn't apply the same to post-order walks.
 */
#define css_for_each_descendant_post(pos, css)				\
	for ((pos) = css_next_descendant_post(NULL, (css)); (pos);	\
	     (pos) = css_next_descendant_post((pos), (css)))

/**
 * cgroup_taskset_for_each - iterate cgroup_taskset
 * @task: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * @tset may contain multiple tasks and they may belong to multiple
 * processes.
 *
 * On the v2 hierarchy, there may be tasks from multiple processes and they
 * may not share the source or destination csses.
 *
 * On traditional hierarchies, when there are multiple tasks in @tset, if a
 * task of a process is in @tset, all tasks of the process are in @tset.
 * Also, all are guaranteed to share the same source and destination csses.
 *
 * Iteration is not in any specific order.
 */
#define cgroup_taskset_for_each(task, dst_css, tset)			\
	for ((task) = cgroup_taskset_first((tset), &(dst_css));	\
	     (task);							\
	     (task) = cgroup_taskset_next((tset), &(dst_css)))

/**
 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
 * @leader: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
 * may not contain any.
 */
#define cgroup_taskset_for_each_leader(leader, dst_css, tset)		\
	for ((leader) = cgroup_taskset_first((tset), &(dst_css));	\
	     (leader);							\
	     (leader) = cgroup_taskset_next((tset), &(dst_css)))	\
		if ((leader) != (leader)->group_leader)			\
			;						\
		else

/*
 * Inline functions.
 */

#ifdef CONFIG_DEBUG_CGROUP_REF
void css_get(struct cgroup_subsys_state *css);
void css_get_many(struct cgroup_subsys_state *css, unsigned int n);
bool css_tryget(struct cgroup_subsys_state *css);
bool css_tryget_online(struct cgroup_subsys_state *css);
void css_put(struct cgroup_subsys_state *css);
void css_put_many(struct cgroup_subsys_state *css, unsigned int n);
#else
#define CGROUP_REF_FN_ATTRS static inline
#define CGROUP_REF_EXPORT(fn)
#include <linux/cgroup_refcnt.h>
#endif

static inline u64 cgroup_id(const struct cgroup *cgrp)
{
	return cgrp->kn->id;
}

/**
 * css_is_dying - test whether the specified css is dying
 * @css: target css
 *
 * Test whether @css is in the process of offlining or already offline.  In
 * most cases, ->css_online() and ->css_offline() callbacks should be
 * enough; however, the actual offline operations are RCU delayed and this
 * test returns %true also when @css is scheduled to be offlined.
 *
 * This is useful, for example, when the use case requires synchronous
 * behavior with respect to cgroup removal.  cgroup removal schedules css
 * offlining but the css can seem alive while the operation is being
 * delayed.  If the delay affects user visible semantics, this test can be
 * used to resolve the situation.
 */
static inline bool css_is_dying(struct cgroup_subsys_state *css)
{
	return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt);
}

static inline void cgroup_get(struct cgroup *cgrp)
{
	css_get(&cgrp->self);
}

static inline bool cgroup_tryget(struct cgroup *cgrp)
{
	return css_tryget(&cgrp->self);
}

static inline void cgroup_put(struct cgroup *cgrp)
{
	css_put(&cgrp->self);
}

extern struct mutex cgroup_mutex;

static inline void cgroup_lock(void)
{
	mutex_lock(&cgroup_mutex);
}

static inline void cgroup_unlock(void)
{
	mutex_unlock(&cgroup_mutex);
}

/**
 * task_css_set_check - obtain a task's css_set with extra access conditions
 * @task: the task to obtain css_set for
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * A task's css_set is RCU protected, initialized and exited while holding
 * task_lock(), and can only be modified while holding both cgroup_mutex
 * and task_lock() while the task is alive.  This macro verifies that the
 * caller is inside proper critical section and returns @task's css_set.
 *
 * The caller can also specify additional allowed conditions via @__c, such
 * as locks used during the cgroup_subsys::attach() methods.
 */
#ifdef CONFIG_PROVE_RCU
#define task_css_set_check(task, __c)					\
	rcu_dereference_check((task)->cgroups,				\
			      rcu_read_lock_sched_held() ||		\
			      lockdep_is_held(&cgroup_mutex) ||		\
			      lockdep_is_held(&css_set_lock) ||		\
			      ((task)->flags & PF_EXITING) || (__c))
#else
#define task_css_set_check(task, __c)					\
	rcu_dereference((task)->cgroups)
#endif

/**
 * task_css_check - obtain css for (task, subsys) w/ extra access conds
 * @task: the target task
 * @subsys_id: the target subsystem ID
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
 * synchronization rules are the same as task_css_set_check().
 */
#define task_css_check(task, subsys_id, __c)				\
	task_css_set_check((task), (__c))->subsys[(subsys_id)]

/**
 * task_css_set - obtain a task's css_set
 * @task: the task to obtain css_set for
 *
 * See task_css_set_check().
 */
static inline struct css_set *task_css_set(struct task_struct *task)
{
	return task_css_set_check(task, false);
}

/**
 * task_css - obtain css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * See task_css_check().
 */
static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
						   int subsys_id)
{
	return task_css_check(task, subsys_id, false);
}

/**
 * task_get_css - find and get the css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Find the css for the (@task, @subsys_id) combination, increment a
 * reference on and return it.  This function is guaranteed to return a
 * valid css.  The returned css may already have been offlined.
 */
static inline struct cgroup_subsys_state *
task_get_css(struct task_struct *task, int subsys_id)
{
	struct cgroup_subsys_state *css;