| 685 812 110 806 263 230 147 1152 214 1050 580 525 429 243 2 25 208 25 187 1032 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 | // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-long.sh // DO NOT MODIFY THIS FILE DIRECTLY #ifndef _LINUX_ATOMIC_LONG_H #define _LINUX_ATOMIC_LONG_H #include <linux/compiler.h> #include <asm/types.h> #ifdef CONFIG_64BIT typedef atomic64_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i) #define atomic_long_cond_read_acquire atomic64_cond_read_acquire #define atomic_long_cond_read_relaxed atomic64_cond_read_relaxed #else typedef atomic_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i) #define atomic_long_cond_read_acquire atomic_cond_read_acquire #define atomic_long_cond_read_relaxed atomic_cond_read_relaxed #endif /** * raw_atomic_long_read() - atomic load with relaxed ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline long raw_atomic_long_read(const atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_read(v); #else return raw_atomic_read(v); #endif } /** * raw_atomic_long_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline long raw_atomic_long_read_acquire(const atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_read_acquire(v); #else return raw_atomic_read_acquire(v); #endif } /** * raw_atomic_long_set() - atomic set with relaxed ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_set(atomic_long_t *v, long i) { #ifdef CONFIG_64BIT raw_atomic64_set(v, i); #else raw_atomic_set(v, i); #endif } /** * raw_atomic_long_set_release() - atomic set with release ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic_long_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_set_release(atomic_long_t *v, long i) { #ifdef CONFIG_64BIT raw_atomic64_set_release(v, i); #else raw_atomic_set_release(v, i); #endif } /** * raw_atomic_long_add() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_add(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_add(i, v); #else raw_atomic_add(i, v); #endif } /** * raw_atomic_long_add_return() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return(i, v); #else return raw_atomic_add_return(i, v); #endif } /** * raw_atomic_long_add_return_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_acquire(i, v); #else return raw_atomic_add_return_acquire(i, v); #endif } /** * raw_atomic_long_add_return_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_release(i, v); #else return raw_atomic_add_return_release(i, v); #endif } /** * raw_atomic_long_add_return_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_relaxed(i, v); #else return raw_atomic_add_return_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_add() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add(i, v); #else return raw_atomic_fetch_add(i, v); #endif } /** * raw_atomic_long_fetch_add_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_acquire(i, v); #else return raw_atomic_fetch_add_acquire(i, v); #endif } /** * raw_atomic_long_fetch_add_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_release(i, v); #else return raw_atomic_fetch_add_release(i, v); #endif } /** * raw_atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_relaxed(i, v); #else return raw_atomic_fetch_add_relaxed(i, v); #endif } /** * raw_atomic_long_sub() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_sub(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_sub(i, v); #else raw_atomic_sub(i, v); #endif } /** * raw_atomic_long_sub_return() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return(i, v); #else return raw_atomic_sub_return(i, v); #endif } /** * raw_atomic_long_sub_return_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_acquire(i, v); #else return raw_atomic_sub_return_acquire(i, v); #endif } /** * raw_atomic_long_sub_return_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_release(i, v); #else return raw_atomic_sub_return_release(i, v); #endif } /** * raw_atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_relaxed(i, v); #else return raw_atomic_sub_return_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_sub() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub(i, v); #else return raw_atomic_fetch_sub(i, v); #endif } /** * raw_atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_acquire(i, v); #else return raw_atomic_fetch_sub_acquire(i, v); #endif } /** * raw_atomic_long_fetch_sub_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_release(i, v); #else return raw_atomic_fetch_sub_release(i, v); #endif } /** * raw_atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_relaxed(i, v); #else return raw_atomic_fetch_sub_relaxed(i, v); #endif } /** * raw_atomic_long_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_inc(atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_inc(v); #else raw_atomic_inc(v); #endif } /** * raw_atomic_long_inc_return() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return(v); #else return raw_atomic_inc_return(v); #endif } /** * raw_atomic_long_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_acquire(v); #else return raw_atomic_inc_return_acquire(v); #endif } /** * raw_atomic_long_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_release(v); #else return raw_atomic_inc_return_release(v); #endif } /** * raw_atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_relaxed(v); #else return raw_atomic_inc_return_relaxed(v); #endif } /** * raw_atomic_long_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc(v); #else return raw_atomic_fetch_inc(v); #endif } /** * raw_atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_acquire(v); #else return raw_atomic_fetch_inc_acquire(v); #endif } /** * raw_atomic_long_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_release(v); #else return raw_atomic_fetch_inc_release(v); #endif } /** * raw_atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_relaxed(v); #else return raw_atomic_fetch_inc_relaxed(v); #endif } /** * raw_atomic_long_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_dec(atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_dec(v); #else raw_atomic_dec(v); #endif } /** * raw_atomic_long_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return(v); #else return raw_atomic_dec_return(v); #endif } /** * raw_atomic_long_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_acquire(v); #else return raw_atomic_dec_return_acquire(v); #endif } /** * raw_atomic_long_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_release(v); #else return raw_atomic_dec_return_release(v); #endif } /** * raw_atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_relaxed(v); #else return raw_atomic_dec_return_relaxed(v); #endif } /** * raw_atomic_long_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec(v); #else return raw_atomic_fetch_dec(v); #endif } /** * raw_atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_acquire(v); #else return raw_atomic_fetch_dec_acquire(v); #endif } /** * raw_atomic_long_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_release(v); #else return raw_atomic_fetch_dec_release(v); #endif } /** * raw_atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_relaxed(v); #else return raw_atomic_fetch_dec_relaxed(v); #endif } /** * raw_atomic_long_and() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_and(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_and(i, v); #else raw_atomic_and(i, v); #endif } /** * raw_atomic_long_fetch_and() - atomic bitwise AND with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and(i, v); #else return raw_atomic_fetch_and(i, v); #endif } /** * raw_atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_acquire(i, v); #else return raw_atomic_fetch_and_acquire(i, v); #endif } /** * raw_atomic_long_fetch_and_release() - atomic bitwise AND with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_release(i, v); #else return raw_atomic_fetch_and_release(i, v); #endif } /** * raw_atomic_long_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_relaxed(i, v); #else return raw_atomic_fetch_and_relaxed(i, v); #endif } /** * raw_atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_andnot(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_andnot(i, v); #else raw_atomic_andnot(i, v); #endif } /** * raw_atomic_long_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot(i, v); #else return raw_atomic_fetch_andnot(i, v); #endif } /** * raw_atomic_long_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_acquire(i, v); #else return raw_atomic_fetch_andnot_acquire(i, v); #endif } /** * raw_atomic_long_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_release(i, v); #else return raw_atomic_fetch_andnot_release(i, v); #endif } /** * raw_atomic_long_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_relaxed(i, v); #else return raw_atomic_fetch_andnot_relaxed(i, v); #endif } /** * raw_atomic_long_or() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_or(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_or(i, v); #else raw_atomic_or(i, v); #endif } /** * raw_atomic_long_fetch_or() - atomic bitwise OR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or(i, v); #else return raw_atomic_fetch_or(i, v); #endif } /** * raw_atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_acquire(i, v); #else return raw_atomic_fetch_or_acquire(i, v); #endif } /** * raw_atomic_long_fetch_or_release() - atomic bitwise OR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_release(i, v); #else return raw_atomic_fetch_or_release(i, v); #endif } /** * raw_atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_relaxed(i, v); #else return raw_atomic_fetch_or_relaxed(i, v); #endif } /** * raw_atomic_long_xor() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_xor(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_xor(i, v); #else raw_atomic_xor(i, v); #endif } /** * raw_atomic_long_fetch_xor() - atomic bitwise XOR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor(i, v); #else return raw_atomic_fetch_xor(i, v); #endif } /** * raw_atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_acquire(i, v); #else return raw_atomic_fetch_xor_acquire(i, v); #endif } /** * raw_atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_release(i, v); #else return raw_atomic_fetch_xor_release(i, v); #endif } /** * raw_atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_relaxed(i, v); #else return raw_atomic_fetch_xor_relaxed(i, v); #endif } /** * raw_atomic_long_xchg() - atomic exchange with full ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg(v, new); #else return raw_atomic_xchg(v, new); #endif } /** * raw_atomic_long_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_acquire(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_acquire(v, new); #else return raw_atomic_xchg_acquire(v, new); #endif } /** * raw_atomic_long_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_release(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_release(v, new); #else return raw_atomic_xchg_release(v, new); #endif } /** * raw_atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_relaxed(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_relaxed(v, new); #else return raw_atomic_xchg_relaxed(v, new); #endif } /** * raw_atomic_long_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg(v, old, new); #else return raw_atomic_cmpxchg(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_acquire(v, old, new); #else return raw_atomic_cmpxchg_acquire(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_release(v, old, new); #else return raw_atomic_cmpxchg_release(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_relaxed(v, old, new); #else return raw_atomic_cmpxchg_relaxed(v, old, new); #endif } /** * raw_atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_acquire(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_acquire(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_release(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_release(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_relaxed(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_relaxed(v, (int *)old, new); #endif } /** * raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_sub_and_test(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_and_test(i, v); #else return raw_atomic_sub_and_test(i, v); #endif } /** * raw_atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_dec_and_test(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_and_test(v); #else return raw_atomic_dec_and_test(v); #endif } /** * raw_atomic_long_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_and_test(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_and_test(v); #else return raw_atomic_inc_and_test(v); #endif } /** * raw_atomic_long_add_negative() - atomic add and test if negative with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative(i, v); #else return raw_atomic_add_negative(i, v); #endif } /** * raw_atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_acquire(i, v); #else return raw_atomic_add_negative_acquire(i, v); #endif } /** * raw_atomic_long_add_negative_release() - atomic add and test if negative with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_release(i, v); #else return raw_atomic_add_negative_release(i, v); #endif } /** * raw_atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_relaxed(i, v); #else return raw_atomic_add_negative_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_unless(v, a, u); #else return raw_atomic_fetch_add_unless(v, a, u); #endif } /** * raw_atomic_long_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_add_unless(atomic_long_t *v, long a, long u) { #ifdef CONFIG_64BIT return raw_atomic64_add_unless(v, a, u); #else return raw_atomic_add_unless(v, a, u); #endif } /** * raw_atomic_long_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_long_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_not_zero(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_not_zero(v); #else return raw_atomic_inc_not_zero(v); #endif } /** * raw_atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_long_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_unless_negative(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_unless_negative(v); #else return raw_atomic_inc_unless_negative(v); #endif } /** * raw_atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_long_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_dec_unless_positive(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_unless_positive(v); #else return raw_atomic_dec_unless_positive(v); #endif } /** * raw_atomic_long_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_long_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline long raw_atomic_long_dec_if_positive(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_if_positive(v); #else return raw_atomic_dec_if_positive(v); #endif } #endif /* _LINUX_ATOMIC_LONG_H */ // eadf183c3600b8b92b91839dd3be6bcc560c752d |
| 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Multicast support for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c */ /* Changes: * * yoshfuji : fix format of router-alert option * YOSHIFUJI Hideaki @USAGI: * Fixed source address for MLD message based on * <draft-ietf-magma-mld-source-05.txt>. * YOSHIFUJI Hideaki @USAGI: * - Ignore Queries for invalid addresses. * - MLD for link-local addresses. * David L Stevens <dlstevens@us.ibm.com>: * - MLDv2 support */ #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/jiffies.h> #include <linux/net.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/pkt_sched.h> #include <net/mld.h> #include <linux/workqueue.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/if_inet6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/inet_common.h> #include <net/ip6_checksum.h> /* Ensure that we have struct in6_addr aligned on 32bit word. */ static int __mld2_query_bugs[] __attribute__((__unused__)) = { BUILD_BUG_ON_ZERO(offsetof(struct mld2_query, mld2q_srcs) % 4), BUILD_BUG_ON_ZERO(offsetof(struct mld2_report, mld2r_grec) % 4), BUILD_BUG_ON_ZERO(offsetof(struct mld2_grec, grec_mca) % 4) }; static struct workqueue_struct *mld_wq; static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; static void igmp6_join_group(struct ifmcaddr6 *ma); static void igmp6_leave_group(struct ifmcaddr6 *ma); static void mld_mca_work(struct work_struct *work); static void mld_ifc_event(struct inet6_dev *idev); static bool mld_in_v1_mode(const struct inet6_dev *idev); static int sf_setstate(struct ifmcaddr6 *pmc); static void sf_markstate(struct ifmcaddr6 *pmc); static void ip6_mc_clear_src(struct ifmcaddr6 *pmc); static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta); static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta); static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); static int __ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr, unsigned int mode); #define MLD_QRV_DEFAULT 2 /* RFC3810, 9.2. Query Interval */ #define MLD_QI_DEFAULT (125 * HZ) /* RFC3810, 9.3. Query Response Interval */ #define MLD_QRI_DEFAULT (10 * HZ) /* RFC3810, 8.1 Query Version Distinctions */ #define MLD_V1_QUERY_LEN 24 #define MLD_V2_QUERY_LEN_MIN 28 #define IPV6_MLD_MAX_MSF 64 int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; /* * socket join on multicast group */ #define mc_dereference(e, idev) \ rcu_dereference_protected(e, lockdep_is_held(&(idev)->mc_lock)) #define sock_dereference(e, sk) \ rcu_dereference_protected(e, lockdep_sock_is_held(sk)) #define for_each_pmc_socklock(np, sk, pmc) \ for (pmc = sock_dereference((np)->ipv6_mc_list, sk); \ pmc; \ pmc = sock_dereference(pmc->next, sk)) #define for_each_pmc_rcu(np, pmc) \ for (pmc = rcu_dereference((np)->ipv6_mc_list); \ pmc; \ pmc = rcu_dereference(pmc->next)) #define for_each_psf_mclock(mc, psf) \ for (psf = mc_dereference((mc)->mca_sources, mc->idev); \ psf; \ psf = mc_dereference(psf->sf_next, mc->idev)) #define for_each_psf_rcu(mc, psf) \ for (psf = rcu_dereference((mc)->mca_sources); \ psf; \ psf = rcu_dereference(psf->sf_next)) #define for_each_psf_tomb(mc, psf) \ for (psf = mc_dereference((mc)->mca_tomb, mc->idev); \ psf; \ psf = mc_dereference(psf->sf_next, mc->idev)) #define for_each_mc_mclock(idev, mc) \ for (mc = mc_dereference((idev)->mc_list, idev); \ mc; \ mc = mc_dereference(mc->next, idev)) #define for_each_mc_rcu(idev, mc) \ for (mc = rcu_dereference((idev)->mc_list); \ mc; \ mc = rcu_dereference(mc->next)) #define for_each_mc_tomb(idev, mc) \ for (mc = mc_dereference((idev)->mc_tomb, idev); \ mc; \ mc = mc_dereference(mc->next, idev)) static int unsolicited_report_interval(struct inet6_dev *idev) { int iv; if (mld_in_v1_mode(idev)) iv = READ_ONCE(idev->cnf.mldv1_unsolicited_report_interval); else iv = READ_ONCE(idev->cnf.mldv2_unsolicited_report_interval); return iv > 0 ? iv : 1; } static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode) { struct net_device *dev = NULL; struct ipv6_mc_socklist *mc_lst; struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int err; ASSERT_RTNL(); if (!ipv6_addr_is_multicast(addr)) return -EINVAL; for_each_pmc_socklock(np, sk, mc_lst) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) return -EADDRINUSE; } mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); if (!mc_lst) return -ENOMEM; mc_lst->next = NULL; mc_lst->addr = *addr; if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); } } else dev = __dev_get_by_index(net, ifindex); if (!dev) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; } mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = mode; RCU_INIT_POINTER(mc_lst->sflist, NULL); /* * now add/increase the group membership on the device */ err = __ipv6_dev_mc_inc(dev, addr, mode); if (err) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return err; } mc_lst->next = np->ipv6_mc_list; rcu_assign_pointer(np->ipv6_mc_list, mc_lst); return 0; } int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ipv6_sock_mc_join); int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode) { return __ipv6_sock_mc_join(sk, ifindex, addr, mode); } /* * socket leave on multicast group */ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; struct ipv6_mc_socklist __rcu **lnk; struct net *net = sock_net(sk); ASSERT_RTNL(); if (!ipv6_addr_is_multicast(addr)) return -EINVAL; for (lnk = &np->ipv6_mc_list; (mc_lst = sock_dereference(*lnk, sk)) != NULL; lnk = &mc_lst->next) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { struct net_device *dev; *lnk = mc_lst->next; dev = __dev_get_by_index(net, mc_lst->ifindex); if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); ip6_mc_leave_src(sk, mc_lst, idev); if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); } else { ip6_mc_leave_src(sk, mc_lst, NULL); } atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); return 0; } } return -EADDRNOTAVAIL; } EXPORT_SYMBOL(ipv6_sock_mc_drop); static struct inet6_dev *ip6_mc_find_dev_rtnl(struct net *net, const struct in6_addr *group, int ifindex) { struct net_device *dev = NULL; struct inet6_dev *idev = NULL; if (ifindex == 0) { struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); } } else { dev = __dev_get_by_index(net, ifindex); } if (!dev) return NULL; idev = __in6_dev_get(dev); if (!idev) return NULL; if (idev->dead) return NULL; return idev; } void __ipv6_sock_mc_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; struct net *net = sock_net(sk); ASSERT_RTNL(); while ((mc_lst = sock_dereference(np->ipv6_mc_list, sk)) != NULL) { struct net_device *dev; np->ipv6_mc_list = mc_lst->next; dev = __dev_get_by_index(net, mc_lst->ifindex); if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); ip6_mc_leave_src(sk, mc_lst, idev); if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); } else { ip6_mc_leave_src(sk, mc_lst, NULL); } atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); } } void ipv6_sock_mc_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); if (!rcu_access_pointer(np->ipv6_mc_list)) return; rtnl_lock(); lock_sock(sk); __ipv6_sock_mc_close(sk); release_sock(sk); rtnl_unlock(); } int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr) { struct in6_addr *source, *group; struct ipv6_mc_socklist *pmc; struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *psl; struct net *net = sock_net(sk); int i, j, rv; int leavegroup = 0; int err; source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; idev = ip6_mc_find_dev_rtnl(net, group, pgsr->gsr_interface); if (!idev) return -ENODEV; err = -EADDRNOTAVAIL; mutex_lock(&idev->mc_lock); for_each_pmc_socklock(inet6, sk, pmc) { if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } /* if a source filter was set, must be the same mode as before */ if (rcu_access_pointer(pmc->sflist)) { if (pmc->sfmode != omode) { err = -EINVAL; goto done; } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip6_mc_add_src(idev, group, omode, 0, NULL, 0); ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); pmc->sfmode = omode; } psl = sock_dereference(pmc->sflist, sk); if (!add) { if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i = 0; i < psl->sl_count; i++) { rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) break; } if (rv) /* source not found */ goto done; /* err = -EADDRNOTAVAIL */ /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { leavegroup = 1; goto done; } /* update the interface filter */ ip6_mc_del_src(idev, group, omode, 1, source, 1); for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; goto done; } /* else, add a new source to the filter */ if (psl && psl->sl_count >= sysctl_mld_max_msf) { err = -ENOBUFS; goto done; } if (!psl || psl->sl_count == psl->sl_max) { struct ip6_sf_socklist *newpsl; int count = IP6_SFBLOCK; if (psl) count += psl->sl_max; newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = count; newpsl->sl_count = count - IP6_SFBLOCK; if (psl) { for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } rcu_assign_pointer(pmc->sflist, newpsl); kfree_rcu(psl, rcu); psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i = 0; i < psl->sl_count; i++) { rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) /* There is an error in the address. */ goto done; } for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = *source; psl->sl_count++; err = 0; /* update the interface list */ ip6_mc_add_src(idev, group, omode, 1, source, 1); done: mutex_unlock(&idev->mc_lock); if (leavegroup) err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; } int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list) { const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *newpsl, *psl; struct net *net = sock_net(sk); int leavegroup = 0; int i, err; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; if (gsf->gf_fmode != MCAST_INCLUDE && gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; idev = ip6_mc_find_dev_rtnl(net, group, gsf->gf_interface); if (!idev) return -ENODEV; err = 0; if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { leavegroup = 1; goto done; } for_each_pmc_socklock(inet6, sk, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } if (gsf->gf_numsrc) { newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, gsf->gf_numsrc), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; for (i = 0; i < newpsl->sl_count; ++i, ++list) { struct sockaddr_in6 *psin6; psin6 = (struct sockaddr_in6 *)list; newpsl->sl_addr[i] = psin6->sin6_addr; } mutex_lock(&idev->mc_lock); err = ip6_mc_add_src(idev, group, gsf->gf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { mutex_unlock(&idev->mc_lock); sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, newpsl->sl_max)); goto done; } mutex_unlock(&idev->mc_lock); } else { newpsl = NULL; mutex_lock(&idev->mc_lock); ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); mutex_unlock(&idev->mc_lock); } mutex_lock(&idev->mc_lock); psl = sock_dereference(pmc->sflist, sk); if (psl) { ip6_mc_del_src(idev, group, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } else { ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); } rcu_assign_pointer(pmc->sflist, newpsl); mutex_unlock(&idev->mc_lock); kfree_rcu(psl, rcu); pmc->sfmode = gsf->gf_fmode; err = 0; done: if (leavegroup) err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); return err; } int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset) { struct ipv6_pinfo *inet6 = inet6_sk(sk); const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct ip6_sf_socklist *psl; unsigned int count; int i, copycount; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; /* changes to the ipv6_mc_list require the socket lock and * rtnl lock. We have the socket lock, so reading the list is safe. */ for_each_pmc_socklock(inet6, sk, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(group, &pmc->addr)) break; } if (!pmc) /* must have a prior join */ return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = sock_dereference(pmc->sflist, sk); count = psl ? psl->sl_count : 0; copycount = min(count, gsf->gf_numsrc); gsf->gf_numsrc = count; for (i = 0; i < copycount; i++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; psin6 = (struct sockaddr_in6 *)&ss; memset(&ss, 0, sizeof(ss)); psin6->sin6_family = AF_INET6; psin6->sin6_addr = psl->sl_addr[i]; if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss))) return -EFAULT; ss_offset += sizeof(ss); } return 0; } bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr) { const struct ipv6_pinfo *np = inet6_sk(sk); const struct ipv6_mc_socklist *mc; const struct ip6_sf_socklist *psl; bool rv = true; rcu_read_lock(); for_each_pmc_rcu(np, mc) { if (ipv6_addr_equal(&mc->addr, mc_addr)) break; } if (!mc) { rcu_read_unlock(); return inet6_test_bit(MC6_ALL, sk); } psl = rcu_dereference(mc->sflist); if (!psl) { rv = mc->sfmode == MCAST_EXCLUDE; } else { int i; for (i = 0; i < psl->sl_count; i++) { if (ipv6_addr_equal(&psl->sl_addr[i], src_addr)) break; } if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) rv = false; if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) rv = false; } rcu_read_unlock(); return rv; } /* called with mc_lock */ static void igmp6_group_added(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; if (!(mc->mca_flags&MAF_LOADED)) { mc->mca_flags |= MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) dev_mc_add(dev, buf); } if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) return; if (mld_in_v1_mode(mc->idev)) { igmp6_join_group(mc); return; } /* else v2 */ /* Based on RFC3810 6.1, for newly added INCLUDE SSM, we * should not send filter-mode change record as the mode * should be from IN() to IN(A). */ if (mc->mca_sfmode == MCAST_EXCLUDE) mc->mca_crcount = mc->idev->mc_qrv; mld_ifc_event(mc->idev); } /* called with mc_lock */ static void igmp6_group_dropped(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; if (mc->mca_flags&MAF_LOADED) { mc->mca_flags &= ~MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) dev_mc_del(dev, buf); } if (mc->mca_flags & MAF_NOREPORT) return; if (!mc->idev->dead) igmp6_leave_group(mc); if (cancel_delayed_work(&mc->mca_work)) refcount_dec(&mc->mca_refcnt); } /* * deleted ifmcaddr6 manipulation * called with mc_lock */ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ifmcaddr6 *pmc; /* this is an "ifmcaddr6" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not * used for management of the delete list. Using the same structure * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ pmc = kzalloc(sizeof(*pmc), GFP_KERNEL); if (!pmc) return; pmc->idev = im->idev; in6_dev_hold(idev); pmc->mca_addr = im->mca_addr; pmc->mca_crcount = idev->mc_qrv; pmc->mca_sfmode = im->mca_sfmode; if (pmc->mca_sfmode == MCAST_INCLUDE) { struct ip6_sf_list *psf; rcu_assign_pointer(pmc->mca_tomb, mc_dereference(im->mca_tomb, idev)); rcu_assign_pointer(pmc->mca_sources, mc_dereference(im->mca_sources, idev)); RCU_INIT_POINTER(im->mca_tomb, NULL); RCU_INIT_POINTER(im->mca_sources, NULL); for_each_psf_mclock(pmc, psf) psf->sf_crcount = pmc->mca_crcount; } rcu_assign_pointer(pmc->next, idev->mc_tomb); rcu_assign_pointer(idev->mc_tomb, pmc); } /* called with mc_lock */ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ip6_sf_list *psf, *sources, *tomb; struct in6_addr *pmca = &im->mca_addr; struct ifmcaddr6 *pmc, *pmc_prev; pmc_prev = NULL; for_each_mc_tomb(idev, pmc) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) break; pmc_prev = pmc; } if (pmc) { if (pmc_prev) rcu_assign_pointer(pmc_prev->next, pmc->next); else rcu_assign_pointer(idev->mc_tomb, pmc->next); } if (pmc) { im->idev = pmc->idev; if (im->mca_sfmode == MCAST_INCLUDE) { tomb = rcu_replace_pointer(im->mca_tomb, mc_dereference(pmc->mca_tomb, pmc->idev), lockdep_is_held(&im->idev->mc_lock)); rcu_assign_pointer(pmc->mca_tomb, tomb); sources = rcu_replace_pointer(im->mca_sources, mc_dereference(pmc->mca_sources, pmc->idev), lockdep_is_held(&im->idev->mc_lock)); rcu_assign_pointer(pmc->mca_sources, sources); for_each_psf_mclock(im, psf) psf->sf_crcount = idev->mc_qrv; } else { im->mca_crcount = idev->mc_qrv; } in6_dev_put(pmc->idev); ip6_mc_clear_src(pmc); kfree_rcu(pmc, rcu); } } /* called with mc_lock */ static void mld_clear_delrec(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *nextpmc; pmc = mc_dereference(idev->mc_tomb, idev); RCU_INIT_POINTER(idev->mc_tomb, NULL); for (; pmc; pmc = nextpmc) { nextpmc = mc_dereference(pmc->next, idev); ip6_mc_clear_src(pmc); in6_dev_put(pmc->idev); kfree_rcu(pmc, rcu); } /* clear dead sources, too */ for_each_mc_mclock(idev, pmc) { struct ip6_sf_list *psf, *psf_next; psf = mc_dereference(pmc->mca_tomb, idev); RCU_INIT_POINTER(pmc->mca_tomb, NULL); for (; psf; psf = psf_next) { psf_next = mc_dereference(psf->sf_next, idev); kfree_rcu(psf, rcu); } } } static void mld_clear_query(struct inet6_dev *idev) { struct sk_buff *skb; spin_lock_bh(&idev->mc_query_lock); while ((skb = __skb_dequeue(&idev->mc_query_queue))) kfree_skb(skb); spin_unlock_bh(&idev->mc_query_lock); } static void mld_clear_report(struct inet6_dev *idev) { struct sk_buff *skb; spin_lock_bh(&idev->mc_report_lock); while ((skb = __skb_dequeue(&idev->mc_report_queue))) kfree_skb(skb); spin_unlock_bh(&idev->mc_report_lock); } static void mca_get(struct ifmcaddr6 *mc) { refcount_inc(&mc->mca_refcnt); } static void ma_put(struct ifmcaddr6 *mc) { if (refcount_dec_and_test(&mc->mca_refcnt)) { in6_dev_put(mc->idev); kfree_rcu(mc, rcu); } } /* called with mc_lock */ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return NULL; INIT_DELAYED_WORK(&mc->mca_work, mld_mca_work); mc->mca_addr = *addr; mc->idev = idev; /* reference taken by caller */ mc->mca_users = 1; /* mca_stamp should be updated upon changes */ mc->mca_cstamp = mc->mca_tstamp = jiffies; refcount_set(&mc->mca_refcnt, 1); mc->mca_sfmode = mode; mc->mca_sfcount[mode] = 1; if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) mc->mca_flags |= MAF_NOREPORT; return mc; } static void inet6_ifmcaddr_notify(struct net_device *dev, const struct ifmcaddr6 *ifmca, int event) { struct inet6_fill_args fillargs = { .portid = 0, .seq = 0, .event = event, .flags = 0, .netnsid = -1, .force_rt_scope_universe = true, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOMEM; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(sizeof(struct in6_addr)) + nla_total_size(sizeof(struct ifa_cacheinfo)), GFP_KERNEL); if (!skb) goto error; err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs); if (err < 0) { WARN_ON_ONCE(err == -EMSGSIZE); nlmsg_free(skb); goto error; } rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_KERNEL); return; error: rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err); } /* * device multicast group inc (add if not found) */ static int __ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; struct inet6_dev *idev; ASSERT_RTNL(); /* we need to take a reference on idev */ idev = in6_dev_get(dev); if (!idev) return -EINVAL; if (idev->dead) { in6_dev_put(idev); return -ENODEV; } mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, addr)) { mc->mca_users++; ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0); mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return 0; } } mc = mca_alloc(idev, addr, mode); if (!mc) { mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return -ENOMEM; } rcu_assign_pointer(mc->next, idev->mc_list); rcu_assign_pointer(idev->mc_list, mc); mca_get(mc); mld_del_delrec(idev, mc); igmp6_group_added(mc); inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST); mutex_unlock(&idev->mc_lock); ma_put(mc); return 0; } int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) { return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ipv6_dev_mc_inc); /* * device multicast group del */ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifmcaddr6 *ma, __rcu **map; ASSERT_RTNL(); mutex_lock(&idev->mc_lock); for (map = &idev->mc_list; (ma = mc_dereference(*map, idev)); map = &ma->next) { if (ipv6_addr_equal(&ma->mca_addr, addr)) { if (--ma->mca_users == 0) { *map = ma->next; igmp6_group_dropped(ma); inet6_ifmcaddr_notify(idev->dev, ma, RTM_DELMULTICAST); ip6_mc_clear_src(ma); mutex_unlock(&idev->mc_lock); ma_put(ma); return 0; } mutex_unlock(&idev->mc_lock); return 0; } } mutex_unlock(&idev->mc_lock); return -ENOENT; } int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev; int err; ASSERT_RTNL(); idev = __in6_dev_get(dev); if (!idev) err = -ENODEV; else err = __ipv6_dev_mc_dec(idev, addr); return err; } EXPORT_SYMBOL(ipv6_dev_mc_dec); /* * check if the interface/address pair is valid */ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr) { struct inet6_dev *idev; struct ifmcaddr6 *mc; bool rv = false; rcu_read_lock(); idev = __in6_dev_get(dev); if (!idev) goto unlock; for_each_mc_rcu(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, group)) break; } if (!mc) goto unlock; if (src_addr && !ipv6_addr_any(src_addr)) { struct ip6_sf_list *psf; for_each_psf_rcu(mc, psf) { if (ipv6_addr_equal(&psf->sf_addr, src_addr)) break; } if (psf) rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) || READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) != READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]); else rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0; } else { rv = true; /* don't filter unspecified source */ } unlock: rcu_read_unlock(); return rv; } /* called with mc_lock */ static void mld_gq_start_work(struct inet6_dev *idev) { unsigned long tv = get_random_u32_below(idev->mc_maxdelay); idev->mc_gq_running = 1; if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2)) in6_dev_hold(idev); } /* called with mc_lock */ static void mld_gq_stop_work(struct inet6_dev *idev) { idev->mc_gq_running = 0; if (cancel_delayed_work(&idev->mc_gq_work)) __in6_dev_put(idev); } /* called with mc_lock */ static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = get_random_u32_below(delay); if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2)) in6_dev_hold(idev); } /* called with mc_lock */ static void mld_ifc_stop_work(struct inet6_dev *idev) { idev->mc_ifc_count = 0; if (cancel_delayed_work(&idev->mc_ifc_work)) __in6_dev_put(idev); } /* called with mc_lock */ static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = get_random_u32_below(delay); if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2)) in6_dev_hold(idev); } static void mld_dad_stop_work(struct inet6_dev *idev) { if (cancel_delayed_work(&idev->mc_dad_work)) __in6_dev_put(idev); } static void mld_query_stop_work(struct inet6_dev *idev) { spin_lock_bh(&idev->mc_query_lock); if (cancel_delayed_work(&idev->mc_query_work)) __in6_dev_put(idev); spin_unlock_bh(&idev->mc_query_lock); } static void mld_report_stop_work(struct inet6_dev *idev) { if (cancel_delayed_work_sync(&idev->mc_report_work)) __in6_dev_put(idev); } /* * IGMP handling (alias multicast ICMPv6 messages) * called with mc_lock */ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { unsigned long delay = resptime; /* Do not start work for these addresses */ if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) || IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); delay = ma->mca_work.timer.expires - jiffies; } if (delay >= resptime) delay = get_random_u32_below(resptime); if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING; } /* mark EXCLUDE-mode sources * called with mc_lock */ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; scount = 0; for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->mca_sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) break; if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { scount++; break; } } } pmc->mca_flags &= ~MAF_GSQUERY; if (scount == nsrcs) /* all sources excluded */ return false; return true; } /* called with mc_lock */ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; if (pmc->mca_sfmode == MCAST_EXCLUDE) return mld_xmarksources(pmc, nsrcs, srcs); /* mark INCLUDE-mode sources */ scount = 0; for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { psf->sf_gsresp = 1; scount++; break; } } } if (!scount) { pmc->mca_flags &= ~MAF_GSQUERY; return false; } pmc->mca_flags |= MAF_GSQUERY; return true; } static int mld_force_mld_version(const struct inet6_dev *idev) { const struct net *net = dev_net(idev->dev); int all_force; all_force = READ_ONCE(net->ipv6.devconf_all->force_mld_version); /* Normally, both are 0 here. If enforcement to a particular is * being used, individual device enforcement will have a lower * precedence over 'all' device (.../conf/all/force_mld_version). */ return all_force ?: READ_ONCE(idev->cnf.force_mld_version); } static bool mld_in_v2_mode_only(const struct inet6_dev *idev) { return mld_force_mld_version(idev) == 2; } static bool mld_in_v1_mode_only(const struct inet6_dev *idev) { return mld_force_mld_version(idev) == 1; } static bool mld_in_v1_mode(const struct inet6_dev *idev) { if (mld_in_v2_mode_only(idev)) return false; if (mld_in_v1_mode_only(idev)) return true; if (idev->mc_v1_seen && time_before(jiffies, idev->mc_v1_seen)) return true; return false; } static void mld_set_v1_mode(struct inet6_dev *idev) { /* RFC3810, relevant sections: * - 9.1. Robustness Variable * - 9.2. Query Interval * - 9.3. Query Response Interval * - 9.12. Older Version Querier Present Timeout */ unsigned long switchback; switchback = (idev->mc_qrv * idev->mc_qi) + idev->mc_qri; idev->mc_v1_seen = jiffies + switchback; } static void mld_update_qrv(struct inet6_dev *idev, const struct mld2_query *mlh2) { /* RFC3810, relevant sections: * - 5.1.8. QRV (Querier's Robustness Variable) * - 9.1. Robustness Variable */ /* The value of the Robustness Variable MUST NOT be zero, * and SHOULD NOT be one. Catch this here if we ever run * into such a case in future. */ const int min_qrv = min(MLD_QRV_DEFAULT, sysctl_mld_qrv); WARN_ON(idev->mc_qrv == 0); if (mlh2->mld2q_qrv > 0) idev->mc_qrv = mlh2->mld2q_qrv; if (unlikely(idev->mc_qrv < min_qrv)) { net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n", idev->mc_qrv, min_qrv); idev->mc_qrv = min_qrv; } } static void mld_update_qi(struct inet6_dev *idev, const struct mld2_query *mlh2) { /* RFC3810, relevant sections: * - 5.1.9. QQIC (Querier's Query Interval Code) * - 9.2. Query Interval * - 9.12. Older Version Querier Present Timeout * (the [Query Interval] in the last Query received) */ unsigned long mc_qqi; if (mlh2->mld2q_qqic < 128) { mc_qqi = mlh2->mld2q_qqic; } else { unsigned long mc_man, mc_exp; mc_exp = MLDV2_QQIC_EXP(mlh2->mld2q_qqic); mc_man = MLDV2_QQIC_MAN(mlh2->mld2q_qqic); mc_qqi = (mc_man | 0x10) << (mc_exp + 3); } idev->mc_qi = mc_qqi * HZ; } static void mld_update_qri(struct inet6_dev *idev, const struct mld2_query *mlh2) { /* RFC3810, relevant sections: * - 5.1.3. Maximum Response Code * - 9.3. Query Response Interval */ idev->mc_qri = msecs_to_jiffies(mldv2_mrc(mlh2)); } static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, unsigned long *max_delay, bool v1_query) { unsigned long mldv1_md; /* Ignore v1 queries */ if (mld_in_v2_mode_only(idev)) return -EINVAL; mldv1_md = ntohs(mld->mld_maxdelay); /* When in MLDv1 fallback and a MLDv2 router start-up being * unaware of current MLDv1 operation, the MRC == MRD mapping * only works when the exponential algorithm is not being * used (as MLDv1 is unaware of such things). * * According to the RFC author, the MLDv2 implementations * he's aware of all use a MRC < 32768 on start up queries. * * Thus, should we *ever* encounter something else larger * than that, just assume the maximum possible within our * reach. */ if (!v1_query) mldv1_md = min(mldv1_md, MLDV1_MRD_MAX_COMPAT); *max_delay = max(msecs_to_jiffies(mldv1_md), 1UL); /* MLDv1 router present: we need to go into v1 mode *only* * when an MLDv1 query is received as per section 9.12. of * RFC3810! And we know from RFC2710 section 3.7 that MLDv1 * queries MUST be of exactly 24 octets. */ if (v1_query) mld_set_v1_mode(idev); /* cancel MLDv2 report work */ mld_gq_stop_work(idev); /* cancel the interface change work */ mld_ifc_stop_work(idev); /* clear deleted report items */ mld_clear_delrec(idev); return 0; } static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, unsigned long *max_delay) { *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); mld_update_qrv(idev, mld); mld_update_qi(idev, mld); mld_update_qri(idev, mld); idev->mc_maxdelay = *max_delay; return; } /* called with rcu_read_lock() */ void igmp6_event_query(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev || idev->dead) goto out; spin_lock_bh(&idev->mc_query_lock); if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_query_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0)) in6_dev_hold(idev); skb = NULL; } spin_unlock_bh(&idev->mc_query_lock); out: kfree_skb(skb); } static void __mld_query_work(struct sk_buff *skb) { struct mld2_query *mlh2 = NULL; const struct in6_addr *group; unsigned long max_delay; struct inet6_dev *idev; struct ifmcaddr6 *ma; struct mld_msg *mld; int group_type; int mark = 0; int len, err; if (!pskb_may_pull(skb, sizeof(struct in6_addr))) goto kfree_skb; /* compute payload length excluding extension headers */ len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); len -= skb_network_header_len(skb); /* RFC3810 6.2 * Upon reception of an MLD message that contains a Query, the node * checks if the source address of the message is a valid link-local * address, if the Hop Limit is set to 1, and if the Router Alert * option is present in the Hop-By-Hop Options header of the IPv6 * packet. If any of these checks fails, the packet is dropped. */ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) || ipv6_hdr(skb)->hop_limit != 1 || !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) || IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD)) goto kfree_skb; idev = in6_dev_get(skb->dev); if (!idev) goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); group = &mld->mld_mca; group_type = ipv6_addr_type(group); if (group_type != IPV6_ADDR_ANY && !(group_type&IPV6_ADDR_MULTICAST)) goto out; if (len < MLD_V1_QUERY_LEN) { goto out; } else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) { err = mld_process_v1(idev, mld, &max_delay, len == MLD_V1_QUERY_LEN); if (err < 0) goto out; } else if (len >= MLD_V2_QUERY_LEN_MIN) { int srcs_offset = sizeof(struct mld2_query) - sizeof(struct icmp6hdr); if (!pskb_may_pull(skb, srcs_offset)) goto out; mlh2 = (struct mld2_query *)skb_transport_header(skb); mld_process_v2(idev, mlh2, &max_delay); if (group_type == IPV6_ADDR_ANY) { /* general query */ if (mlh2->mld2q_nsrcs) goto out; /* no sources allowed */ mld_gq_start_work(idev); goto out; } /* mark sources to include, if group & source-specific */ if (mlh2->mld2q_nsrcs != 0) { if (!pskb_may_pull(skb, srcs_offset + ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr))) goto out; mlh2 = (struct mld2_query *)skb_transport_header(skb); mark = 1; } } else { goto out; } if (group_type == IPV6_ADDR_ANY) { for_each_mc_mclock(idev, ma) { igmp6_group_queried(ma, max_delay); } } else { for_each_mc_mclock(idev, ma) { if (!ipv6_addr_equal(group, &ma->mca_addr)) continue; if (ma->mca_flags & MAF_TIMER_RUNNING) { /* gsquery <- gsquery && mark */ if (!mark) ma->mca_flags &= ~MAF_GSQUERY; } else { /* gsquery <- mark */ if (mark) ma->mca_flags |= MAF_GSQUERY; else ma->mca_flags &= ~MAF_GSQUERY; } if (!(ma->mca_flags & MAF_GSQUERY) || mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs)) igmp6_group_queried(ma, max_delay); break; } } out: in6_dev_put(idev); kfree_skb: consume_skb(skb); } static void mld_query_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_query_work); struct sk_buff_head q; struct sk_buff *skb; bool rework = false; int cnt = 0; skb_queue_head_init(&q); spin_lock_bh(&idev->mc_query_lock); while ((skb = __skb_dequeue(&idev->mc_query_queue))) { __skb_queue_tail(&q, skb); if (++cnt >= MLD_MAX_QUEUE) { rework = true; break; } } spin_unlock_bh(&idev->mc_query_lock); mutex_lock(&idev->mc_lock); while ((skb = __skb_dequeue(&q))) __mld_query_work(skb); mutex_unlock(&idev->mc_lock); if (rework && queue_delayed_work(mld_wq, &idev->mc_query_work, 0)) return; in6_dev_put(idev); } /* called with rcu_read_lock() */ void igmp6_event_report(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev || idev->dead) goto out; spin_lock_bh(&idev->mc_report_lock); if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_report_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0)) in6_dev_hold(idev); skb = NULL; } spin_unlock_bh(&idev->mc_report_lock); out: kfree_skb(skb); } static void __mld_report_work(struct sk_buff *skb) { struct inet6_dev *idev; struct ifmcaddr6 *ma; struct mld_msg *mld; int addr_type; /* Our own report looped back. Ignore it. */ if (skb->pkt_type == PACKET_LOOPBACK) goto kfree_skb; /* send our report if the MC router may not have heard this report */ if (skb->pkt_type != PACKET_MULTICAST && skb->pkt_type != PACKET_BROADCAST) goto kfree_skb; if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr))) goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); /* Drop reports with not link local source */ addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); if (addr_type != IPV6_ADDR_ANY && !(addr_type&IPV6_ADDR_LINKLOCAL)) goto kfree_skb; idev = in6_dev_get(skb->dev); if (!idev) goto kfree_skb; /* * Cancel the work for this group */ for_each_mc_mclock(idev, ma) { if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { if (cancel_delayed_work(&ma->mca_work)) refcount_dec(&ma->mca_refcnt); ma->mca_flags &= ~(MAF_LAST_REPORTER | MAF_TIMER_RUNNING); break; } } in6_dev_put(idev); kfree_skb: consume_skb(skb); } static void mld_report_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_report_work); struct sk_buff_head q; struct sk_buff *skb; bool rework = false; int cnt = 0; skb_queue_head_init(&q); spin_lock_bh(&idev->mc_report_lock); while ((skb = __skb_dequeue(&idev->mc_report_queue))) { __skb_queue_tail(&q, skb); if (++cnt >= MLD_MAX_QUEUE) { rework = true; break; } } spin_unlock_bh(&idev->mc_report_lock); mutex_lock(&idev->mc_lock); while ((skb = __skb_dequeue(&q))) __mld_report_work(skb); mutex_unlock(&idev->mc_lock); if (rework && queue_delayed_work(mld_wq, &idev->mc_report_work, 0)) return; in6_dev_put(idev); } static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, int gdeleted, int sdeleted) { switch (type) { case MLD2_MODE_IS_INCLUDE: case MLD2_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) return false; if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) { if (pmc->mca_sfmode == MCAST_INCLUDE) return true; /* don't include if this source is excluded * in all filters */ if (psf->sf_count[MCAST_INCLUDE]) return type == MLD2_MODE_IS_INCLUDE; return pmc->mca_sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; } return false; case MLD2_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) return false; return psf->sf_count[MCAST_INCLUDE] != 0; case MLD2_CHANGE_TO_EXCLUDE: if (gdeleted || sdeleted) return false; if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 || psf->sf_count[MCAST_INCLUDE]) return false; return pmc->mca_sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; case MLD2_ALLOW_NEW_SOURCES: if (gdeleted || !psf->sf_crcount) return false; return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted; case MLD2_BLOCK_OLD_SOURCES: if (pmc->mca_sfmode == MCAST_INCLUDE) return gdeleted || (psf->sf_crcount && sdeleted); return psf->sf_crcount && !gdeleted && !sdeleted; } return false; } static int mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) { struct ip6_sf_list *psf; int scount = 0; for_each_psf_mclock(pmc, psf) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; } return scount; } static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr, int proto, int len) { struct ipv6hdr *hdr; skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; skb_reset_network_header(skb); skb_put(skb, sizeof(struct ipv6hdr)); hdr = ipv6_hdr(skb); ip6_flow_hdr(hdr, 0, 0); hdr->payload_len = htons(len); hdr->nexthdr = proto; hdr->hop_limit = READ_ONCE(inet6_sk(sk)->hop_limit); hdr->saddr = *saddr; hdr->daddr = *daddr; } static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) { u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; struct net_device *dev = idev->dev; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; struct net *net = dev_net(dev); const struct in6_addr *saddr; struct in6_addr addr_buf; struct mld2_report *pmr; struct sk_buff *skb; unsigned int size; struct sock *sk; int err; sk = net->ipv6.igmp_sk; /* we assume size > sizeof(ra) here * Also try to not allocate high-order pages for big MTU */ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; skb = sock_alloc_send_skb(sk, size, 1, &err); if (!skb) return NULL; skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. */ saddr = &in6addr_any; } else saddr = &addr_buf; ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); skb_put_data(skb, ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); skb_put(skb, sizeof(*pmr)); pmr = (struct mld2_report *)skb_transport_header(skb); pmr->mld2r_type = ICMPV6_MLD2_REPORT; pmr->mld2r_resv1 = 0; pmr->mld2r_cksum = 0; pmr->mld2r_resv2 = 0; pmr->mld2r_ngrec = 0; return skb; } static void mld_sendpack(struct sk_buff *skb) { struct ipv6hdr *pip6 = ipv6_hdr(skb); struct mld2_report *pmr = (struct mld2_report *)skb_transport_header(skb); int payload_len, mldlen; struct inet6_dev *idev; struct net *net = dev_net(skb->dev); int err; struct flowi6 fl6; struct dst_entry *dst; rcu_read_lock(); idev = __in6_dev_get(skb->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) - sizeof(*pip6); mldlen = skb_tail_pointer(skb) - skb_transport_header(skb); pip6->payload_len = htons(payload_len); pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, IPPROTO_ICMPV6, csum_partial(skb_transport_header(skb), mldlen, 0)); icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); dst = icmp6_dst_alloc(skb->dev, &fl6); err = 0; if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; } skb_dst_set(skb, dst); if (err) goto err_out; err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, net->ipv6.igmp_sk, skb, NULL, skb->dev, dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } else { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); } rcu_read_unlock(); return; err_out: kfree_skb(skb); goto out; } static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) { return sizeof(struct mld2_grec) + 16 * mld_scount(pmc,type,gdel,sdel); } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, struct mld2_grec **ppgr, unsigned int mtu) { struct mld2_report *pmr; struct mld2_grec *pgr; if (!skb) { skb = mld_newpack(pmc->idev, mtu); if (!skb) return NULL; } pgr = skb_put(skb, sizeof(struct mld2_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->mca_addr; /* structure copy */ pmr = (struct mld2_report *)skb_transport_header(skb); pmr->mld2r_ngrec = htons(ntohs(pmr->mld2r_ngrec)+1); *ppgr = pgr; return skb; } #define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) /* called with mc_lock */ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted, int crsend) { struct ip6_sf_list *psf, *psf_prev, *psf_next; int scount, stotal, first, isquery, truncate; struct ip6_sf_list __rcu **psf_list; struct inet6_dev *idev = pmc->idev; struct net_device *dev = idev->dev; struct mld2_grec *pgr = NULL; struct mld2_report *pmr; unsigned int mtu; if (pmc->mca_flags & MAF_NOREPORT) return skb; mtu = READ_ONCE(dev->mtu); if (mtu < IPV6_MIN_MTU) return skb; isquery = type == MLD2_MODE_IS_INCLUDE || type == MLD2_MODE_IS_EXCLUDE; truncate = type == MLD2_MODE_IS_EXCLUDE || type == MLD2_CHANGE_TO_EXCLUDE; stotal = scount = 0; psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources; if (!rcu_access_pointer(*psf_list)) goto empty_source; pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL; /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { if (pmr && pmr->mld2r_ngrec && AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) mld_sendpack(skb); skb = mld_newpack(idev, mtu); } } first = 1; psf_prev = NULL; for (psf = mc_dereference(*psf_list, idev); psf; psf = psf_next) { struct in6_addr *psrc; psf_next = mc_dereference(psf->sf_next, idev); if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) { psf_prev = psf; continue; } /* Based on RFC3810 6.1. Should not send source-list change * records when there is a filter mode change. */ if (((gdeleted && pmc->mca_sfmode == MCAST_EXCLUDE) || (!gdeleted && pmc->mca_crcount)) && (type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) goto decrease_sf_crcount; /* clear marks on query responses */ if (isquery) psf->sf_gsresp = 0; if (AVAILABLE(skb) < sizeof(*psrc) + first*sizeof(struct mld2_grec)) { if (truncate && !first) break; /* truncate these */ if (pgr) pgr->grec_nsrcs = htons(scount); if (skb) mld_sendpack(skb); skb = mld_newpack(idev, mtu); first = 1; scount = 0; } if (first) { skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) return NULL; psrc = skb_put(skb, sizeof(*psrc)); *psrc = psf->sf_addr; scount++; stotal++; if ((type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(*psf_list, mc_dereference(psf->sf_next, idev)); kfree_rcu(psf, rcu); continue; } } psf_prev = psf; } empty_source: if (!stotal) { if (type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) return skb; if (pmc->mca_crcount || isquery || crsend) { /* make sure we have room for group header */ if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) { mld_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) pgr->grec_nsrcs = htons(scount); if (isquery) pmc->mca_flags &= ~MAF_GSQUERY; /* clear query state */ return skb; } /* called with mc_lock */ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) { struct sk_buff *skb = NULL; int type; if (!pmc) { for_each_mc_mclock(idev, pmc) { if (pmc->mca_flags & MAF_NOREPORT) continue; if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); } } else { if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); } if (skb) mld_sendpack(skb); } /* * remove zero-count source records from a source filter list * called with mc_lock */ static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *idev) { struct ip6_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; for (psf = mc_dereference(*ppsf, idev); psf; psf = psf_next) { psf_next = mc_dereference(psf->sf_next, idev); if (psf->sf_crcount == 0) { if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(*ppsf, mc_dereference(psf->sf_next, idev)); kfree_rcu(psf, rcu); } else { psf_prev = psf; } } } /* called with mc_lock */ static void mld_send_cr(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next; struct sk_buff *skb = NULL; int type, dtype; /* deleted MCA's */ pmc_prev = NULL; for (pmc = mc_dereference(idev->mc_tomb, idev); pmc; pmc = pmc_next) { pmc_next = mc_dereference(pmc->next, idev); if (pmc->mca_sfmode == MCAST_INCLUDE) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; skb = add_grec(skb, pmc, type, 1, 0, 0); skb = add_grec(skb, pmc, dtype, 1, 1, 0); } if (pmc->mca_crcount) { if (pmc->mca_sfmode == MCAST_EXCLUDE) { type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 1, 0, 0); } pmc->mca_crcount--; if (pmc->mca_crcount == 0) { mld_clear_zeros(&pmc->mca_tomb, idev); mld_clear_zeros(&pmc->mca_sources, idev); } } if (pmc->mca_crcount == 0 && !rcu_access_pointer(pmc->mca_tomb) && !rcu_access_pointer(pmc->mca_sources)) { if (pmc_prev) rcu_assign_pointer(pmc_prev->next, pmc_next); else rcu_assign_pointer(idev->mc_tomb, pmc_next); in6_dev_put(pmc->idev); kfree_rcu(pmc, rcu); } else pmc_prev = pmc; } /* change recs */ for_each_mc_mclock(idev, pmc) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_ALLOW_NEW_SOURCES; } else { type = MLD2_ALLOW_NEW_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; } skb = add_grec(skb, pmc, type, 0, 0, 0); skb = add_grec(skb, pmc, dtype, 0, 1, 0); /* deleted sources */ /* filter mode changes */ if (pmc->mca_crcount) { if (pmc->mca_sfmode == MCAST_EXCLUDE) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); pmc->mca_crcount--; } } if (!skb) return; (void) mld_sendpack(skb); } static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) { struct net *net = dev_net(dev); struct sock *sk = net->ipv6.igmp_sk; struct inet6_dev *idev; struct sk_buff *skb; struct mld_msg *hdr; const struct in6_addr *snd_addr, *saddr; struct in6_addr addr_buf; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; int err, len, payload_len, full_len; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; struct flowi6 fl6; struct dst_entry *dst; if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; else snd_addr = addr; len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; rcu_read_lock(); IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTREQUESTS); rcu_read_unlock(); skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); if (!skb) { rcu_read_lock(); IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; } skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. */ saddr = &in6addr_any; } else saddr = &addr_buf; ip6_mc_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); skb_put_data(skb, ra, sizeof(ra)); hdr = skb_put_zero(skb, sizeof(struct mld_msg)); hdr->mld_type = type; hdr->mld_mca = *addr; hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len, IPPROTO_ICMPV6, csum_partial(hdr, len, 0)); rcu_read_lock(); idev = __in6_dev_get(skb->dev); icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto err_out; } skb_dst_set(skb, dst); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb->dev, dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } else IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; err_out: kfree_skb(skb); goto out; } /* called with mc_lock */ static void mld_send_initial_cr(struct inet6_dev *idev) { struct sk_buff *skb; struct ifmcaddr6 *pmc; int type; if (mld_in_v1_mode(idev)) return; skb = NULL; for_each_mc_mclock(idev, pmc) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_ALLOW_NEW_SOURCES; skb = add_grec(skb, pmc, type, 0, 0, 1); } if (skb) mld_sendpack(skb); } void ipv6_mc_dad_complete(struct inet6_dev *idev) { mutex_lock(&idev->mc_lock); idev->mc_dad_count = idev->mc_qrv; if (idev->mc_dad_count) { mld_send_initial_cr(idev); idev->mc_dad_count--; if (idev->mc_dad_count) mld_dad_start_work(idev, unsolicited_report_interval(idev)); } mutex_unlock(&idev->mc_lock); } static void mld_dad_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_dad_work); mutex_lock(&idev->mc_lock); mld_send_initial_cr(idev); if (idev->mc_dad_count) { idev->mc_dad_count--; if (idev->mc_dad_count) mld_dad_start_work(idev, unsolicited_report_interval(idev)); } mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } /* called with mc_lock */ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; int rv = 0; psf_prev = NULL; for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; } if (!psf || psf->sf_count[sfmode] == 0) { /* source filter not found, or count wrong => bug */ return -ESRCH; } WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] - 1); if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { struct inet6_dev *idev = pmc->idev; /* no more filters for this source */ if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(pmc->mca_sources, mc_dereference(psf->sf_next, idev)); if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && !mld_in_v1_mode(idev)) { psf->sf_crcount = idev->mc_qrv; rcu_assign_pointer(psf->sf_next, mc_dereference(pmc->mca_tomb, idev)); rcu_assign_pointer(pmc->mca_tomb, psf); rv = 1; } else { kfree_rcu(psf, rcu); } } return rv; } /* called with mc_lock */ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) { struct ifmcaddr6 *pmc; int changerec = 0; int i, err; if (!idev) return -ENODEV; for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } if (!pmc) return -ESRCH; sf_markstate(pmc); if (!delta) { if (!pmc->mca_sfcount[sfmode]) return -EINVAL; pmc->mca_sfcount[sfmode]--; } err = 0; for (i = 0; i < sfcount; i++) { int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; if (!err && rv < 0) err = rv; } if (pmc->mca_sfmode == MCAST_EXCLUDE && pmc->mca_sfcount[MCAST_EXCLUDE] == 0 && pmc->mca_sfcount[MCAST_INCLUDE]) { struct ip6_sf_list *psf; /* filter mode change */ pmc->mca_sfmode = MCAST_INCLUDE; pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; for_each_psf_mclock(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(pmc->idev); } else if (sf_setstate(pmc) || changerec) { mld_ifc_event(pmc->idev); } return err; } /* * Add multicast single-source filter to the interface list * called with mc_lock */ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; psf_prev = NULL; for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; } if (!psf) { psf = kzalloc(sizeof(*psf), GFP_KERNEL); if (!psf) return -ENOBUFS; psf->sf_addr = *psfsrc; if (psf_prev) { rcu_assign_pointer(psf_prev->sf_next, psf); } else { rcu_assign_pointer(pmc->mca_sources, psf); } } WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1); return 0; } /* called with mc_lock */ static void sf_markstate(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else { psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; } } } /* called with mc_lock */ static int sf_setstate(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *dpsf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; int qrv = pmc->idev->mc_qrv; int new_in, rv; rv = 0; for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else new_in = psf->sf_count[MCAST_INCLUDE] != 0; if (new_in) { if (!psf->sf_oldin) { struct ip6_sf_list *prev = NULL; for_each_psf_tomb(pmc, dpsf) { if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; prev = dpsf; } if (dpsf) { if (prev) rcu_assign_pointer(prev->sf_next, mc_dereference(dpsf->sf_next, pmc->idev)); else rcu_assign_pointer(pmc->mca_tomb, mc_dereference(dpsf->sf_next, pmc->idev)); kfree_rcu(dpsf, rcu); } psf->sf_crcount = qrv; rv++; } } else if (psf->sf_oldin) { psf->sf_crcount = 0; /* * add or update "delete" records if an active filter * is now inactive */ for_each_psf_tomb(pmc, dpsf) if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; if (!dpsf) { dpsf = kmalloc(sizeof(*dpsf), GFP_KERNEL); if (!dpsf) continue; *dpsf = *psf; rcu_assign_pointer(dpsf->sf_next, mc_dereference(pmc->mca_tomb, pmc->idev)); rcu_assign_pointer(pmc->mca_tomb, dpsf); } dpsf->sf_crcount = qrv; rv++; } } return rv; } /* * Add multicast source filter list to the interface list * called with mc_lock */ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) { struct ifmcaddr6 *pmc; int isexclude; int i, err; if (!idev) return -ENODEV; for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } if (!pmc) return -ESRCH; sf_markstate(pmc); isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; if (!delta) WRITE_ONCE(pmc->mca_sfcount[sfmode], pmc->mca_sfcount[sfmode] + 1); err = 0; for (i = 0; i < sfcount; i++) { err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } if (err) { int j; if (!delta) WRITE_ONCE(pmc->mca_sfcount[sfmode], pmc->mca_sfcount[sfmode] - 1); for (j = 0; j < i; j++) ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { struct ip6_sf_list *psf; /* filter mode change */ if (pmc->mca_sfcount[MCAST_EXCLUDE]) pmc->mca_sfmode = MCAST_EXCLUDE; else if (pmc->mca_sfcount[MCAST_INCLUDE]) pmc->mca_sfmode = MCAST_INCLUDE; /* else no filters; keep old mode for reports */ pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; for_each_psf_mclock(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(idev); } else if (sf_setstate(pmc)) { mld_ifc_event(idev); } return err; } /* called with mc_lock */ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *nextpsf; for (psf = mc_dereference(pmc->mca_tomb, pmc->idev); psf; psf = nextpsf) { nextpsf = mc_dereference(psf->sf_next, pmc->idev); kfree_rcu(psf, rcu); } RCU_INIT_POINTER(pmc->mca_tomb, NULL); for (psf = mc_dereference(pmc->mca_sources, pmc->idev); psf; psf = nextpsf) { nextpsf = mc_dereference(psf->sf_next, pmc->idev); kfree_rcu(psf, rcu); } RCU_INIT_POINTER(pmc->mca_sources, NULL); pmc->mca_sfmode = MCAST_EXCLUDE; pmc->mca_sfcount[MCAST_INCLUDE] = 0; /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */ WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1); } /* called with mc_lock */ static void igmp6_join_group(struct ifmcaddr6 *ma) { unsigned long delay; if (ma->mca_flags & MAF_NOREPORT) return; igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); delay = get_random_u32_below(unsolicited_report_interval(ma->idev)); if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); delay = ma->mca_work.timer.expires - jiffies; } if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER; } static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev) { struct ip6_sf_socklist *psl; int err; psl = sock_dereference(iml->sflist, sk); if (idev) mutex_lock(&idev->mc_lock); if (!psl) { /* any-source empty exclude case */ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); } else { err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, psl->sl_count, psl->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } if (idev) mutex_unlock(&idev->mc_lock); return err; } /* called with mc_lock */ static void igmp6_leave_group(struct ifmcaddr6 *ma) { if (mld_in_v1_mode(ma->idev)) { if (ma->mca_flags & MAF_LAST_REPORTER) { igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REDUCTION); } } else { mld_add_delrec(ma->idev, ma); mld_ifc_event(ma->idev); } } static void mld_gq_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_gq_work); mutex_lock(&idev->mc_lock); mld_send_report(idev, NULL); idev->mc_gq_running = 0; mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } static void mld_ifc_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_ifc_work); mutex_lock(&idev->mc_lock); mld_send_cr(idev); if (idev->mc_ifc_count) { idev->mc_ifc_count--; if (idev->mc_ifc_count) mld_ifc_start_work(idev, unsolicited_report_interval(idev)); } mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } /* called with mc_lock */ static void mld_ifc_event(struct inet6_dev *idev) { if (mld_in_v1_mode(idev)) return; idev->mc_ifc_count = idev->mc_qrv; mld_ifc_start_work(idev, 1); } static void mld_mca_work(struct work_struct *work) { struct ifmcaddr6 *ma = container_of(to_delayed_work(work), struct ifmcaddr6, mca_work); mutex_lock(&ma->idev->mc_lock); if (mld_in_v1_mode(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); else mld_send_report(ma->idev, ma); ma->mca_flags |= MAF_LAST_REPORTER; ma->mca_flags &= ~MAF_TIMER_RUNNING; mutex_unlock(&ma->idev->mc_lock); ma_put(ma); } /* Device changing type */ void ipv6_mc_unmap(struct inet6_dev *idev) { struct ifmcaddr6 *i; /* Install multicast list, except for all-nodes (already installed) */ mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, i) igmp6_group_dropped(i); mutex_unlock(&idev->mc_lock); } void ipv6_mc_remap(struct inet6_dev *idev) { ipv6_mc_up(idev); } /* Device going down */ void ipv6_mc_down(struct inet6_dev *idev) { struct ifmcaddr6 *i; mutex_lock(&idev->mc_lock); /* Withdraw multicast list */ for_each_mc_mclock(idev, i) igmp6_group_dropped(i); mutex_unlock(&idev->mc_lock); /* Should stop work after group drop. or we will * start work again in mld_ifc_event() */ mld_query_stop_work(idev); mld_report_stop_work(idev); mutex_lock(&idev->mc_lock); mld_ifc_stop_work(idev); mld_gq_stop_work(idev); mutex_unlock(&idev->mc_lock); mld_dad_stop_work(idev); } static void ipv6_mc_reset(struct inet6_dev *idev) { idev->mc_qrv = sysctl_mld_qrv; idev->mc_qi = MLD_QI_DEFAULT; idev->mc_qri = MLD_QRI_DEFAULT; idev->mc_v1_seen = 0; idev->mc_maxdelay = unsolicited_report_interval(idev); } /* Device going up */ void ipv6_mc_up(struct inet6_dev *idev) { struct ifmcaddr6 *i; /* Install multicast list, except for all-nodes (already installed) */ ipv6_mc_reset(idev); mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, i) { mld_del_delrec(idev, i); igmp6_group_added(i); } mutex_unlock(&idev->mc_lock); } /* IPv6 device initialization. */ void ipv6_mc_init_dev(struct inet6_dev *idev) { idev->mc_gq_running = 0; INIT_DELAYED_WORK(&idev->mc_gq_work, mld_gq_work); RCU_INIT_POINTER(idev->mc_tomb, NULL); idev->mc_ifc_count = 0; INIT_DELAYED_WORK(&idev->mc_ifc_work, mld_ifc_work); INIT_DELAYED_WORK(&idev->mc_dad_work, mld_dad_work); INIT_DELAYED_WORK(&idev->mc_query_work, mld_query_work); INIT_DELAYED_WORK(&idev->mc_report_work, mld_report_work); skb_queue_head_init(&idev->mc_query_queue); skb_queue_head_init(&idev->mc_report_queue); spin_lock_init(&idev->mc_query_lock); spin_lock_init(&idev->mc_report_lock); mutex_init(&idev->mc_lock); ipv6_mc_reset(idev); } /* * Device is about to be destroyed: clean up. */ void ipv6_mc_destroy_dev(struct inet6_dev *idev) { struct ifmcaddr6 *i; /* Deactivate works */ ipv6_mc_down(idev); mutex_lock(&idev->mc_lock); mld_clear_delrec(idev); mutex_unlock(&idev->mc_lock); mld_clear_query(idev); mld_clear_report(idev); /* Delete all-nodes address. */ /* We cannot call ipv6_dev_mc_dec() directly, our caller in * addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will * fail. */ __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allnodes); if (idev->cnf.forwarding) __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters); mutex_lock(&idev->mc_lock); while ((i = mc_dereference(idev->mc_list, idev))) { rcu_assign_pointer(idev->mc_list, mc_dereference(i->next, idev)); ip6_mc_clear_src(i); ma_put(i); } mutex_unlock(&idev->mc_lock); } static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) { struct ifmcaddr6 *pmc; ASSERT_RTNL(); mutex_lock(&idev->mc_lock); if (mld_in_v1_mode(idev)) { for_each_mc_mclock(idev, pmc) igmp6_join_group(pmc); } else { mld_send_report(idev, NULL); } mutex_unlock(&idev->mc_lock); } static int ipv6_mc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct inet6_dev *idev = __in6_dev_get(dev); switch (event) { case NETDEV_RESEND_IGMP: if (idev) ipv6_mc_rejoin_groups(idev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block igmp6_netdev_notifier = { .notifier_call = ipv6_mc_netdev_event, }; #ifdef CONFIG_PROC_FS struct igmp6_mc_iter_state { struct seq_net_private p; struct net_device *dev; struct inet6_dev *idev; }; #define igmp6_mc_seq_private(seq) ((struct igmp6_mc_iter_state *)(seq)->private) static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq) { struct ifmcaddr6 *im = NULL; struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); struct net *net = seq_file_net(seq); state->idev = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; idev = __in6_dev_get(state->dev); if (!idev) continue; im = rcu_dereference(idev->mc_list); if (im) { state->idev = idev; break; } } return im; } static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr6 *im) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); im = rcu_dereference(im->next); while (!im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; break; } state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; im = rcu_dereference(state->idev->mc_list); } return im; } static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos) { struct ifmcaddr6 *im = igmp6_mc_get_first(seq); if (im) while (pos && (im = igmp6_mc_get_next(seq, im)) != NULL) --pos; return pos ? NULL : im; } static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return igmp6_mc_get_idx(seq, *pos); } static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ifmcaddr6 *im = igmp6_mc_get_next(seq, v); ++*pos; return im; } static void igmp6_mc_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); if (likely(state->idev)) state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp6_mc_seq_show(struct seq_file *seq, void *v) { struct ifmcaddr6 *im = (struct ifmcaddr6 *)v; struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); seq_printf(seq, "%-4d %-15s %pi6 %5d %08X %ld\n", state->dev->ifindex, state->dev->name, &im->mca_addr, im->mca_users, im->mca_flags, (im->mca_flags & MAF_TIMER_RUNNING) ? jiffies_to_clock_t(im->mca_work.timer.expires - jiffies) : 0); return 0; } static const struct seq_operations igmp6_mc_seq_ops = { .start = igmp6_mc_seq_start, .next = igmp6_mc_seq_next, .stop = igmp6_mc_seq_stop, .show = igmp6_mc_seq_show, }; struct igmp6_mcf_iter_state { struct seq_net_private p; struct net_device *dev; struct inet6_dev *idev; struct ifmcaddr6 *im; }; #define igmp6_mcf_seq_private(seq) ((struct igmp6_mcf_iter_state *)(seq)->private) static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) { struct ip6_sf_list *psf = NULL; struct ifmcaddr6 *im = NULL; struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); struct net *net = seq_file_net(seq); state->idev = NULL; state->im = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; idev = __in6_dev_get(state->dev); if (unlikely(idev == NULL)) continue; im = rcu_dereference(idev->mc_list); if (likely(im)) { psf = rcu_dereference(im->mca_sources); if (likely(psf)) { state->im = im; state->idev = idev; break; } } } return psf; } static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_sf_list *psf) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); psf = rcu_dereference(psf->sf_next); while (!psf) { state->im = rcu_dereference(state->im->next); while (!state->im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; state->im = rcu_dereference(state->idev->mc_list); } psf = rcu_dereference(state->im->mca_sources); } out: return psf; } static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos) { struct ip6_sf_list *psf = igmp6_mcf_get_first(seq); if (psf) while (pos && (psf = igmp6_mcf_get_next(seq, psf)) != NULL) --pos; return pos ? NULL : psf; } static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return *pos ? igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip6_sf_list *psf; if (v == SEQ_START_TOKEN) psf = igmp6_mcf_get_first(seq); else psf = igmp6_mcf_get_next(seq, v); ++*pos; return psf; } static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); if (likely(state->im)) state->im = NULL; if (likely(state->idev)) state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) { struct ip6_sf_list *psf = (struct ip6_sf_list *)v; struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Idx Device Multicast Address Source Address INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s %pi6 %pi6 %6lu %6lu\n", state->dev->ifindex, state->dev->name, &state->im->mca_addr, &psf->sf_addr, READ_ONCE(psf->sf_count[MCAST_INCLUDE]), READ_ONCE(psf->sf_count[MCAST_EXCLUDE])); } return 0; } static const struct seq_operations igmp6_mcf_seq_ops = { .start = igmp6_mcf_seq_start, .next = igmp6_mcf_seq_next, .stop = igmp6_mcf_seq_stop, .show = igmp6_mcf_seq_show, }; static int __net_init igmp6_proc_init(struct net *net) { int err; err = -ENOMEM; if (!proc_create_net("igmp6", 0444, net->proc_net, &igmp6_mc_seq_ops, sizeof(struct igmp6_mc_iter_state))) goto out; if (!proc_create_net("mcfilter6", 0444, net->proc_net, &igmp6_mcf_seq_ops, sizeof(struct igmp6_mcf_iter_state))) goto out_proc_net_igmp6; err = 0; out: return err; out_proc_net_igmp6: remove_proc_entry("igmp6", net->proc_net); goto out; } static void __net_exit igmp6_proc_exit(struct net *net) { remove_proc_entry("mcfilter6", net->proc_net); remove_proc_entry("igmp6", net->proc_net); } #else static inline int igmp6_proc_init(struct net *net) { return 0; } static inline void igmp6_proc_exit(struct net *net) { } #endif static int __net_init igmp6_net_init(struct net *net) { int err; err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { pr_err("Failed to initialize the IGMP6 control socket (err %d)\n", err); goto out; } inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1; net->ipv6.igmp_sk->sk_allocation = GFP_KERNEL; err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { pr_err("Failed to initialize the IGMP6 autojoin socket (err %d)\n", err); goto out_sock_create; } err = igmp6_proc_init(net); if (err) goto out_sock_create_autojoin; return 0; out_sock_create_autojoin: inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); out_sock_create: inet_ctl_sock_destroy(net->ipv6.igmp_sk); out: return err; } static void __net_exit igmp6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.igmp_sk); inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); igmp6_proc_exit(net); } static struct pernet_operations igmp6_net_ops = { .init = igmp6_net_init, .exit = igmp6_net_exit, }; int __init igmp6_init(void) { int err; err = register_pernet_subsys(&igmp6_net_ops); if (err) return err; mld_wq = create_workqueue("mld"); if (!mld_wq) { unregister_pernet_subsys(&igmp6_net_ops); return -ENOMEM; } return err; } int __init igmp6_late_init(void) { return register_netdevice_notifier(&igmp6_netdev_notifier); } void igmp6_cleanup(void) { unregister_pernet_subsys(&igmp6_net_ops); destroy_workqueue(mld_wq); } void igmp6_late_cleanup(void) { unregister_netdevice_notifier(&igmp6_netdev_notifier); } |
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM csd #if !defined(_TRACE_CSD_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CSD_H #include <linux/tracepoint.h> TRACE_EVENT(csd_queue_cpu, TP_PROTO(const unsigned int cpu, unsigned long callsite, smp_call_func_t func, call_single_data_t *csd), TP_ARGS(cpu, callsite, func, csd), TP_STRUCT__entry( __field(unsigned int, cpu) __field(void *, callsite) __field(void *, func) __field(void *, csd) ), TP_fast_assign( __entry->cpu = cpu; __entry->callsite = (void *)callsite; __entry->func = func; __entry->csd = csd; ), TP_printk("cpu=%u callsite=%pS func=%ps csd=%p", __entry->cpu, __entry->callsite, __entry->func, __entry->csd) ); /* * Tracepoints for a function which is called as an effect of smp_call_function.* */ DECLARE_EVENT_CLASS(csd_function, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd), TP_STRUCT__entry( __field(void *, func) __field(void *, csd) ), TP_fast_assign( __entry->func = func; __entry->csd = csd; ), TP_printk("func=%ps, csd=%p", __entry->func, __entry->csd) ); DEFINE_EVENT(csd_function, csd_function_entry, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd) ); DEFINE_EVENT(csd_function, csd_function_exit, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd) ); #endif /* _TRACE_CSD_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #ifndef __KVM_ARM_PSCI_H__ #define __KVM_ARM_PSCI_H__ #include <linux/kvm_host.h> #include <uapi/linux/psci.h> #define KVM_ARM_PSCI_0_1 PSCI_VERSION(0, 1) #define KVM_ARM_PSCI_0_2 PSCI_VERSION(0, 2) #define KVM_ARM_PSCI_1_0 PSCI_VERSION(1, 0) #define KVM_ARM_PSCI_1_1 PSCI_VERSION(1, 1) #define KVM_ARM_PSCI_1_2 PSCI_VERSION(1, 2) #define KVM_ARM_PSCI_1_3 PSCI_VERSION(1, 3) #define KVM_ARM_PSCI_LATEST KVM_ARM_PSCI_1_3 static inline int kvm_psci_version(struct kvm_vcpu *vcpu) { /* * Our PSCI implementation stays the same across versions from * v0.2 onward, only adding the few mandatory functions (such * as FEATURES with 1.0) that are required by newer * revisions. It is thus safe to return the latest, unless * userspace has instructed us otherwise. */ if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2)) { if (vcpu->kvm->arch.psci_version) return vcpu->kvm->arch.psci_version; return KVM_ARM_PSCI_LATEST; } return KVM_ARM_PSCI_0_1; } int kvm_psci_call(struct kvm_vcpu *vcpu); #endif /* __KVM_ARM_PSCI_H__ */ |
| 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_NETLINK_H #define __NET_NETLINK_H #include <linux/types.h> #include <linux/netlink.h> #include <linux/jiffies.h> #include <linux/in6.h> /* ======================================================================== * Netlink Messages and Attributes Interface (As Seen On TV) * ------------------------------------------------------------------------ * Messages Interface * ------------------------------------------------------------------------ * * Message Format: * <--- nlmsg_total_size(payload) ---> * <-- nlmsg_msg_size(payload) -> * +----------+- - -+-------------+- - -+-------- - - * | nlmsghdr | Pad | Payload | Pad | nlmsghdr * +----------+- - -+-------------+- - -+-------- - - * nlmsg_data(nlh)---^ ^ * nlmsg_next(nlh)-----------------------+ * * Payload Format: * <---------------------- nlmsg_len(nlh) ---------------------> * <------ hdrlen ------> <- nlmsg_attrlen(nlh, hdrlen) -> * +----------------------+- - -+--------------------------------+ * | Family Header | Pad | Attributes | * +----------------------+- - -+--------------------------------+ * nlmsg_attrdata(nlh, hdrlen)---^ * * Data Structures: * struct nlmsghdr netlink message header * * Message Construction: * nlmsg_new() create a new netlink message * nlmsg_put() add a netlink message to an skb * nlmsg_put_answer() callback based nlmsg_put() * nlmsg_end() finalize netlink message * nlmsg_get_pos() return current position in message * nlmsg_trim() trim part of message * nlmsg_cancel() cancel message construction * nlmsg_consume() free a netlink message (expected) * nlmsg_free() free a netlink message (drop) * * Message Sending: * nlmsg_multicast() multicast message to several groups * nlmsg_unicast() unicast a message to a single socket * nlmsg_notify() send notification message * * Message Length Calculations: * nlmsg_msg_size(payload) length of message w/o padding * nlmsg_total_size(payload) length of message w/ padding * nlmsg_padlen(payload) length of padding at tail * * Message Payload Access: * nlmsg_data(nlh) head of message payload * nlmsg_len(nlh) length of message payload * nlmsg_attrdata(nlh, hdrlen) head of attributes data * nlmsg_attrlen(nlh, hdrlen) length of attributes data * * Message Parsing: * nlmsg_ok(nlh, remaining) does nlh fit into remaining bytes? * nlmsg_next(nlh, remaining) get next netlink message * nlmsg_parse() parse attributes of a message * nlmsg_find_attr() find an attribute in a message * nlmsg_for_each_msg() loop over all messages * nlmsg_validate() validate netlink message incl. attrs * nlmsg_for_each_attr() loop over all attributes * * Misc: * nlmsg_report() report back to application? * * ------------------------------------------------------------------------ * Attributes Interface * ------------------------------------------------------------------------ * * Attribute Format: * <------- nla_total_size(payload) -------> * <---- nla_attr_size(payload) -----> * +----------+- - -+- - - - - - - - - +- - -+-------- - - * | Header | Pad | Payload | Pad | Header * +----------+- - -+- - - - - - - - - +- - -+-------- - - * <- nla_len(nla) -> ^ * nla_data(nla)----^ | * nla_next(nla)-----------------------------' * * Data Structures: * struct nlattr netlink attribute header * * Attribute Construction: * nla_reserve(skb, type, len) reserve room for an attribute * nla_reserve_nohdr(skb, len) reserve room for an attribute w/o hdr * nla_put(skb, type, len, data) add attribute to skb * nla_put_nohdr(skb, len, data) add attribute w/o hdr * nla_append(skb, len, data) append data to skb * * Attribute Construction for Basic Types: * nla_put_u8(skb, type, value) add u8 attribute to skb * nla_put_u16(skb, type, value) add u16 attribute to skb * nla_put_u32(skb, type, value) add u32 attribute to skb * nla_put_u64_64bit(skb, type, * value, padattr) add u64 attribute to skb * nla_put_s8(skb, type, value) add s8 attribute to skb * nla_put_s16(skb, type, value) add s16 attribute to skb * nla_put_s32(skb, type, value) add s32 attribute to skb * nla_put_s64(skb, type, value, * padattr) add s64 attribute to skb * nla_put_string(skb, type, str) add string attribute to skb * nla_put_flag(skb, type) add flag attribute to skb * nla_put_msecs(skb, type, jiffies, * padattr) add msecs attribute to skb * nla_put_in_addr(skb, type, addr) add IPv4 address attribute to skb * nla_put_in6_addr(skb, type, addr) add IPv6 address attribute to skb * * Nested Attributes Construction: * nla_nest_start(skb, type) start a nested attribute * nla_nest_end(skb, nla) finalize a nested attribute * nla_nest_cancel(skb, nla) cancel nested attribute construction * * Attribute Length Calculations: * nla_attr_size(payload) length of attribute w/o padding * nla_total_size(payload) length of attribute w/ padding * nla_padlen(payload) length of padding * * Attribute Payload Access: * nla_data(nla) head of attribute payload * nla_len(nla) length of attribute payload * * Attribute Payload Access for Basic Types: * nla_get_uint(nla) get payload for a uint attribute * nla_get_sint(nla) get payload for a sint attribute * nla_get_u8(nla) get payload for a u8 attribute * nla_get_u16(nla) get payload for a u16 attribute * nla_get_u32(nla) get payload for a u32 attribute * nla_get_u64(nla) get payload for a u64 attribute * nla_get_s8(nla) get payload for a s8 attribute * nla_get_s16(nla) get payload for a s16 attribute * nla_get_s32(nla) get payload for a s32 attribute * nla_get_s64(nla) get payload for a s64 attribute * nla_get_flag(nla) return 1 if flag is true * nla_get_msecs(nla) get payload for a msecs attribute * * The same functions also exist with _default(). * * Attribute Misc: * nla_memcpy(dest, nla, count) copy attribute into memory * nla_memcmp(nla, data, size) compare attribute with memory area * nla_strscpy(dst, nla, size) copy attribute to a sized string * nla_strcmp(nla, str) compare attribute with string * * Attribute Parsing: * nla_ok(nla, remaining) does nla fit into remaining bytes? * nla_next(nla, remaining) get next netlink attribute * nla_validate() validate a stream of attributes * nla_validate_nested() validate a stream of nested attributes * nla_find() find attribute in stream of attributes * nla_find_nested() find attribute in nested attributes * nla_parse() parse and validate stream of attrs * nla_parse_nested() parse nested attributes * nla_for_each_attr() loop over all attributes * nla_for_each_attr_type() loop over all attributes with the * given type * nla_for_each_nested() loop over the nested attributes * nla_for_each_nested_type() loop over the nested attributes with * the given type *========================================================================= */ /** * Standard attribute types to specify validation policy */ enum { NLA_UNSPEC, NLA_U8, NLA_U16, NLA_U32, NLA_U64, NLA_STRING, NLA_FLAG, NLA_MSECS, NLA_NESTED, NLA_NESTED_ARRAY, NLA_NUL_STRING, NLA_BINARY, NLA_S8, NLA_S16, NLA_S32, NLA_S64, NLA_BITFIELD32, NLA_REJECT, NLA_BE16, NLA_BE32, NLA_SINT, NLA_UINT, __NLA_TYPE_MAX, }; #define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1) struct netlink_range_validation { u64 min, max; }; struct netlink_range_validation_signed { s64 min, max; }; enum nla_policy_validation { NLA_VALIDATE_NONE, NLA_VALIDATE_RANGE, NLA_VALIDATE_RANGE_WARN_TOO_LONG, NLA_VALIDATE_MIN, NLA_VALIDATE_MAX, NLA_VALIDATE_MASK, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, }; /** * struct nla_policy - attribute validation policy * @type: Type of attribute or NLA_UNSPEC * @validation_type: type of attribute validation done in addition to * type-specific validation (e.g. range, function call), see * &enum nla_policy_validation * @len: Type specific length of payload * * Policies are defined as arrays of this struct, the array must be * accessible by attribute type up to the highest identifier to be expected. * * Meaning of `len' field: * NLA_STRING Maximum length of string * NLA_NUL_STRING Maximum length of string (excluding NUL) * NLA_FLAG Unused * NLA_BINARY Maximum length of attribute payload * (but see also below with the validation type) * NLA_NESTED, * NLA_NESTED_ARRAY Length verification is done by checking len of * nested header (or empty); len field is used if * nested_policy is also used, for the max attr * number in the nested policy. * NLA_SINT, NLA_UINT, * NLA_U8, NLA_U16, * NLA_U32, NLA_U64, * NLA_S8, NLA_S16, * NLA_S32, NLA_S64, * NLA_BE16, NLA_BE32, * NLA_MSECS Leaving the length field zero will verify the * given type fits, using it verifies minimum length * just like "All other" * NLA_BITFIELD32 Unused * NLA_REJECT Unused * All other Minimum length of attribute payload * * Meaning of validation union: * NLA_BITFIELD32 This is a 32-bit bitmap/bitselector attribute and * `bitfield32_valid' is the u32 value of valid flags * NLA_REJECT This attribute is always rejected and `reject_message' * may point to a string to report as the error instead * of the generic one in extended ACK. * NLA_NESTED `nested_policy' to a nested policy to validate, must * also set `len' to the max attribute number. Use the * provided NLA_POLICY_NESTED() macro. * Note that nla_parse() will validate, but of course not * parse, the nested sub-policies. * NLA_NESTED_ARRAY `nested_policy' points to a nested policy to validate, * must also set `len' to the max attribute number. Use * the provided NLA_POLICY_NESTED_ARRAY() macro. * The difference to NLA_NESTED is the structure: * NLA_NESTED has the nested attributes directly inside * while an array has the nested attributes at another * level down and the attribute types directly in the * nesting don't matter. * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64, * NLA_BE16, * NLA_BE32, * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, * NLA_S64 The `min' and `max' fields are used depending on the * validation_type field, if that is min/max/range then * the min, max or both are used (respectively) to check * the value of the integer attribute. * Note that in the interest of code simplicity and * struct size both limits are s16, so you cannot * enforce a range that doesn't fall within the range * of s16 - do that using the NLA_POLICY_FULL_RANGE() * or NLA_POLICY_FULL_RANGE_SIGNED() macros instead. * Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and * NLA_POLICY_RANGE() macros. * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64 If the validation_type field instead is set to * NLA_VALIDATE_RANGE_PTR, `range' must be a pointer * to a struct netlink_range_validation that indicates * the min/max values. * Use NLA_POLICY_FULL_RANGE(). * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, * NLA_S64 If the validation_type field instead is set to * NLA_VALIDATE_RANGE_PTR, `range_signed' must be a * pointer to a struct netlink_range_validation_signed * that indicates the min/max values. * Use NLA_POLICY_FULL_RANGE_SIGNED(). * * NLA_BINARY If the validation type is like the ones for integers * above, then the min/max length (not value like for * integers) of the attribute is enforced. * * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: * NLA_BINARY Validation function called for the attribute. * All other Unused - but note that it's a union * * Example: * * static const u32 myvalidflags = 0xff231023; * * static const struct nla_policy my_policy[ATTR_MAX+1] = { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, * [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)), * [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags), * }; */ struct nla_policy { u8 type; u8 validation_type; u16 len; union { /** * @strict_start_type: first attribute to validate strictly * * This entry is special, and used for the attribute at index 0 * only, and specifies special data about the policy, namely it * specifies the "boundary type" where strict length validation * starts for any attribute types >= this value, also, strict * nesting validation starts here. * * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT * for any types >= this, so need to use NLA_POLICY_MIN_LEN() to * get the previous pure { .len = xyz } behaviour. The advantage * of this is that types not specified in the policy will be * rejected. * * For completely new families it should be set to 1 so that the * validation is enforced for all attributes. For existing ones * it should be set at least when new attributes are added to * the enum used by the policy, and be set to the new value that * was added to enforce strict validation from thereon. */ u16 strict_start_type; /* private: use NLA_POLICY_*() to set */ const u32 bitfield32_valid; const u32 mask; const char *reject_message; const struct nla_policy *nested_policy; const struct netlink_range_validation *range; const struct netlink_range_validation_signed *range_signed; struct { s16 min, max; }; int (*validate)(const struct nlattr *attr, struct netlink_ext_ack *extack); }; }; #define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN) #define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN) #define _NLA_POLICY_NESTED(maxattr, policy) \ { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr } #define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \ { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr } #define NLA_POLICY_NESTED(policy) \ _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_NESTED_ARRAY(policy) \ _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_BITFIELD32(valid) \ { .type = NLA_BITFIELD32, .bitfield32_valid = valid } #define __NLA_IS_UINT_TYPE(tp) \ (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || \ tp == NLA_U64 || tp == NLA_UINT || \ tp == NLA_BE16 || tp == NLA_BE32) #define __NLA_IS_SINT_TYPE(tp) \ (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64 || \ tp == NLA_SINT) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + tp) #define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_SINT_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_SINT_TYPE(tp)) + tp) #define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ __NLA_IS_SINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_NO_VALIDATION_PTR(tp) \ (__NLA_ENSURE(tp != NLA_BITFIELD32 && \ tp != NLA_REJECT && \ tp != NLA_NESTED && \ tp != NLA_NESTED_ARRAY) + tp) #define NLA_POLICY_RANGE(tp, _min, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE, \ .min = _min, \ .max = _max \ } #define NLA_POLICY_FULL_RANGE(tp, _range) { \ .type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE_PTR, \ .range = _range, \ } #define NLA_POLICY_FULL_RANGE_SIGNED(tp, _range) { \ .type = NLA_ENSURE_SINT_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE_PTR, \ .range_signed = _range, \ } #define NLA_POLICY_MIN(tp, _min) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MIN, \ .min = _min, \ } #define NLA_POLICY_MAX(tp, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MAX, \ .max = _max, \ } #define NLA_POLICY_MASK(tp, _mask) { \ .type = NLA_ENSURE_UINT_TYPE(tp), \ .validation_type = NLA_VALIDATE_MASK, \ .mask = _mask, \ } #define NLA_POLICY_VALIDATE_FN(tp, fn, ...) { \ .type = NLA_ENSURE_NO_VALIDATION_PTR(tp), \ .validation_type = NLA_VALIDATE_FUNCTION, \ .validate = fn, \ .len = __VA_ARGS__ + 0, \ } #define NLA_POLICY_EXACT_LEN(_len) NLA_POLICY_RANGE(NLA_BINARY, _len, _len) #define NLA_POLICY_EXACT_LEN_WARN(_len) { \ .type = NLA_BINARY, \ .validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG, \ .min = _len, \ .max = _len \ } #define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len) #define NLA_POLICY_MAX_LEN(_len) NLA_POLICY_MAX(NLA_BINARY, _len) /** * struct nl_info - netlink source information * @nlh: Netlink message header of original request * @nl_net: Network namespace * @portid: Netlink PORTID of requesting application * @skip_notify: Skip netlink notifications to user space * @skip_notify_kernel: Skip selected in-kernel notifications */ struct nl_info { struct nlmsghdr *nlh; struct net *nl_net; u32 portid; u8 skip_notify:1, skip_notify_kernel:1; }; /** * enum netlink_validation - netlink message/attribute validation levels * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about * extra data at the end of the message, attributes being longer than * they should be, or unknown attributes being present. * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing. * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING * this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict(). * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy. * This can safely be set by the kernel when the given policy has no * NLA_UNSPEC anymore, and can thus be used to ensure policy entries * are enforced going forward. * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g. * U8, U16, U32 must have exact size, etc.) * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY) * and unset for other policies. */ enum netlink_validation { NL_VALIDATE_LIBERAL = 0, NL_VALIDATE_TRAILING = BIT(0), NL_VALIDATE_MAXTYPE = BIT(1), NL_VALIDATE_UNSPEC = BIT(2), NL_VALIDATE_STRICT_ATTRS = BIT(3), NL_VALIDATE_NESTED = BIT(4), }; #define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\ NL_VALIDATE_MAXTYPE) #define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\ NL_VALIDATE_MAXTYPE |\ NL_VALIDATE_UNSPEC |\ NL_VALIDATE_STRICT_ATTRS |\ NL_VALIDATE_NESTED) int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)); int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags); int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack); int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack); int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize); char *nla_strdup(const struct nlattr *nla, gfp_t flags); int nla_memcpy(void *dest, const struct nlattr *src, int count); int nla_memcmp(const struct nlattr *nla, const void *data, size_t size); int nla_strcmp(const struct nlattr *nla, const char *str); struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen); struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr); void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen); struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen); struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr); void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen); void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data); void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr); void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data); int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data); int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr); int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data); int nla_append(struct sk_buff *skb, int attrlen, const void *data); /************************************************************************** * Netlink Messages **************************************************************************/ /** * nlmsg_msg_size - length of netlink message not including padding * @payload: length of message payload */ static inline int nlmsg_msg_size(int payload) { return NLMSG_HDRLEN + payload; } /** * nlmsg_total_size - length of netlink message including padding * @payload: length of message payload */ static inline int nlmsg_total_size(int payload) { return NLMSG_ALIGN(nlmsg_msg_size(payload)); } /** * nlmsg_padlen - length of padding at the message's tail * @payload: length of message payload */ static inline int nlmsg_padlen(int payload) { return nlmsg_total_size(payload) - nlmsg_msg_size(payload); } /** * nlmsg_data - head of message payload * @nlh: netlink message header */ static inline void *nlmsg_data(const struct nlmsghdr *nlh) { return (unsigned char *) nlh + NLMSG_HDRLEN; } /** * nlmsg_len - length of message payload * @nlh: netlink message header */ static inline int nlmsg_len(const struct nlmsghdr *nlh) { return nlh->nlmsg_len - NLMSG_HDRLEN; } /** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header */ static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh, int hdrlen) { unsigned char *data = nlmsg_data(nlh); return (struct nlattr *) (data + NLMSG_ALIGN(hdrlen)); } /** * nlmsg_attrlen - length of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header */ static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen) { return nlmsg_len(nlh) - NLMSG_ALIGN(hdrlen); } /** * nlmsg_ok - check if the netlink message fits into the remaining bytes * @nlh: netlink message header * @remaining: number of bytes remaining in message stream */ static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining) { return (remaining >= (int) sizeof(struct nlmsghdr) && nlh->nlmsg_len >= sizeof(struct nlmsghdr) && nlh->nlmsg_len <= remaining); } /** * nlmsg_next - next netlink message in message stream * @nlh: netlink message header * @remaining: number of bytes remaining in message stream * * Returns: the next netlink message in the message stream and * decrements remaining by the size of the current message. */ static inline struct nlmsghdr * nlmsg_next(const struct nlmsghdr *nlh, int *remaining) { int totlen = NLMSG_ALIGN(nlh->nlmsg_len); *remaining -= totlen; return (struct nlmsghdr *) ((unsigned char *) nlh + totlen); } /** * nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be rejected, policy must be specified, attributes * will be validated in the strictest way possible. * * Returns: 0 on success or a negative error code. */ static inline int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_STRICT, extack); } /** * nla_parse_deprecated - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be ignored and attributes from the policy are not * always strictly validated (only for new attributes). * * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be rejected as well as trailing data, but the * policy is not completely strictly validated (only for new attributes). * * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_DEPRECATED_STRICT, extack); } /** * __nlmsg_parse - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * See nla_parse() */ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) { NL_SET_ERR_MSG(extack, "Invalid header length"); return -EINVAL; } return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy, validate, extack); } /** * nlmsg_parse - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse() */ static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * nlmsg_parse_deprecated - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated() */ static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nlmsg_parse_deprecated_strict - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated_strict() */ static inline int nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_DEPRECATED_STRICT, extack); } /** * nlmsg_find_attr - find a specific attribute in a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @attrtype: type of attribute to look for * * Returns: the first attribute which matches the specified type. */ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, int hdrlen, int attrtype) { return nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), attrtype); } /** * nla_validate_deprecated - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in liberal mode. * See documentation of struct nla_policy for more details. * * Returns: 0 on success or a negative error code. */ static inline int nla_validate_deprecated(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in strict mode. * See documentation of struct nla_policy for more details. * * Returns: 0 on success or a negative error code. */ static inline int nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * nlmsg_validate_deprecated - validate a netlink message including attributes * @nlh: netlinket message header * @hdrlen: length of family specific header * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh, int hdrlen, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; return __nla_validate(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nlmsg_report - need to report back to application? * @nlh: netlink message header * * Returns: 1 if a report back to the application is requested. */ static inline int nlmsg_report(const struct nlmsghdr *nlh) { return nlh ? !!(nlh->nlmsg_flags & NLM_F_ECHO) : 0; } /** * nlmsg_seq - return the seq number of netlink message * @nlh: netlink message header * * Returns: 0 if netlink message is NULL */ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) { return nlh ? nlh->nlmsg_seq : 0; } /** * nlmsg_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @nlh: netlink message header * @hdrlen: length of family specific header * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \ nlmsg_attrlen(nlh, hdrlen), rem) /** * nlmsg_put - Add a new netlink message to an skb * @skb: socket buffer to store message in * @portid: netlink PORTID of requesting application * @seq: sequence number of message * @type: message type * @payload: length of message payload * @flags: message flags * * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int payload, int flags) { if (unlikely(skb_tailroom(skb) < nlmsg_total_size(payload))) return NULL; return __nlmsg_put(skb, portid, seq, type, payload, flags); } /** * nlmsg_append - Add more data to a nlmsg in a skb * @skb: socket buffer to store message in * @size: length of message payload * * Append data to an existing nlmsg, used when constructing a message * with multiple fixed-format headers (which is rare). * Returns: NULL if the tailroom of the skb is insufficient to store * the extra payload. */ static inline void *nlmsg_append(struct sk_buff *skb, u32 size) { if (unlikely(skb_tailroom(skb) < NLMSG_ALIGN(size))) return NULL; if (NLMSG_ALIGN(size) - size) memset(skb_tail_pointer(skb) + size, 0, NLMSG_ALIGN(size) - size); return __skb_put(skb, NLMSG_ALIGN(size)); } /** * nlmsg_put_answer - Add a new callback based netlink message to an skb * @skb: socket buffer to store message in * @cb: netlink callback * @type: message type * @payload: length of message payload * @flags: message flags * * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb, struct netlink_callback *cb, int type, int payload, int flags) { return nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, type, payload, flags); } /** * nlmsg_new - Allocate a new netlink message * @payload: size of the message payload * @flags: the type of memory to allocate. * * Use NLMSG_DEFAULT_SIZE if the size of the payload isn't known * and a good default is needed. */ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) { return alloc_skb(nlmsg_total_size(payload), flags); } /** * nlmsg_new_large - Allocate a new netlink message with non-contiguous * physical memory * @payload: size of the message payload * * The allocated skb is unable to have frag page for shinfo->frags*, * as the NULL setting for skb->head in netlink_skb_destructor() will * bypass most of the handling in skb_release_data() */ static inline struct sk_buff *nlmsg_new_large(size_t payload) { return netlink_alloc_large_skb(nlmsg_total_size(payload), 0); } /** * nlmsg_end - Finalize a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header * * Corrects the netlink message header to include the appended * attributes. Only necessary if attributes have been added to * the message. */ static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) { nlh->nlmsg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; } /** * nlmsg_get_pos - return current position in netlink message * @skb: socket buffer the message is stored in * * Returns: a pointer to the current tail of the message. */ static inline void *nlmsg_get_pos(struct sk_buff *skb) { return skb_tail_pointer(skb); } /** * nlmsg_trim - Trim message to a mark * @skb: socket buffer the message is stored in * @mark: mark to trim to * * Trims the message to the provided mark. */ static inline void nlmsg_trim(struct sk_buff *skb, const void *mark) { if (mark) { WARN_ON((unsigned char *) mark < skb->data); skb_trim(skb, (unsigned char *) mark - skb->data); } } /** * nlmsg_cancel - Cancel construction of a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header * * Removes the complete netlink message including all * attributes from the socket buffer again. */ static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh) { nlmsg_trim(skb, nlh); } /** * nlmsg_free - drop a netlink message * @skb: socket buffer of netlink message */ static inline void nlmsg_free(struct sk_buff *skb) { kfree_skb(skb); } /** * nlmsg_consume - free a netlink message * @skb: socket buffer of netlink message */ static inline void nlmsg_consume(struct sk_buff *skb) { consume_skb(skb); } /** * nlmsg_multicast_filtered - multicast a netlink message with filter function * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags * @filter: filter function * @filter_data: filter function private data * * Return: 0 on success, negative error code for failure. */ static inline int nlmsg_multicast_filtered(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags, netlink_filter_fn filter, void *filter_data) { int err; NETLINK_CB(skb).dst_group = group; err = netlink_broadcast_filtered(sk, skb, portid, group, flags, filter, filter_data); if (err > 0) err = 0; return err; } /** * nlmsg_multicast - multicast a netlink message * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags */ static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { return nlmsg_multicast_filtered(sk, skb, portid, group, flags, NULL, NULL); } /** * nlmsg_unicast - unicast a netlink message * @sk: netlink socket to spread message to * @skb: netlink message as socket buffer * @portid: netlink portid of the destination socket */ static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 portid) { int err; err = netlink_unicast(sk, skb, portid, MSG_DONTWAIT); if (err > 0) err = 0; return err; } /** * nlmsg_for_each_msg - iterate over a stream of messages * @pos: loop counter, set to current message * @head: head of message stream * @len: length of message stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_msg(pos, head, len, rem) \ for (pos = head, rem = len; \ nlmsg_ok(pos, rem); \ pos = nlmsg_next(pos, &(rem))) /** * nl_dump_check_consistent - check if sequence is consistent and advertise if not * @cb: netlink callback structure that stores the sequence number * @nlh: netlink message header to write the flag to * * This function checks if the sequence (generation) number changed during dump * and if it did, advertises it in the netlink message header. * * The correct way to use it is to set cb->seq to the generation counter when * all locks for dumping have been acquired, and then call this function for * each message that is generated. * * Note that due to initialisation concerns, 0 is an invalid sequence number * and must not be used by code that uses this functionality. */ static inline void nl_dump_check_consistent(struct netlink_callback *cb, struct nlmsghdr *nlh) { if (cb->prev_seq && cb->seq != cb->prev_seq) nlh->nlmsg_flags |= NLM_F_DUMP_INTR; cb->prev_seq = cb->seq; } /************************************************************************** * Netlink Attributes **************************************************************************/ /** * nla_attr_size - length of attribute not including padding * @payload: length of payload */ static inline int nla_attr_size(int payload) { return NLA_HDRLEN + payload; } /** * nla_total_size - total length of attribute including padding * @payload: length of payload */ static inline int nla_total_size(int payload) { return NLA_ALIGN(nla_attr_size(payload)); } /** * nla_padlen - length of padding at the tail of attribute * @payload: length of payload */ static inline int nla_padlen(int payload) { return nla_total_size(payload) - nla_attr_size(payload); } /** * nla_type - attribute type * @nla: netlink attribute */ static inline int nla_type(const struct nlattr *nla) { return nla->nla_type & NLA_TYPE_MASK; } /** * nla_data - head of payload * @nla: netlink attribute */ static inline void *nla_data(const struct nlattr *nla) { return (char *) nla + NLA_HDRLEN; } /** * nla_len - length of payload * @nla: netlink attribute */ static inline u16 nla_len(const struct nlattr *nla) { return nla->nla_len - NLA_HDRLEN; } /** * nla_ok - check if the netlink attribute fits into the remaining bytes * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream */ static inline int nla_ok(const struct nlattr *nla, int remaining) { return remaining >= (int) sizeof(*nla) && nla->nla_len >= sizeof(*nla) && nla->nla_len <= remaining; } /** * nla_next - next netlink attribute in attribute stream * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream * * Returns: the next netlink attribute in the attribute stream and * decrements remaining by the size of the current attribute. */ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) { unsigned int totlen = NLA_ALIGN(nla->nla_len); *remaining -= totlen; return (struct nlattr *) ((char *) nla + totlen); } /** * nla_find_nested - find attribute in a set of nested attributes * @nla: attribute containing the nested attributes * @attrtype: type of attribute to look for * * Returns: the first attribute which matches the specified type. */ static inline struct nlattr * nla_find_nested(const struct nlattr *nla, int attrtype) { return nla_find(nla_data(nla), nla_len(nla), attrtype); } /** * nla_parse_nested - parse nested attributes * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @nla: attribute containing the nested attributes * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse() */ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, const struct nlattr *nla, const struct nla_policy *policy, struct netlink_ext_ack *extack) { if (!(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing"); return -EINVAL; } return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, NL_VALIDATE_STRICT, extack); } /** * nla_parse_nested_deprecated - parse nested attributes * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @nla: attribute containing the nested attributes * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated() */ static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype, const struct nlattr *nla, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_put_u8 - Add a u8 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value) { /* temporary variables to work around GCC PR81715 with asan-stack=1 */ u8 tmp = value; return nla_put(skb, attrtype, sizeof(u8), &tmp); } /** * nla_put_u16 - Add a u16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value) { u16 tmp = value; return nla_put(skb, attrtype, sizeof(u16), &tmp); } /** * nla_put_be16 - Add a __be16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value) { __be16 tmp = value; return nla_put(skb, attrtype, sizeof(__be16), &tmp); } /** * nla_put_net16 - Add 16-bit network byte order netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value) { __be16 tmp = value; return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** * nla_put_le16 - Add a __le16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value) { __le16 tmp = value; return nla_put(skb, attrtype, sizeof(__le16), &tmp); } /** * nla_put_u32 - Add a u32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) { u32 tmp = value; return nla_put(skb, attrtype, sizeof(u32), &tmp); } /** * nla_put_uint - Add a variable-size unsigned int to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_uint(struct sk_buff *skb, int attrtype, u64 value) { u64 tmp64 = value; u32 tmp32 = value; if (tmp64 == tmp32) return nla_put_u32(skb, attrtype, tmp32); return nla_put(skb, attrtype, sizeof(u64), &tmp64); } /** * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value) { __be32 tmp = value; return nla_put(skb, attrtype, sizeof(__be32), &tmp); } /** * nla_put_net32 - Add 32-bit network byte order netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value) { __be32 tmp = value; return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** * nla_put_le32 - Add a __le32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value) { __le32 tmp = value; return nla_put(skb, attrtype, sizeof(__le32), &tmp); } /** * nla_put_u64_64bit - Add a u64 netlink attribute to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype, u64 value, int padattr) { u64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** * nla_put_be64 - Add a __be64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { __be64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(__be64), &tmp, padattr); } /** * nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { __be64 tmp = value; return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, tmp, padattr); } /** * nla_put_le64 - Add a __le64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value, int padattr) { __le64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(__le64), &tmp, padattr); } /** * nla_put_s8 - Add a s8 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value) { s8 tmp = value; return nla_put(skb, attrtype, sizeof(s8), &tmp); } /** * nla_put_s16 - Add a s16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value) { s16 tmp = value; return nla_put(skb, attrtype, sizeof(s16), &tmp); } /** * nla_put_s32 - Add a s32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value) { s32 tmp = value; return nla_put(skb, attrtype, sizeof(s32), &tmp); } /** * nla_put_s64 - Add a s64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value, int padattr) { s64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(s64), &tmp, padattr); } /** * nla_put_sint - Add a variable-size signed int to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_sint(struct sk_buff *skb, int attrtype, s64 value) { s64 tmp64 = value; s32 tmp32 = value; if (tmp64 == tmp32) return nla_put_s32(skb, attrtype, tmp32); return nla_put(skb, attrtype, sizeof(s64), &tmp64); } /** * nla_put_string - Add a string netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @str: NUL terminated string */ static inline int nla_put_string(struct sk_buff *skb, int attrtype, const char *str) { return nla_put(skb, attrtype, strlen(str) + 1, str); } /** * nla_put_flag - Add a flag netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type */ static inline int nla_put_flag(struct sk_buff *skb, int attrtype) { return nla_put(skb, attrtype, 0, NULL); } /** * nla_put_msecs - Add a msecs netlink attribute to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @njiffies: number of jiffies to convert to msecs * @padattr: attribute type for the padding */ static inline int nla_put_msecs(struct sk_buff *skb, int attrtype, unsigned long njiffies, int padattr) { u64 tmp = jiffies_to_msecs(njiffies); return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** * nla_put_in_addr - Add an IPv4 address netlink attribute to a socket * buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @addr: IPv4 address */ static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype, __be32 addr) { __be32 tmp = addr; return nla_put_be32(skb, attrtype, tmp); } /** * nla_put_in6_addr - Add an IPv6 address netlink attribute to a socket * buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @addr: IPv6 address */ static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype, const struct in6_addr *addr) { return nla_put(skb, attrtype, sizeof(*addr), addr); } /** * nla_put_bitfield32 - Add a bitfield32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: value carrying bits * @selector: selector of valid bits */ static inline int nla_put_bitfield32(struct sk_buff *skb, int attrtype, __u32 value, __u32 selector) { struct nla_bitfield32 tmp = { value, selector, }; return nla_put(skb, attrtype, sizeof(tmp), &tmp); } /** * nla_get_u32 - return payload of u32 attribute * @nla: u32 netlink attribute */ static inline u32 nla_get_u32(const struct nlattr *nla) { return *(u32 *) nla_data(nla); } /** * nla_get_u32_default - return payload of u32 attribute or default * @nla: u32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u32 nla_get_u32_default(const struct nlattr *nla, u32 defvalue) { if (!nla) return defvalue; return nla_get_u32(nla); } /** * nla_get_be32 - return payload of __be32 attribute * @nla: __be32 netlink attribute */ static inline __be32 nla_get_be32(const struct nlattr *nla) { return *(__be32 *) nla_data(nla); } /** * nla_get_be32_default - return payload of be32 attribute or default * @nla: __be32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be32 nla_get_be32_default(const struct nlattr *nla, __be32 defvalue) { if (!nla) return defvalue; return nla_get_be32(nla); } /** * nla_get_le32 - return payload of __le32 attribute * @nla: __le32 netlink attribute */ static inline __le32 nla_get_le32(const struct nlattr *nla) { return *(__le32 *) nla_data(nla); } /** * nla_get_le32_default - return payload of le32 attribute or default * @nla: __le32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __le32 nla_get_le32_default(const struct nlattr *nla, __le32 defvalue) { if (!nla) return defvalue; return nla_get_le32(nla); } /** * nla_get_u16 - return payload of u16 attribute * @nla: u16 netlink attribute */ static inline u16 nla_get_u16(const struct nlattr *nla) { return *(u16 *) nla_data(nla); } /** * nla_get_u16_default - return payload of u16 attribute or default * @nla: u16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u16 nla_get_u16_default(const struct nlattr *nla, u16 defvalue) { if (!nla) return defvalue; return nla_get_u16(nla); } /** * nla_get_be16 - return payload of __be16 attribute * @nla: __be16 netlink attribute */ static inline __be16 nla_get_be16(const struct nlattr *nla) { return *(__be16 *) nla_data(nla); } /** * nla_get_be16_default - return payload of be16 attribute or default * @nla: __be16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be16 nla_get_be16_default(const struct nlattr *nla, __be16 defvalue) { if (!nla) return defvalue; return nla_get_be16(nla); } /** * nla_get_le16 - return payload of __le16 attribute * @nla: __le16 netlink attribute */ static inline __le16 nla_get_le16(const struct nlattr *nla) { return *(__le16 *) nla_data(nla); } /** * nla_get_le16_default - return payload of le16 attribute or default * @nla: __le16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __le16 nla_get_le16_default(const struct nlattr *nla, __le16 defvalue) { if (!nla) return defvalue; return nla_get_le16(nla); } /** * nla_get_u8 - return payload of u8 attribute * @nla: u8 netlink attribute */ static inline u8 nla_get_u8(const struct nlattr *nla) { return *(u8 *) nla_data(nla); } /** * nla_get_u8_default - return payload of u8 attribute or default * @nla: u8 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u8 nla_get_u8_default(const struct nlattr *nla, u8 defvalue) { if (!nla) return defvalue; return nla_get_u8(nla); } /** * nla_get_u64 - return payload of u64 attribute * @nla: u64 netlink attribute */ static inline u64 nla_get_u64(const struct nlattr *nla) { u64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_u64_default - return payload of u64 attribute or default * @nla: u64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u64 nla_get_u64_default(const struct nlattr *nla, u64 defvalue) { if (!nla) return defvalue; return nla_get_u64(nla); } /** * nla_get_uint - return payload of uint attribute * @nla: uint netlink attribute */ static inline u64 nla_get_uint(const struct nlattr *nla) { if (nla_len(nla) == sizeof(u32)) return nla_get_u32(nla); return nla_get_u64(nla); } /** * nla_get_uint_default - return payload of uint attribute or default * @nla: uint netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u64 nla_get_uint_default(const struct nlattr *nla, u64 defvalue) { if (!nla) return defvalue; return nla_get_uint(nla); } /** * nla_get_be64 - return payload of __be64 attribute * @nla: __be64 netlink attribute */ static inline __be64 nla_get_be64(const struct nlattr *nla) { __be64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_be64_default - return payload of be64 attribute or default * @nla: __be64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be64 nla_get_be64_default(const struct nlattr *nla, __be64 defvalue) { if (!nla) return defvalue; return nla_get_be64(nla); } /** * nla_get_le64 - return payload of __le64 attribute * @nla: __le64 netlink attribute */ static inline __le64 nla_get_le64(const struct nlattr *nla) { return *(__le64 *) nla_data(nla); } /** * nla_get_le64_default - return payload of le64 attribute or default * @nla: __le64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __le64 nla_get_le64_default(const struct nlattr *nla, __le64 defvalue) { if (!nla) return defvalue; return nla_get_le64(nla); } /** * nla_get_s32 - return payload of s32 attribute * @nla: s32 netlink attribute */ static inline s32 nla_get_s32(const struct nlattr *nla) { return *(s32 *) nla_data(nla); } /** * nla_get_s32_default - return payload of s32 attribute or default * @nla: s32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s32 nla_get_s32_default(const struct nlattr *nla, s32 defvalue) { if (!nla) return defvalue; return nla_get_s32(nla); } /** * nla_get_s16 - return payload of s16 attribute * @nla: s16 netlink attribute */ static inline s16 nla_get_s16(const struct nlattr *nla) { return *(s16 *) nla_data(nla); } /** * nla_get_s16_default - return payload of s16 attribute or default * @nla: s16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s16 nla_get_s16_default(const struct nlattr *nla, s16 defvalue) { if (!nla) return defvalue; return nla_get_s16(nla); } /** * nla_get_s8 - return payload of s8 attribute * @nla: s8 netlink attribute */ static inline s8 nla_get_s8(const struct nlattr *nla) { return *(s8 *) nla_data(nla); } /** * nla_get_s8_default - return payload of s8 attribute or default * @nla: s8 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s8 nla_get_s8_default(const struct nlattr *nla, s8 defvalue) { if (!nla) return defvalue; return nla_get_s8(nla); } /** * nla_get_s64 - return payload of s64 attribute * @nla: s64 netlink attribute */ static inline s64 nla_get_s64(const struct nlattr *nla) { s64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_s64_default - return payload of s64 attribute or default * @nla: s64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s64 nla_get_s64_default(const struct nlattr *nla, s64 defvalue) { if (!nla) return defvalue; return nla_get_s64(nla); } /** * nla_get_sint - return payload of uint attribute * @nla: uint netlink attribute */ static inline s64 nla_get_sint(const struct nlattr *nla) { if (nla_len(nla) == sizeof(s32)) return nla_get_s32(nla); return nla_get_s64(nla); } /** * nla_get_sint_default - return payload of sint attribute or default * @nla: sint netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s64 nla_get_sint_default(const struct nlattr *nla, s64 defvalue) { if (!nla) return defvalue; return nla_get_sint(nla); } /** * nla_get_flag - return payload of flag attribute * @nla: flag netlink attribute */ static inline int nla_get_flag(const struct nlattr *nla) { return !!nla; } /** * nla_get_msecs - return payload of msecs attribute * @nla: msecs netlink attribute * * Returns: the number of milliseconds in jiffies. */ static inline unsigned long nla_get_msecs(const struct nlattr *nla) { u64 msecs = nla_get_u64(nla); return msecs_to_jiffies((unsigned long) msecs); } /** * nla_get_msecs_default - return payload of msecs attribute or default * @nla: msecs netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline unsigned long nla_get_msecs_default(const struct nlattr *nla, unsigned long defvalue) { if (!nla) return defvalue; return nla_get_msecs(nla); } /** * nla_get_in_addr - return payload of IPv4 address attribute * @nla: IPv4 address netlink attribute */ static inline __be32 nla_get_in_addr(const struct nlattr *nla) { return *(__be32 *) nla_data(nla); } /** * nla_get_in_addr_default - return payload of be32 attribute or default * @nla: IPv4 address netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be32 nla_get_in_addr_default(const struct nlattr *nla, __be32 defvalue) { if (!nla) return defvalue; return nla_get_in_addr(nla); } /** * nla_get_in6_addr - return payload of IPv6 address attribute * @nla: IPv6 address netlink attribute */ static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla) { struct in6_addr tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_bitfield32 - return payload of 32 bitfield attribute * @nla: nla_bitfield32 attribute */ static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla) { struct nla_bitfield32 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_memdup - duplicate attribute memory (kmemdup) * @src: netlink attribute to duplicate from * @gfp: GFP mask */ static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp) { return kmemdup_noprof(nla_data(src), nla_len(src), gfp); } #define nla_memdup(...) alloc_hooks(nla_memdup_noprof(__VA_ARGS__)) /** * nla_nest_start_noflag - Start a new level of nested attributes * @skb: socket buffer to add attributes to * @attrtype: attribute type of container * * This function exists for backward compatibility to use in APIs which never * marked their nest attributes with NLA_F_NESTED flag. New APIs should use * nla_nest_start() which sets the flag. * * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, int attrtype) { struct nlattr *start = (struct nlattr *)skb_tail_pointer(skb); if (nla_put(skb, attrtype, 0, NULL) < 0) return NULL; return start; } /** * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED * @skb: socket buffer to add attributes to * @attrtype: attribute type of container * * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED * flag. This is the preferred function to use in new code. * * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) { return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED); } /** * nla_nest_end - Finalize nesting of attributes * @skb: socket buffer the attributes are stored in * @start: container attribute * * Corrects the container attribute header to include the all * appended attributes. * * Returns: the total data length of the skb. */ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start) { start->nla_len = skb_tail_pointer(skb) - (unsigned char *)start; return skb->len; } /** * nla_nest_cancel - Cancel nesting of attributes * @skb: socket buffer the message is stored in * @start: container attribute * * Removes the container attribute and including all nested * attributes. Returns -EMSGSIZE */ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) { nlmsg_trim(skb, start); } /** * __nla_validate_nested - Validate a stream of nested attributes * @start: container attribute * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the nested attribute stream against the * specified policy. Attributes with a type exceeding maxtype will be * ignored. See documentation of struct nla_policy for more details. * * Returns: 0 on success or a negative error code. */ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate(nla_data(start), nla_len(start), maxtype, policy, validate, extack); } static inline int nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate_nested(start, maxtype, policy, NL_VALIDATE_STRICT, extack); } static inline int nla_validate_nested_deprecated(const struct nlattr *start, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate_nested(start, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute * @skb: socket buffer the message is stored in * * Return: true if padding is needed to align the next attribute (nla_data()) to * a 64-bit aligned area. */ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) { #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS /* The nlattr header is 4 bytes in size, that's why we test * if the skb->data _is_ aligned. A NOP attribute, plus * nlattr header for next attribute, will make nla_data() * 8-byte aligned. */ if (IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8)) return true; #endif return false; } /** * nla_align_64bit - 64-bit align the nla_data() of next attribute * @skb: socket buffer the message is stored in * @padattr: attribute type for the padding * * Conditionally emit a padding netlink attribute in order to make * the next attribute we emit have a 64-bit aligned nla_data() area. * This will only be done in architectures which do not have * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined. * * Returns: zero on success or a negative error code. */ static inline int nla_align_64bit(struct sk_buff *skb, int padattr) { if (nla_need_padding_for_64bit(skb) && !nla_reserve(skb, padattr, 0)) return -EMSGSIZE; return 0; } /** * nla_total_size_64bit - total length of attribute including padding * @payload: length of payload */ static inline int nla_total_size_64bit(int payload) { return NLA_ALIGN(nla_attr_size(payload)) #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + NLA_ALIGN(nla_attr_size(0)) #endif ; } /** * nla_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @head: head of attribute stream * @len: length of attribute stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_attr(pos, head, len, rem) \ for (pos = head, rem = len; \ nla_ok(pos, rem); \ pos = nla_next(pos, &(rem))) /** * nla_for_each_attr_type - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @type: required attribute type for @pos * @head: head of attribute stream * @len: length of attribute stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_attr_type(pos, type, head, len, rem) \ nla_for_each_attr(pos, head, len, rem) \ if (nla_type(pos) == type) /** * nla_for_each_nested - iterate over nested attributes * @pos: loop counter, set to current attribute * @nla: attribute containing the nested attributes * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_nested(pos, nla, rem) \ nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem) /** * nla_for_each_nested_type - iterate over nested attributes * @pos: loop counter, set to current attribute * @type: required attribute type for @pos * @nla: attribute containing the nested attributes * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_nested_type(pos, type, nla, rem) \ nla_for_each_nested(pos, nla, rem) \ if (nla_type(pos) == type) /** * nla_is_last - Test if attribute is last in stream * @nla: attribute to test * @rem: bytes remaining in stream */ static inline bool nla_is_last(const struct nlattr *nla, int rem) { return nla->nla_len == rem; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range); void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range); struct netlink_policy_dump_state; int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate, const struct nla_policy *policy, unsigned int maxtype); int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state, const struct nla_policy *policy, unsigned int maxtype); bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state); int netlink_policy_dump_write(struct sk_buff *skb, struct netlink_policy_dump_state *state); int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt); int netlink_policy_dump_write_attr(struct sk_buff *skb, const struct nla_policy *pt, int nestattr); void netlink_policy_dump_free(struct netlink_policy_dump_state *state); #endif |
| 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 | // SPDX-License-Identifier: GPL-2.0-only /* * net/core/fib_rules.c Generic Routing Rules * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/inet_dscp.h> #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> #include <linux/indirect_call_wrapper.h> #if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES) #ifdef CONFIG_IP_MULTIPLE_TABLES #define INDIRECT_CALL_MT(f, f2, f1, ...) \ INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) #endif #elif defined(CONFIG_IP_MULTIPLE_TABLES) #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__) #endif static const struct fib_kuid_range fib_kuid_range_unset = { KUIDT_INIT(0), KUIDT_INIT(~0), }; bool fib_rule_matchall(const struct fib_rule *rule) { if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) return false; if (fib_rule_port_range_set(&rule->sport_range)) return false; if (fib_rule_port_range_set(&rule->dport_range)) return false; return true; } EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table) { struct fib_rule *r; r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; refcount_set(&r->refcnt, 1); r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; /* The lock is not required here, the list in unreachable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); return 0; } EXPORT_SYMBOL(fib_default_rule_add); static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; if (!list_empty(&ops->rules_list)) { pos = ops->rules_list.next; if (pos->next != &ops->rules_list) { rule = list_entry(pos->next, struct fib_rule, list); if (rule->pref) return rule->pref - 1; } } return 0; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); static struct fib_rules_ops *lookup_rules_ops(const struct net *net, int family) { struct fib_rules_ops *ops; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (ops->family == family) { if (!try_module_get(ops->owner)) ops = NULL; rcu_read_unlock(); return ops; } } rcu_read_unlock(); return NULL; } static void rules_ops_put(struct fib_rules_ops *ops) { if (ops) module_put(ops->owner); } static void flush_route_cache(struct fib_rules_ops *ops) { if (ops->flush_cache) ops->flush_cache(ops); } static int __fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; struct fib_rules_ops *o; struct net *net; net = ops->fro_net; if (ops->rule_size < sizeof(struct fib_rule)) return -EINVAL; if (ops->match == NULL || ops->configure == NULL || ops->compare == NULL || ops->fill == NULL || ops->action == NULL) return -EINVAL; spin_lock(&net->rules_mod_lock); list_for_each_entry(o, &net->rules_ops, list) if (ops->family == o->family) goto errout; list_add_tail_rcu(&ops->list, &net->rules_ops); err = 0; errout: spin_unlock(&net->rules_mod_lock); return err; } struct fib_rules_ops * fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) { struct fib_rules_ops *ops; int err; ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&ops->rules_list); ops->fro_net = net; err = __fib_rules_register(ops); if (err) { kfree(ops); ops = ERR_PTR(err); } return ops; } EXPORT_SYMBOL_GPL(fib_rules_register); static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); if (ops->delete) ops->delete(rule); fib_rule_put(rule); } } void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; spin_lock(&net->rules_mod_lock); list_del_rcu(&ops->list); spin_unlock(&net->rules_mod_lock); fib_rules_cleanup_ops(ops); kfree_rcu(ops, rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); static int uid_range_set(struct fib_kuid_range *range) { return uid_valid(range->start) && uid_valid(range->end); } static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) { struct fib_rule_uid_range *in; struct fib_kuid_range out; in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); out.start = make_kuid(current_user_ns(), in->start); out.end = make_kuid(current_user_ns(), in->end); return out; } static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) { struct fib_rule_uid_range out = { from_kuid_munged(current_user_ns(), range->start), from_kuid_munged(current_user_ns(), range->end) }; return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); } static int nla_get_port_range(struct nlattr *pattr, struct fib_rule_port_range *port_range) { const struct fib_rule_port_range *pr = nla_data(pattr); if (!fib_rule_port_range_valid(pr)) return -EINVAL; port_range->start = pr->start; port_range->end = pr->end; return 0; } static int nla_put_port_range(struct sk_buff *skb, int attrtype, struct fib_rule_port_range *range) { return nla_put(skb, attrtype, sizeof(*range), range); } static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { int ret = 0; if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) goto out; if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) goto out; if (uid_lt(fl->flowi_uid, rule->uid_range.start) || uid_gt(fl->flowi_uid, rule->uid_range.end)) goto out; ret = INDIRECT_CALL_MT(ops->match, fib6_rule_match, fib4_rule_match, rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; } int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { struct fib_rule *rule; int err; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: if (!fib_rule_match(rule, ops, fl, flags, arg)) continue; if (rule->action == FR_ACT_GOTO) { struct fib_rule *target; target = rcu_dereference(rule->ctarget); if (target == NULL) { continue; } else { rule = target; goto jumped; } } else if (rule->action == FR_ACT_NOP) continue; else err = INDIRECT_CALL_MT(ops->action, fib6_rule_action, fib4_rule_action, rule, fl, flags, arg); if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, fib6_rule_suppress, fib4_rule_suppress, rule, flags, arg)) continue; if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(refcount_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } break; } } err = -ESRCH; out: rcu_read_unlock(); return err; } EXPORT_SYMBOL_GPL(fib_rules_lookup); static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_rule *rule, int family, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, .info.extack = extack, .rule = rule, }; return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, enum fib_event_type event_type, struct fib_rule *rule, struct fib_rules_ops *ops, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = ops->family, .info.extack = extack, .rule = rule, }; ASSERT_RTNL(); /* Paired with READ_ONCE() in fib_rules_seq() */ WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1); return call_fib_notifiers(net, event_type, &info.info); } /* Called with rcu_read_lock() */ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; list_for_each_entry_rcu(rule, &ops->rules_list, list) { err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family, extack); if (err) break; } rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); unsigned int fib_rules_seq_read(const struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; ops = lookup_rules_ops(net, family); if (!ops) return 0; /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */ fib_rules_seq = READ_ONCE(ops->fib_rules_seq); rules_ops_put(ops); return fib_rules_seq; } EXPORT_SYMBOL_GPL(fib_rules_seq_read); static struct fib_rule *rule_find(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule, bool user_priority) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (rule->action && r->action != rule->action) continue; if (rule->table && r->table != rule->table) continue; if (user_priority && r->pref != rule->pref) continue; if (rule->iifname[0] && memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (rule->oifname[0] && memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (rule->mark && r->mark != rule->mark) continue; if (rule->suppress_ifgroup != -1 && r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (rule->suppress_prefixlen != -1 && r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; if (rule->tun_id && r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; if (uid_range_set(&rule->uid_range) && (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end))) continue; if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; if (rule->proto && r->proto != rule->proto) continue; if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; return r; } return NULL; } #ifdef CONFIG_NET_L3_MASTER_DEV static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { nlrule->l3mdev = nla_get_u8(nla); if (nlrule->l3mdev != 1) { NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); return -1; } return 0; } #else static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); return -1; } #endif static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, struct nlattr *tb[], struct fib_rule **rule, bool *user_priority) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rule *nlrule = NULL; int err = -EINVAL; if (frh->src_len) if (!tb[FRA_SRC] || frh->src_len > (ops->addr_size * 8) || nla_len(tb[FRA_SRC]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid source address"); goto errout; } if (frh->dst_len) if (!tb[FRA_DST] || frh->dst_len > (ops->addr_size * 8) || nla_len(tb[FRA_DST]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid dst address"); goto errout; } nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (!nlrule) { err = -ENOMEM; goto errout; } refcount_set(&nlrule->refcnt, 1); nlrule->fr_net = net; if (tb[FRA_PRIORITY]) { nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); *user_priority = true; } else { nlrule->pref = fib_default_rule_pref(ops); } nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC); if (tb[FRA_IIFNAME]) { struct net_device *dev; nlrule->iifindex = -1; nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->iifname); if (dev) nlrule->iifindex = dev->ifindex; } if (tb[FRA_OIFNAME]) { struct net_device *dev; nlrule->oifindex = -1; nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->oifname); if (dev) nlrule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); if (nlrule->mark) /* compatibility: if the mark value is non-zero all bits * are compared unless a mask is explicitly specified. */ nlrule->mark_mask = 0xFFFFFFFF; } if (tb[FRA_FWMASK]) nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); if (tb[FRA_TUN_ID]) nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); if (tb[FRA_L3MDEV] && fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) goto errout_free; nlrule->action = frh->action; nlrule->flags = frh->flags; nlrule->table = frh_get_table(frh, tb); if (tb[FRA_SUPPRESS_PREFIXLEN]) nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); else nlrule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); else nlrule->suppress_ifgroup = -1; if (tb[FRA_GOTO]) { if (nlrule->action != FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; } nlrule->target = nla_get_u32(tb[FRA_GOTO]); /* Backward jumps are prohibited to avoid endless loops */ if (nlrule->target <= nlrule->pref) { NL_SET_ERR_MSG(extack, "Backward goto not supported"); goto errout_free; } } else if (nlrule->action == FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; } if (nlrule->l3mdev && nlrule->table) { NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); goto errout_free; } if (tb[FRA_UID_RANGE]) { if (current_user_ns() != net->user_ns) { err = -EPERM; NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; } nlrule->uid_range = nla_get_kuid_range(tb); if (!uid_range_set(&nlrule->uid_range) || !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { NL_SET_ERR_MSG(extack, "Invalid uid range"); goto errout_free; } } else { nlrule->uid_range = fib_kuid_range_unset; } if (tb[FRA_IP_PROTO]) nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); if (tb[FRA_SPORT_RANGE]) { err = nla_get_port_range(tb[FRA_SPORT_RANGE], &nlrule->sport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } } if (tb[FRA_DPORT_RANGE]) { err = nla_get_port_range(tb[FRA_DPORT_RANGE], &nlrule->dport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } } *rule = nlrule; return 0; errout_free: kfree(nlrule); errout: return err; } static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (r->action != rule->action) continue; if (r->table != rule->table) continue; if (r->pref != rule->pref) continue; if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (r->mark != rule->mark) continue; if (r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (r->mark_mask != rule->mark_mask) continue; if (r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; if (r->l3mdev != rule->l3mdev) continue; if (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end)) continue; if (r->ip_proto != rule->ip_proto) continue; if (r->proto != rule->proto) continue; if (!fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; return 1; } return 0; } static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_PRIORITY] = { .type = NLA_U32 }, [FRA_FWMARK] = { .type = NLA_U32 }, [FRA_FLOW] = { .type = NLA_U32 }, [FRA_TUN_ID] = { .type = NLA_U64 }, [FRA_FWMASK] = { .type = NLA_U32 }, [FRA_TABLE] = { .type = NLA_U32 }, [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, [FRA_GOTO] = { .type = NLA_U32 }, [FRA_L3MDEV] = { .type = NLA_U8 }, [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, [FRA_PROTOCOL] = { .type = NLA_U8 }, [FRA_IP_PROTO] = { .type = NLA_U8 }, [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), [FRA_FLOWLABEL] = { .type = NLA_BE32 }, [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 }, }; int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *last = NULL; struct nlattr *tb[FRA_MAX + 1]; int err = -EINVAL, unresolved = 0; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (!ops) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); if (err) goto errout; if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; goto errout_free; } err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); if (err < 0) goto errout_free; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { RCU_INIT_POINTER(rule->ctarget, r); break; } } if (rcu_dereference_protected(rule->ctarget, 1) == NULL) unresolved = 1; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; last = r; } if (last) list_add_rcu(&rule->list, &last->list); else list_add_rcu(&rule->list, &ops->rules_list); if (ops->unresolved_rules) { /* * There are unresolved goto rules in the list, check if * any of them are pointing to this new rule. */ list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref && rtnl_dereference(r->ctarget) == NULL) { rcu_assign_pointer(r->ctarget, rule); if (--ops->unresolved_rules == 0) break; } } } if (rule->action == FR_ACT_GOTO) ops->nr_goto_rules++; if (unresolved) ops->unresolved_rules++; if (rule->tun_id) ip_tunnel_need_metadata(); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: kfree(rule); errout: rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_nl_newrule); int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; int err = -EINVAL; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) goto errout; rule = rule_find(ops, frh, tb, nlrule, user_priority); if (!rule) { err = -ENOENT; goto errout; } if (rule->flags & FIB_RULE_PERMANENT) { err = -EPERM; goto errout; } if (ops->delete) { err = ops->delete(rule); if (err) goto errout; } if (rule->tun_id) ip_tunnel_unneed_metadata(); list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { ops->nr_goto_rules--; if (rtnl_dereference(rule->ctarget) == NULL) ops->unresolved_rules--; } /* * Check if this rule is a target to any of them. If so, * adjust to the next one with the same preference or * disable them. As this operation is eventually very * expensive, it is only performed if goto rules, except * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { struct fib_rule *n; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) n = NULL; list_for_each_entry(r, &ops->rules_list, list) { if (rtnl_dereference(r->ctarget) != rule) continue; rcu_assign_pointer(r->ctarget, n); if (!n) ops->unresolved_rules++; } } call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); kfree(nlrule); return 0; errout: kfree(nlrule); rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_nl_delrule); static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ + nla_total_size(sizeof(struct fib_kuid_range)) + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); return payload; } static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, u32 pid, u32 seq, int type, int flags, struct fib_rules_ops *ops) { struct nlmsghdr *nlh; struct fib_rule_hdr *frh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags); if (nlh == NULL) return -EMSGSIZE; frh = nlmsg_data(nlh); frh->family = ops->family; frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) goto nla_put_failure; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; if (rule->iifindex == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; if (rule->oifindex == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } if ((rule->pref && nla_put_u32(skb, FRA_PRIORITY, rule->pref)) || (rule->mark && nla_put_u32(skb, FRA_FWMARK, rule->mark)) || ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || (uid_range_set(&rule->uid_range) && nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) goto nla_put_failure; } if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, struct fib_rules_ops *ops) { int idx = 0; struct fib_rule *rule; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWRULE, NLM_F_MULTI, ops); if (err) break; skip: idx++; } rcu_read_unlock(); cb->args[1] = idx; rules_ops_put(ops); return err; } static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib_rule_hdr *frh; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } frh = nlmsg_data(nlh); if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for fib rule dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); return -EINVAL; } return 0; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; int err, idx = 0, family; if (cb->strict_check) { err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; } family = rtnl_msg_family(nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ ops = lookup_rules_ops(net, family); if (ops == NULL) return -EAFNOSUPPORT; return dump_rules(skb, cb, ops); } err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; err = dump_rules(skb, cb, ops); if (err < 0) break; cb->args[1] = 0; skip: idx++; } rcu_read_unlock(); cb->args[0] = idx; return err; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid) { struct net *net; struct sk_buff *skb; int err = -ENOMEM; net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); if (skb == NULL) goto errout; err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops); if (err < 0) { /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) rule->iifindex = dev->ifindex; if (rule->oifindex == -1 && strcmp(dev->name, rule->oifname) == 0) rule->oifindex = dev->ifindex; } } static void detach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) rule->iifindex = -1; if (rule->oifindex == dev->ifindex) rule->oifindex = -1; } } static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct fib_rules_ops *ops; ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_CHANGENAME: list_for_each_entry(ops, &net->rules_ops, list) { detach_rules(&ops->rules_list, dev); attach_rules(&ops->rules_list, dev); } break; case NETDEV_UNREGISTER: list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; } return NOTIFY_DONE; } static struct notifier_block fib_rules_notifier = { .notifier_call = fib_rules_event, }; static int __net_init fib_rules_net_init(struct net *net) { INIT_LIST_HEAD(&net->rules_ops); spin_lock_init(&net->rules_mod_lock); return 0; } static void __net_exit fib_rules_net_exit(struct net *net) { WARN_ON_ONCE(!list_empty(&net->rules_ops)); } static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, .exit = fib_rules_net_exit, }; static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule}, {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule}, {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; static int __init fib_rules_init(void) { int err; rtnl_register_many(fib_rules_rtnl_msg_handlers); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) goto fail; err = register_netdevice_notifier(&fib_rules_notifier); if (err < 0) goto fail_unregister; return 0; fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: rtnl_unregister_many(fib_rules_rtnl_msg_handlers); return err; } subsys_initcall(fib_rules_init); |
| 60 346 36 177 1 62 208 208 207 208 513 161 245 190 22 104 29 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal.h: mm/ internal definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H #include <linux/fs.h> #include <linux/khugepaged.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/pagewalk.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> #include <linux/tracepoint-defs.h> /* Internal core VMA manipulation functions. */ #include "vma.h" struct folio_batch; /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints * about IO, FS and watermark checking while ignoring placement * hints such as HIGHMEM usage. */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) /* Control allocation cpuset and node placement constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) /* * Different from WARN_ON_ONCE(), no warning will be issued * when we specify __GFP_NOWARN. */ #define WARN_ON_ONCE_GFP(cond, gfp) ({ \ static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \ __warned = true; \ WARN_ON(1); \ } \ unlikely(__ret_warn_once); \ }) void page_writeback_init(void); /* * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. */ #define ENTIRELY_MAPPED 0x800000 #define FOLIO_PAGES_MAPPED (ENTIRELY_MAPPED - 1) /* * Flags passed to __show_mem() and show_free_areas() to suppress output in * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. * * Don't use this function outside of debugging code. */ static inline int folio_nr_pages_mapped(const struct folio *folio) { return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } /* * Retrieve the first entry of a folio based on a provided entry within the * folio. We cannot rely on folio->swap as there is no guarantee that it has * been initialized. Used for calling arch_swap_restore() */ static inline swp_entry_t folio_swap(swp_entry_t entry, const struct folio *folio) { swp_entry_t swap = { .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)), }; return swap; } static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; return (void *)(mapping & ~PAGE_MAPPING_FLAGS); } /* * This is a file-backed mapping, and is about to be memory mapped - invoke its * mmap hook and safely handle error conditions. On error, VMA hooks will be * mutated. * * @file: File which backs the mapping. * @vma: VMA which we are mapping. * * Returns: 0 if success, error otherwise. */ static inline int mmap_file(struct file *file, struct vm_area_struct *vma) { int err = call_mmap(file, vma); if (likely(!err)) return 0; /* * OK, we tried to call the file hook for mmap(), but an error * arose. The mapping is in an inconsistent state and we most not invoke * any further hooks on it. */ vma->vm_ops = &vma_dummy_vm_ops; return err; } /* * If the VMA has a close hook then close it, and since closing it might leave * it in an inconsistent state which makes the use of any hooks suspect, clear * them down by installing dummy empty hooks. */ static inline void vma_close(struct vm_area_struct *vma) { if (vma->vm_ops && vma->vm_ops->close) { vma->vm_ops->close(vma); /* * The mapping is in an inconsistent state, and no further hooks * may be invoked upon it. */ vma->vm_ops = &vma_dummy_vm_ops; } } #ifdef CONFIG_MMU /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; /* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ #define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) /* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ #define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) { if (flags & FPB_IGNORE_DIRTY) pte = pte_mkclean(pte); if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) pte = pte_clear_soft_dirty(pte); return pte_wrprotect(pte_mkold(pte)); } /** * folio_pte_batch - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. * @addr: The user virtual address the first page is mapped at. * @start_ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @max_nr: The maximum number of table entries to consider. * @flags: Flags to modify the PTE batch semantics. * @any_writable: Optional pointer to indicate whether any entry except the * first one is writable. * @any_young: Optional pointer to indicate whether any entry except the * first one is young. * @any_dirty: Optional pointer to indicate whether any entry except the * first one is dirty. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same large folio. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). * * start_ptep must map any page of the folio. max_nr must be at least one and * must be limited by the caller so scanning cannot exceed a single page table. * * Return: the number of table entries in the batch. */ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, bool *any_writable, bool *any_young, bool *any_dirty) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte, *ptep; bool writable, young, dirty; int nr; if (any_writable) *any_writable = false; if (any_young) *any_young = false; if (any_dirty) *any_dirty = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); nr = pte_batch_hint(start_ptep, pte); expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); ptep = start_ptep + nr; while (ptep < end_ptep) { pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); if (any_young) young = !!pte_young(pte); if (any_dirty) dirty = !!pte_dirty(pte); pte = __pte_batch_clear_ignored(pte, flags); if (!pte_same(pte, expected_pte)) break; /* * Stop immediately once we reached the end of the folio. In * corner cases the next PFN might fall into a different * folio. */ if (pte_pfn(pte) >= folio_end_pfn) break; if (any_writable) *any_writable |= writable; if (any_young) *any_young |= young; if (any_dirty) *any_dirty |= dirty; nr = pte_batch_hint(ptep, pte); expected_pte = pte_advance_pfn(expected_pte, nr); ptep += nr; } return min(ptep - start_ptep, max_nr); } /** * pte_move_swp_offset - Move the swap entry offset field of a swap pte * forward or backward by delta * @pte: The initial pte state; is_swap_pte(pte) must be true and * non_swap_entry() must be false. * @delta: The direction and the offset we are moving; forward if delta * is positive; backward if delta is negative * * Moves the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. */ static inline pte_t pte_move_swp_offset(pte_t pte, long delta) { swp_entry_t entry = pte_to_swp_entry(pte); pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), (swp_offset(entry) + delta))); if (pte_swp_soft_dirty(pte)) new = pte_swp_mksoft_dirty(new); if (pte_swp_exclusive(pte)) new = pte_swp_mkexclusive(new); if (pte_swp_uffd_wp(pte)) new = pte_swp_mkuffd_wp(new); return new; } /** * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. * @pte: The initial pte state; is_swap_pte(pte) must be true and * non_swap_entry() must be false. * * Increments the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. */ static inline pte_t pte_next_swp_offset(pte_t pte) { return pte_move_swp_offset(pte, 1); } /** * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries * @start_ptep: Page table pointer for the first entry. * @max_nr: The maximum number of table entries to consider. * @pte: Page table entry for the first entry. * * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs * containing swap entries all with consecutive offsets and targeting the same * swap type, all with matching swp pte bits. * * max_nr must be at least one and must be limited by the caller so scanning * cannot exceed a single page table. * * Return: the number of table entries in the batch. */ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; swp_entry_t entry = pte_to_swp_entry(pte); pte_t *ptep = start_ptep + 1; unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); VM_WARN_ON(!is_swap_pte(pte)); VM_WARN_ON(non_swap_entry(entry)); cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id) break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; } return ptep - start_ptep; } #endif /* CONFIG_MMU */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled); static inline void acct_reclaim_writeback(struct folio *folio) { pg_data_t *pgdat = folio_pgdat(folio); int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled); if (nr_throttled) __acct_reclaim_writeback(pgdat, folio, nr_throttled); } static inline void wake_throttle_isolated(pg_data_t *pgdat) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED]; if (waitqueue_active(wqh)) wake_up(wqh); } vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf); static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { vm_fault_t ret = __vmf_anon_prepare(vmf); if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vmf->vma); return ret; } vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, unsigned int order); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); force_page_cache_ra(&ractl, nr_to_read); } unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); long mapping_evict_folio(struct address_space *mapping, struct folio *folio); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed); /** * folio_evictable - Test whether a folio is evictable. * @folio: The folio to test. * * Test whether @folio is evictable -- i.e., should be placed on * active/inactive lists vs unevictable list. * * Reasons folio might not be evictable: * 1. folio's mapping marked unevictable * 2. One of the pages in the folio is part of an mlocked VMA */ static inline bool folio_evictable(struct folio *folio) { bool ret; /* Prevent address_space of inode and swap cache from being freed */ rcu_read_lock(); ret = !mapping_unevictable(folio_mapping(folio)) && !folio_test_mlocked(folio); rcu_read_unlock(); return ret; } /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(page_ref_count(page), page); set_page_count(page, 1); } /* * Return true if a folio needs ->release_folio() calling upon it. */ static inline bool folio_needs_release(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); return folio_has_private(folio) || (mapping && mapping_release_always(mapping)); } extern unsigned long highest_memmap_pfn; /* * Maximum number of reclaim retries without progress before the OOM * killer is consider the only way forward. */ #define MAX_RECLAIM_RETRIES 16 /* * in mm/vmscan.c: */ bool folio_isolate_lru(struct folio *folio); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); /* * in mm/rmap.c: */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c */ #define K(x) ((x) << (PAGE_SHIFT-10)) extern char * const zone_names[MAX_NR_ZONES]; /* perform sanity checks on struct pages being allocated or freed */ DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); extern int min_free_kbytes; void setup_per_zone_wmarks(void); void calculate_min_free_kbytes(void); int __meminit init_per_zone_wmark_min(void); void page_alloc_sysctl_init(void); /* * Structure for holding the mostly immutable allocation parameters passed * between functions involved in allocations, including the alloc_pages* * family of functions. * * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages() and then never change. * * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; /* * highest_zoneidx represents highest usable zone index of * the allocation request. Due to the nature of the zone, * memory on lower zone than the highest_zoneidx will be * protected by lowmem_reserve[highest_zoneidx]. * * highest_zoneidx is also used by reclaim/compaction to limit * the target zone since higher zone than this index cannot be * usable for this allocation request. */ enum zone_type highest_zoneidx; bool spread_dirty_pages; }; /* * This function returns the order of a free page in the buddy system. In * general, page_zone(page)->lock must be held by the caller to prevent the * page from being allocated in parallel and returning garbage as the order. * If a caller does not hold page_zone(page)->lock, it must guarantee that the * page cannot be allocated or merged in parallel. Alternatively, it must * handle invalid values gracefully, and use buddy_order_unsafe() below. */ static inline unsigned int buddy_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); } /* * Like buddy_order(), but for callers who cannot afford to hold the zone lock. * PageBuddy() should be checked first by the caller to minimize race window, * and invalid values must be handled gracefully. * * READ_ONCE is used so that if the caller assigns the result into a local * variable and e.g. tests it for valid range before using, the compiler cannot * decide to remove the variable and inline the page_private(page) multiple * times, potentially observing different values in the tests and the actual * use of the result. */ #define buddy_order_unsafe(page) READ_ONCE(page_private(page)) /* * This function checks whether a page is free && is the buddy * we can coalesce a page and its buddy if * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * * For recording whether a page is in the buddy system, we set PageBuddy. * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ static inline bool page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { if (!page_is_guard(buddy) && !PageBuddy(buddy)) return false; if (buddy_order(buddy) != order) return false; /* * zone check is done late to avoid uselessly calculating * zone/node ids for pages that could never merge. */ if (page_zone_id(page) != page_zone_id(buddy)) return false; VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); return true; } /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). * * 1) Any buddy B1 will have an order O twin B2 which satisfies * the following equation: * B2 = B1 ^ (1 << O) * For example, if the starting buddy (buddy2) is #8 its order * 1 buddy is #10: * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 * * 2) Any buddy B will have an order O+1 parent P which * satisfies the following equation: * P = B & ~(1 << O) * * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER */ static inline unsigned long __find_buddy_pfn(unsigned long page_pfn, unsigned int order) { return page_pfn ^ (1 << order); } /* * Find the buddy of @page and validate it. * @page: The input page * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the * function is used in the performance-critical __free_one_page(). * @order: The order of the page * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to * page_to_pfn(). * * The found buddy can be a non PageBuddy, out of @page's zone, or its order is * not the same as @page. The validation is necessary before use it. * * Return: the found buddy page or NULL if not found. */ static inline struct page *find_buddy_page_pfn(struct page *page, unsigned long pfn, unsigned int order, unsigned long *buddy_pfn) { unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order); struct page *buddy; buddy = page + (__buddy_pfn - pfn); if (buddy_pfn) *buddy_pfn = __buddy_pfn; if (page_is_buddy(page, buddy, order)) return buddy; return NULL; } extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone); static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone) { if (zone->contiguous) return pfn_to_page(start_pfn); return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } void set_zone_contiguous(struct zone *zone); static inline void clear_zone_contiguous(struct zone *zone) { zone->contiguous = false; } extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order, enum meminit_context context); /* * This will have no effect, other than possibly generating a warning, if the * caller passes in a non-large folio. */ static inline void folio_set_order(struct folio *folio, unsigned int order) { if (WARN_ON_ONCE(!order || !folio_test_large(folio))) return; folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; #ifdef CONFIG_64BIT folio->_folio_nr_pages = 1U << order; #endif } bool __folio_unqueue_deferred_split(struct folio *folio); static inline bool folio_unqueue_deferred_split(struct folio *folio) { if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) return false; /* * At this point, there is no one trying to add the folio to * deferred_list. If folio is not in deferred_list, it's safe * to check without acquiring the split_queue_lock. */ if (data_race(list_empty(&folio->_deferred_list))) return false; return __folio_unqueue_deferred_split(folio); } static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; if (folio && folio_test_large(folio)) folio_set_large_rmappable(folio); return folio; } static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; folio_set_order(folio, order); atomic_set(&folio->_large_mapcount, -1); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); if (order > 1) INIT_LIST_HEAD(&folio->_deferred_list); } static inline void prep_compound_tail(struct page *head, int tail_idx) { struct page *p = head + tail_idx; p->mapping = TAIL_MAPPING; set_compound_head(p, head); set_page_private(p, 0); } extern void prep_compound_page(struct page *page, unsigned int order); void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid, nodemask_t *); #define __alloc_frozen_pages(...) \ alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__)) void free_frozen_pages(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); #ifdef CONFIG_NUMA struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order); #else static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order) { return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL); } #endif #define alloc_frozen_pages(...) \ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); extern void zone_pcp_init(struct zone *zone); extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int); #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* * in mm/compaction.c */ /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts * at the end of a zone and migrate_pfn begins at the start. Movable pages * are moved to the end of a zone during a compaction run and the run * completes when free_pfn <= migrate_pfn */ struct compact_control { struct list_head freepages[NR_PAGE_ORDERS]; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ /* * Acts as an in/out parameter to page isolation for migration. * isolate_migratepages uses it as a search base. * isolate_migratepages_block will update the value to the next pfn * after the last isolated one. */ unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; unsigned long total_free_scanned; unsigned short fast_search_fail;/* failures to use free list searches */ short search_order; /* order to start a fast search at */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ bool finish_pageblock; /* Scan the remainder of a pageblock. Used * when there are potentially transient * isolation or migration failures to * ensure forward progress. */ bool alloc_contig; /* alloc_contig_range allocation */ }; /* * Used in direct compaction when a page should be taken from the freelists * immediately when one is created during the free path. */ struct capture_control { struct compact_control *cc; struct page *page; }; unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); #endif /* CONFIG_COMPACTION || CONFIG_CMA */ int find_suitable_fallback(struct free_area *area, unsigned int order, int migratetype, bool only_stealable, bool *can_steal); static inline bool free_area_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); } /* mm/util.c */ struct anon_vma *folio_anon_vma(const struct folio *folio); #ifdef CONFIG_MMU void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); /* * NOTE: This function can't tell whether the folio is "fully mapped" in the * range. * "fully mapped" means all the pages of folio is associated with the page * table of range while this function just check whether the folio range is * within the range [start, end). Function caller needs to do page table * check if it cares about the page table association. * * Typical usage (like mlock or madvise) is: * Caller knows at least 1 page of folio is associated with page table of VMA * and the range [start, end) is intersect with the VMA range. Caller wants * to know whether the folio is fully associated with the range. It calls * this function to check whether the folio is in the range first. Then checks * the page table to know whether the folio is fully mapped to the range. */ static inline bool folio_within_range(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end) { pgoff_t pgoff, addr; unsigned long vma_pglen = vma_pages(vma); VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio); if (start > end) return false; if (start < vma->vm_start) start = vma->vm_start; if (end > vma->vm_end) end = vma->vm_end; pgoff = folio_pgoff(folio); /* if folio start address is not in vma range */ if (!in_range(pgoff, vma->vm_pgoff, vma_pglen)) return false; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); return !(addr < start || end - addr < folio_size(folio)); } static inline bool folio_within_vma(struct folio *folio, struct vm_area_struct *vma) { return folio_within_range(folio, vma, vma->vm_start, vma->vm_end); } /* * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at * the end of folio_remove_rmap_*(); but new anon folios are managed by * folio_add_lru_vma() calling mlock_new_folio(). */ void mlock_folio(struct folio *folio); static inline void mlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * The VM_SPECIAL check here serves two purposes. * 1) VM_IO check prevents migration from double-counting during mlock. * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED * is never left set on a VM_SPECIAL vma, there is an interval while * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may * still be set while VM_SPECIAL bits are added: so ignore it then. */ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED)) mlock_folio(folio); } void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * munlock if the function is called. Ideally, we should only * do munlock if any page of folio is unmapped from VMA and * cause folio not fully mapped to VMA. * * But it's not easy to confirm that's the situation. So we * always munlock the folio and page reclaim will correct it * if it's wrong. */ if (unlikely(vma->vm_flags & VM_LOCKED)) munlock_folio(folio); } void mlock_new_folio(struct folio *folio); bool need_mlock_drain(int cpu); void mlock_drain_local(void); void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /** * vma_address - Find the virtual address a page range is mapped at * @vma: The vma which maps this object. * @pgoff: The page offset within its object. * @nr_pages: The number of pages to consider. * * If any page in this range is mapped by this VMA, return the first address * where any of these pages appear. Otherwise, return -EFAULT. */ static inline unsigned long vma_address(const struct vm_area_struct *vma, pgoff_t pgoff, unsigned long nr_pages) { unsigned long address; if (pgoff >= vma->vm_pgoff) { address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address >= vma->vm_end) address = -EFAULT; } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) { /* Test above avoids possibility of wrap to 0 on 32-bit */ address = vma->vm_start; } else { address = -EFAULT; } return address; } /* * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. */ static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw) { struct vm_area_struct *vma = pvmw->vma; pgoff_t pgoff; unsigned long address; /* Common case, plus ->pgoff is invalid for KSM */ if (pvmw->nr_pages == 1) return pvmw->address + PAGE_SIZE; pgoff = pvmw->pgoff + pvmw->nr_pages; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address > vma->vm_end) address = vma->vm_end; return address; } static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, struct file *fpin) { int flags = vmf->flags; if (fpin) return fpin; /* * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or * anything, so we only pin the file and drop the mmap_lock if only * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt. */ if (fault_flag_allow_retry_first(flags) && !(flags & FAULT_FLAG_RETRY_NOWAIT)) { fpin = get_file(vmf->vma->vm_file); release_fault_lock(vmf); } return fpin; } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void mlock_new_folio(struct folio *folio) { } static inline bool need_mlock_drain(int cpu) { return false; } static inline void mlock_drain_local(void) { } static inline void mlock_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } #endif /* !CONFIG_MMU */ /* Memory initialisation debug and verification */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT DECLARE_STATIC_KEY_TRUE(deferred_pages); bool __init deferred_grow_zone(struct zone *zone, unsigned int order); #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE }; #ifdef CONFIG_DEBUG_MEMORY_INIT extern int mminit_loglevel; #define mminit_dprintk(level, prefix, fmt, arg...) \ do { \ if (level < mminit_loglevel) { \ if (level <= MMINIT_WARNING) \ pr_warn("mminit::" prefix " " fmt, ##arg); \ else \ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ } while (0) extern void mminit_verify_pageflags_layout(void); extern void mminit_verify_zonelist(void); #else static inline void mminit_dprintk(enum mminit_level level, const char *prefix, const char *fmt, ...) { } static inline void mminit_verify_pageflags_layout(void) { } static inline void mminit_verify_zonelist(void) { } #endif /* CONFIG_DEBUG_MEMORY_INIT */ #define NODE_RECLAIM_NOSCAN -2 #define NODE_RECLAIM_FULL -1 #define NODE_RECLAIM_SOME 0 #define NODE_RECLAIM_SUCCESS 1 #ifdef CONFIG_NUMA extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { return NODE_RECLAIM_NOSCAN; } static inline int find_next_best_node(int node, nodemask_t *used_node_mask) { return NUMA_NO_NODE; } #endif /* * mm/memory-failure.c */ #ifdef CONFIG_MEMORY_FAILURE void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu); void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; #define MAGIC_HWPOISON 0x48575053U /* HWPS */ void SetPageHWPoisonTakenOff(struct page *page); void ClearPageHWPoisonTakenOff(struct page *page); bool take_page_off_buddy(struct page *page); bool put_page_back_buddy(struct page *page); struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long ksm_addr); unsigned long page_mapped_in_vma(const struct page *page, struct vm_area_struct *vma); #else static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) { } #endif extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); struct folio *alloc_migrate_folio(struct folio *src, unsigned long private); unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ #define ALLOC_WMARK_MIN WMARK_MIN #define ALLOC_WMARK_LOW WMARK_LOW #define ALLOC_WMARK_HIGH WMARK_HIGH #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ /* Mask to get the watermark bits */ #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) /* * Only MMU archs have async oom victim reclaim - aka oom_reaper so we * cannot assume a reduced access to memory reserves is sufficient for * !MMU */ #ifdef CONFIG_MMU #define ALLOC_OOM 0x08 #else #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif #define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access * to 25% of the min watermark or * 62.5% if __GFP_HIGH is set. */ #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% * of the min watermark. */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 #define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */ #else #define ALLOC_NOFRAGMENT 0x0 #endif #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) enum ttu_flags; struct tlbflush_unmap_batch; /* * only for MM internal work items which do not depend on * any allocations or locks which might depend on allocations */ extern struct workqueue_struct *mm_percpu_wq; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); void flush_tlb_batched_pending(struct mm_struct *mm); #else static inline void try_to_unmap_flush(void) { } static inline void try_to_unmap_flush_dirty(void) { } static inline void flush_tlb_batched_pending(struct mm_struct *mm) { } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; static inline bool is_migrate_highatomic(enum migratetype migratetype) { return migratetype == MIGRATE_HIGHATOMIC; } void setup_zone_pageset(struct zone *zone); struct migration_target_control { int nid; /* preferred node id */ nodemask_t *nmask; gfp_t gfp_mask; enum migrate_reason reason; }; /* * mm/filemap.c */ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, struct folio *folio, loff_t fpos, size_t size); /* * mm/vmalloc.c */ #ifdef CONFIG_MMU void __init vmalloc_init(void); int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); unsigned int get_vm_area_page_order(struct vm_struct *vm); #else static inline void vmalloc_init(void) { } static inline int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { return -EINVAL; } #endif int __must_check __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int *flags, bool writable, int *last_cpupid); void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_folio(struct folio *folio); struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller); /* * mm/gup.c */ int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags); /* * mm/huge_memory.c */ void touch_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, bool write); void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); /* * Parses a string with mem suffixes into its order. Useful to parse kernel * parameters. */ static inline int get_order_from_str(const char *size_str, unsigned long valid_orders) { unsigned long size; char *endptr; int order; size = memparse(size_str, &endptr); if (!is_power_of_2(size)) return -EINVAL; order = get_order(size); if (BIT(order) & ~valid_orders) return -EINVAL; return order; } enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, /* a retry, previous pass started an IO */ FOLL_TRIED = 1 << 17, /* we are working on non-current tsk/mm */ FOLL_REMOTE = 1 << 18, /* pages must be released via unpin_user_page */ FOLL_PIN = 1 << 19, /* gup_fast: prevent fall-back to slow gup */ FOLL_FAST_ONLY = 1 << 20, /* allow unlocking the mmap lock */ FOLL_UNLOCKABLE = 1 << 21, /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */ FOLL_MADV_POPULATE = 1 << 22, }; #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \ FOLL_MADV_POPULATE) /* * Indicates for which pages that are write-protected in the page table, * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the * GUP pin will remain consistent with the pages mapped into the page tables * of the MM. * * Temporary unmapping of PageAnonExclusive() pages or clearing of * PageAnonExclusive() has to protect against concurrent GUP: * * Ordinary GUP: Using the PT lock * * GUP-fast and fork(): mm->write_protect_seq * * GUP-fast and KSM or temporary unmapping (swap, migration): see * folio_try_share_anon_rmap_*() * * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a * PTE-mapped THP. * * If the vma is NULL, we're coming from the GUP-fast path and might have * to fallback to the slow path just to lookup the vma. */ static inline bool gup_must_unshare(struct vm_area_struct *vma, unsigned int flags, struct page *page) { /* * FOLL_WRITE is implicitly handled correctly as the page table entry * has to be writable -- and if it references (part of) an anonymous * folio, that part is required to be marked exclusive. */ if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) return false; /* * Note: PageAnon(page) is stable until the page is actually getting * freed. */ if (!PageAnon(page)) { /* * We only care about R/O long-term pining: R/O short-term * pinning does not have the semantics to observe successive * changes through the process page tables. */ if (!(flags & FOLL_LONGTERM)) return false; /* We really need the vma ... */ if (!vma) return true; /* * ... because we only care about writable private ("COW") * mappings where we have to break COW early. */ return is_cow_mapping(vma->vm_flags); } /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_rmb(); /* * Note that KSM pages cannot be exclusive, and consequently, * cannot get pinned. */ return !PageAnonExclusive(page); } extern bool mirrored_kernelcore; extern bool memblock_has_mirror(void); static __always_inline void vma_set_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; } static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { /* * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty * enablements, because when without soft-dirty being compiled in, * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) * will be constantly true. */ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return false; /* * Soft-dirty is kind of special: its tracking is enabled when the * vma flags not set. */ return !(vma->vm_flags & VM_SOFTDIRTY); } static inline bool pmd_needs_soft_dirty_wp(struct vm_area_struct *vma, pmd_t pmd) { return vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd); } static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte) { return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte); } void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid); /* shrinker related functions */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) { shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); return shrinker->name ? 0 : -ENOMEM; } static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { kfree_const(shrinker->name); shrinker->name = NULL; } extern int shrinker_debugfs_add(struct shrinker *shrinker); extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id); extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id); #else /* CONFIG_SHRINKER_DEBUG */ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const char *fmt, va_list ap) { return 0; } static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { } static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id) { *debugfs_id = -1; return NULL; } static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id) { } #endif /* CONFIG_SHRINKER_DEBUG */ /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; #define mapping_set_update(xas, mapping) do { \ if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ xas_set_update(xas, workingset_update_node); \ xas_set_lru(xas, &shadow_nodes); \ } \ } while (0) /* mremap.c */ unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks, bool for_stack); #ifdef CONFIG_UNACCEPTED_MEMORY void accept_page(struct page *page); #else /* CONFIG_UNACCEPTED_MEMORY */ static inline void accept_page(struct page *page) { } #endif /* CONFIG_UNACCEPTED_MEMORY */ /* pagewalk.c */ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); /* pt_reclaim.c */ bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, pmd_t pmdval); void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, struct mmu_gather *tlb); #ifdef CONFIG_PT_RECLAIM bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, struct zap_details *details); #else static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, struct zap_details *details) { return false; } #endif /* CONFIG_PT_RECLAIM */ #endif /* __MM_INTERNAL_H */ |
| 327 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/processor.h * * Copyright (C) 1995-1999 Russell King * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_PROCESSOR_H #define __ASM_PROCESSOR_H /* * On arm64 systems, unaligned accesses by the CPU are cheap, and so there is * no point in shifting all network buffers by 2 bytes just to make some IP * header fields appear aligned in memory, potentially sacrificing some DMA * performance on some platforms. */ #define NET_IP_ALIGN 0 #define MTE_CTRL_GCR_USER_EXCL_SHIFT 0 #define MTE_CTRL_GCR_USER_EXCL_MASK 0xffff #define MTE_CTRL_TCF_SYNC (1UL << 16) #define MTE_CTRL_TCF_ASYNC (1UL << 17) #define MTE_CTRL_TCF_ASYMM (1UL << 18) #ifndef __ASSEMBLY__ #include <linux/build_bug.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/thread_info.h> #include <vdso/processor.h> #include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/hw_breakpoint.h> #include <asm/kasan.h> #include <asm/lse.h> #include <asm/pgtable-hwdef.h> #include <asm/pointer_auth.h> #include <asm/ptrace.h> #include <asm/spectre.h> #include <asm/types.h> /* * TASK_SIZE - the maximum size of a user space task. * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. */ #define DEFAULT_MAP_WINDOW_64 (UL(1) << VA_BITS_MIN) #define TASK_SIZE_64 (UL(1) << vabits_actual) #define TASK_SIZE_MAX (UL(1) << VA_BITS) #ifdef CONFIG_COMPAT #if defined(CONFIG_ARM64_64K_PAGES) && defined(CONFIG_KUSER_HELPERS) /* * With CONFIG_ARM64_64K_PAGES enabled, the last page is occupied * by the compat vectors page. */ #define TASK_SIZE_32 UL(0x100000000) #else #define TASK_SIZE_32 (UL(0x100000000) - PAGE_SIZE) #endif /* CONFIG_ARM64_64K_PAGES */ #define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \ TASK_SIZE_32 : TASK_SIZE_64) #define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ TASK_SIZE_32 : TASK_SIZE_64) #define DEFAULT_MAP_WINDOW (test_thread_flag(TIF_32BIT) ? \ TASK_SIZE_32 : DEFAULT_MAP_WINDOW_64) #else #define TASK_SIZE TASK_SIZE_64 #define DEFAULT_MAP_WINDOW DEFAULT_MAP_WINDOW_64 #endif /* CONFIG_COMPAT */ #ifdef CONFIG_ARM64_FORCE_52BIT #define STACK_TOP_MAX TASK_SIZE_64 #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) #else #define STACK_TOP_MAX DEFAULT_MAP_WINDOW_64 #define TASK_UNMAPPED_BASE (PAGE_ALIGN(DEFAULT_MAP_WINDOW / 4)) #endif /* CONFIG_ARM64_FORCE_52BIT */ #ifdef CONFIG_COMPAT #define AARCH32_VECTORS_BASE 0xffff0000 #define STACK_TOP (test_thread_flag(TIF_32BIT) ? \ AARCH32_VECTORS_BASE : STACK_TOP_MAX) #else #define STACK_TOP STACK_TOP_MAX #endif /* CONFIG_COMPAT */ #ifndef CONFIG_ARM64_FORCE_52BIT #define arch_get_mmap_end(addr, len, flags) \ (((addr) > DEFAULT_MAP_WINDOW) ? TASK_SIZE : DEFAULT_MAP_WINDOW) #define arch_get_mmap_base(addr, base) ((addr > DEFAULT_MAP_WINDOW) ? \ base + TASK_SIZE - DEFAULT_MAP_WINDOW :\ base) #endif /* CONFIG_ARM64_FORCE_52BIT */ extern phys_addr_t arm64_dma_phys_limit; #define ARCH_LOW_ADDRESS_LIMIT (arm64_dma_phys_limit - 1) struct debug_info { #ifdef CONFIG_HAVE_HW_BREAKPOINT /* Have we suspended stepping by a debugger? */ int suspended_step; /* Allow breakpoints and watchpoints to be disabled for this thread. */ int bps_disabled; int wps_disabled; /* Hardware breakpoints pinned to this task. */ struct perf_event *hbp_break[ARM_MAX_BRP]; struct perf_event *hbp_watch[ARM_MAX_WRP]; #endif }; enum vec_type { ARM64_VEC_SVE = 0, ARM64_VEC_SME, ARM64_VEC_MAX, }; enum fp_type { FP_STATE_CURRENT, /* Save based on current task state. */ FP_STATE_FPSIMD, FP_STATE_SVE, }; struct cpu_context { unsigned long x19; unsigned long x20; unsigned long x21; unsigned long x22; unsigned long x23; unsigned long x24; unsigned long x25; unsigned long x26; unsigned long x27; unsigned long x28; unsigned long fp; unsigned long sp; unsigned long pc; }; struct thread_struct { struct cpu_context cpu_context; /* cpu context */ /* * Whitelisted fields for hardened usercopy: * Maintainers must ensure manually that this contains no * implicit padding. */ struct { unsigned long tp_value; /* TLS register */ unsigned long tp2_value; u64 fpmr; unsigned long pad; struct user_fpsimd_state fpsimd_state; } uw; enum fp_type fp_type; /* registers FPSIMD or SVE? */ unsigned int fpsimd_cpu; void *sve_state; /* SVE registers, if any */ void *sme_state; /* ZA and ZT state, if any */ unsigned int vl[ARM64_VEC_MAX]; /* vector length */ unsigned int vl_onexec[ARM64_VEC_MAX]; /* vl after next exec */ unsigned long fault_address; /* fault info */ unsigned long fault_code; /* ESR_EL1 value */ struct debug_info debug; /* debugging */ struct user_fpsimd_state kernel_fpsimd_state; unsigned int kernel_fpsimd_cpu; #ifdef CONFIG_ARM64_PTR_AUTH struct ptrauth_keys_user keys_user; #ifdef CONFIG_ARM64_PTR_AUTH_KERNEL struct ptrauth_keys_kernel keys_kernel; #endif #endif #ifdef CONFIG_ARM64_MTE u64 mte_ctrl; #endif u64 sctlr_user; u64 svcr; u64 tpidr2_el0; u64 por_el0; #ifdef CONFIG_ARM64_GCS unsigned int gcs_el0_mode; unsigned int gcs_el0_locked; u64 gcspr_el0; u64 gcs_base; u64 gcs_size; #endif }; static inline unsigned int thread_get_vl(struct thread_struct *thread, enum vec_type type) { return thread->vl[type]; } static inline unsigned int thread_get_sve_vl(struct thread_struct *thread) { return thread_get_vl(thread, ARM64_VEC_SVE); } static inline unsigned int thread_get_sme_vl(struct thread_struct *thread) { return thread_get_vl(thread, ARM64_VEC_SME); } static inline unsigned int thread_get_cur_vl(struct thread_struct *thread) { if (system_supports_sme() && (thread->svcr & SVCR_SM_MASK)) return thread_get_sme_vl(thread); else return thread_get_sve_vl(thread); } unsigned int task_get_vl(const struct task_struct *task, enum vec_type type); void task_set_vl(struct task_struct *task, enum vec_type type, unsigned long vl); void task_set_vl_onexec(struct task_struct *task, enum vec_type type, unsigned long vl); unsigned int task_get_vl_onexec(const struct task_struct *task, enum vec_type type); static inline unsigned int task_get_sve_vl(const struct task_struct *task) { return task_get_vl(task, ARM64_VEC_SVE); } static inline unsigned int task_get_sme_vl(const struct task_struct *task) { return task_get_vl(task, ARM64_VEC_SME); } static inline void task_set_sve_vl(struct task_struct *task, unsigned long vl) { task_set_vl(task, ARM64_VEC_SVE, vl); } static inline unsigned int task_get_sve_vl_onexec(const struct task_struct *task) { return task_get_vl_onexec(task, ARM64_VEC_SVE); } static inline void task_set_sve_vl_onexec(struct task_struct *task, unsigned long vl) { task_set_vl_onexec(task, ARM64_VEC_SVE, vl); } #define SCTLR_USER_MASK \ (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | SCTLR_ELx_ENDA | SCTLR_ELx_ENDB | \ SCTLR_EL1_TCF0_MASK) static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Verify that there is no padding among the whitelisted fields: */ BUILD_BUG_ON(sizeof_field(struct thread_struct, uw) != sizeof_field(struct thread_struct, uw.tp_value) + sizeof_field(struct thread_struct, uw.tp2_value) + sizeof_field(struct thread_struct, uw.fpmr) + sizeof_field(struct thread_struct, uw.pad) + sizeof_field(struct thread_struct, uw.fpsimd_state)); *offset = offsetof(struct thread_struct, uw); *size = sizeof_field(struct thread_struct, uw); } #ifdef CONFIG_COMPAT #define task_user_tls(t) \ ({ \ unsigned long *__tls; \ if (is_compat_thread(task_thread_info(t))) \ __tls = &(t)->thread.uw.tp2_value; \ else \ __tls = &(t)->thread.uw.tp_value; \ __tls; \ }) #else #define task_user_tls(t) (&(t)->thread.uw.tp_value) #endif /* Sync TPIDR_EL0 back to thread_struct for current */ void tls_preserve_current_state(void); #define INIT_THREAD { \ .fpsimd_cpu = NR_CPUS, \ } static inline void start_thread_common(struct pt_regs *regs, unsigned long pc, unsigned long pstate) { /* * Ensure all GPRs are zeroed, and initialize PC + PSTATE. * The SP (or compat SP) will be initialized later. */ regs->user_regs = (struct user_pt_regs) { .pc = pc, .pstate = pstate, }; /* * To allow the syscalls:sys_exit_execve tracepoint we need to preserve * syscallno, but do not need orig_x0 or the original GPRs. */ regs->orig_x0 = 0; /* * An exec from a kernel thread won't have an existing PMR value. */ if (system_uses_irq_prio_masking()) regs->pmr = GIC_PRIO_IRQON; /* * The pt_regs::stackframe field must remain valid throughout this * function as a stacktrace can be taken at any time. Any user or * kernel task should have a valid final frame. */ WARN_ON_ONCE(regs->stackframe.record.fp != 0); WARN_ON_ONCE(regs->stackframe.record.lr != 0); WARN_ON_ONCE(regs->stackframe.type != FRAME_META_TYPE_FINAL); } static inline void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { start_thread_common(regs, pc, PSR_MODE_EL0t); spectre_v4_enable_task_mitigation(current); regs->sp = sp; } #ifdef CONFIG_COMPAT static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { unsigned long pstate = PSR_AA32_MODE_USR; if (pc & 1) pstate |= PSR_AA32_T_BIT; if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) pstate |= PSR_AA32_E_BIT; start_thread_common(regs, pc, pstate); spectre_v4_enable_task_mitigation(current); regs->compat_sp = sp; } #endif static __always_inline bool is_ttbr0_addr(unsigned long addr) { /* entry assembly clears tags for TTBR0 addrs */ return addr < TASK_SIZE; } static __always_inline bool is_ttbr1_addr(unsigned long addr) { /* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */ return arch_kasan_reset_tag(addr) >= PAGE_OFFSET; } /* Forward declaration, a strange C thing */ struct task_struct; unsigned long __get_wchan(struct task_struct *p); void update_sctlr_el1(u64 sctlr); /* Thread switching */ extern struct task_struct *cpu_switch_to(struct task_struct *prev, struct task_struct *next); #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) #define KSTK_EIP(tsk) ((unsigned long)task_pt_regs(tsk)->pc) #define KSTK_ESP(tsk) user_stack_pointer(task_pt_regs(tsk)) /* * Prefetching support */ #define ARCH_HAS_PREFETCH static inline void prefetch(const void *ptr) { asm volatile("prfm pldl1keep, %a0\n" : : "p" (ptr)); } #define ARCH_HAS_PREFETCHW static inline void prefetchw(const void *ptr) { asm volatile("prfm pstl1keep, %a0\n" : : "p" (ptr)); } extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */ extern void __init minsigstksz_setup(void); /* * Not at the top of the file due to a direct #include cycle between * <asm/fpsimd.h> and <asm/processor.h>. Deferring this #include * ensures that contents of processor.h are visible to fpsimd.h even if * processor.h is included first. * * These prctl helpers are the only things in this file that require * fpsimd.h. The core code expects them to be in this header. */ #include <asm/fpsimd.h> /* Userspace interface for PR_S[MV]E_{SET,GET}_VL prctl()s: */ #define SVE_SET_VL(arg) sve_set_current_vl(arg) #define SVE_GET_VL() sve_get_current_vl() #define SME_SET_VL(arg) sme_set_current_vl(arg) #define SME_GET_VL() sme_get_current_vl() /* PR_PAC_RESET_KEYS prctl */ #define PAC_RESET_KEYS(tsk, arg) ptrauth_prctl_reset_keys(tsk, arg) /* PR_PAC_{SET,GET}_ENABLED_KEYS prctl */ #define PAC_SET_ENABLED_KEYS(tsk, keys, enabled) \ ptrauth_set_enabled_keys(tsk, keys, enabled) #define PAC_GET_ENABLED_KEYS(tsk) ptrauth_get_enabled_keys(tsk) #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI /* PR_{SET,GET}_TAGGED_ADDR_CTRL prctl */ long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg); long get_tagged_addr_ctrl(struct task_struct *task); #define SET_TAGGED_ADDR_CTRL(arg) set_tagged_addr_ctrl(current, arg) #define GET_TAGGED_ADDR_CTRL() get_tagged_addr_ctrl(current) #endif int get_tsc_mode(unsigned long adr); int set_tsc_mode(unsigned int val); #define GET_TSC_CTL(adr) get_tsc_mode((adr)) #define SET_TSC_CTL(val) set_tsc_mode((val)) #endif /* __ASSEMBLY__ */ #endif /* __ASM_PROCESSOR_H */ |
| 346 172 172 362 362 24 127 18 559 82 82 82 62 559 343 375 353 354 355 4 353 770 768 376 120 62 771 767 362 43 769 31 769 375 375 571 771 364 134 7 82 7 364 364 4 364 82 364 559 558 559 557 352 353 353 350 17 82 353 4 2 4 355 355 355 354 355 354 82 338 4 355 145 770 373 771 768 354 344 733 645 365 364 365 362 350 350 60 349 350 350 350 322 349 350 349 89 89 89 364 80 96 343 285 89 82 82 4 82 82 82 82 337 338 337 338 82 82 4 82 82 82 6 6 64 2 15 53 53 23 2 82 82 82 20 82 82 82 79 82 82 82 70 351 82 82 82 82 82 82 82 82 82 80 82 79 13 52 41 11 38 49 24 4 25 1 17 12 5 65 4 56 11 4 64 64 64 375 373 121 375 375 4 4 4 48 16 34 48 48 32 25 23 2 46 4 82 7 9 48 9 9 9 9 82 82 2 82 82 82 24 82 82 4 82 82 82 82 7 82 82 82 82 82 82 82 82 82 82 82 25 32 48 2 32 25 33 18 48 757 758 757 743 544 546 758 757 757 754 2 48 48 4 9 48 48 48 48 48 48 48 48 48 9 41 41 24 17 41 37 13 29 62 62 4 62 59 7 62 62 4 62 62 62 61 62 62 62 60 62 41 6 25 29 25 6 41 41 6 25 41 66 13 62 134 134 8 134 134 316 37 353 34 16 374 375 375 375 375 57 37 37 37 37 15 3 15 2 16 1 1 13 7 279 97 97 96 97 97 96 37 37 37 19 18 37 37 304 267 80 338 347 346 301 1 346 9 69 342 347 8 8 8 63 72 6 7 125 9 177 365 4 44 44 66 372 66 353 215 64 44 8 347 37 66 351 34 367 87 302 134 346 62 13 348 374 353 375 375 370 134 375 368 86 61 44 348 360 359 361 352 145 8 27 19 19 19 19 19 19 365 363 44 44 44 39 12 44 44 364 370 368 73 358 357 369 44 369 370 370 74 557 364 557 4 555 558 557 557 554 558 558 363 648 649 644 563 563 76 645 562 644 557 647 649 648 557 557 19 19 19 19 19 19 19 197 279 626 277 277 19 19 19 19 19 19 19 19 4 4 4 4 4 4 4 4 127 127 4 4 127 7 127 344 345 345 328 329 19 327 327 329 356 355 357 336 343 343 374 372 372 122 122 361 346 151 345 346 125 51 51 370 370 17 17 370 370 134 135 558 558 314 557 558 1 278 558 558 360 97 97 97 97 97 8 8 8 8 4 19 19 357 358 291 127 17 514 515 514 515 48 165 199 514 48 512 362 364 364 364 363 364 364 362 364 363 364 363 362 364 364 362 364 364 364 362 364 364 364 362 364 364 364 363 364 363 364 364 364 364 364 364 364 364 363 364 364 364 363 364 364 364 364 361 364 362 362 364 364 364 364 363 364 364 364 363 362 364 364 364 364 364 364 364 364 364 364 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 | // SPDX-License-Identifier: GPL-2.0+ /* * Maple Tree implementation * Copyright (c) 2018-2022 Oracle Corporation * Authors: Liam R. Howlett <Liam.Howlett@oracle.com> * Matthew Wilcox <willy@infradead.org> * Copyright (c) 2023 ByteDance * Author: Peng Zhang <zhangpeng.00@bytedance.com> */ /* * DOC: Interesting implementation details of the Maple Tree * * Each node type has a number of slots for entries and a number of slots for * pivots. In the case of dense nodes, the pivots are implied by the position * and are simply the slot index + the minimum of the node. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges. Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. * * * The following illustrates the layout of a range64 nodes slots and pivots. * * * Slots -> | 0 | 1 | 2 | ... | 12 | 13 | 14 | 15 | * ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ * │ │ │ │ │ │ │ │ └─ Implied maximum * │ │ │ │ │ │ │ └─ Pivot 14 * │ │ │ │ │ │ └─ Pivot 13 * │ │ │ │ │ └─ Pivot 12 * │ │ │ │ └─ Pivot 11 * │ │ │ └─ Pivot 2 * │ │ └─ Pivot 1 * │ └─ Pivot 0 * └─ Implied minimum * * Slot contents: * Internal (non-leaf) nodes contain pointers to other nodes. * Leaf nodes contain entries. * * The location of interest is often referred to as an offset. All offsets have * a slot, but the last offset has an implied pivot from the node above (or * UINT_MAX for the root node. * * Ranges complicate certain write activities. When modifying any of * the B-tree variants, it is known that one entry will either be added or * deleted. When modifying the Maple Tree, one store operation may overwrite * the entire data set, or one half of the tree, or the middle half of the tree. * */ #include <linux/maple_tree.h> #include <linux/xarray.h> #include <linux/types.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/limits.h> #include <asm/barrier.h> #define CREATE_TRACE_POINTS #include <trace/events/maple_tree.h> /* * Kernel pointer hashing renders much of the maple tree dump useless as tagged * pointers get hashed to arbitrary values. * * If CONFIG_DEBUG_VM_MAPLE_TREE is set we are in a debug mode where it is * permissible to bypass this. Otherwise remain cautious and retain the hashing. * * Userland doesn't know about %px so also use %p there. */ #if defined(__KERNEL__) && defined(CONFIG_DEBUG_VM_MAPLE_TREE) #define PTR_FMT "%px" #else #define PTR_FMT "%p" #endif #define MA_ROOT_PARENT 1 /* * Maple state flags * * MA_STATE_BULK - Bulk insert mode * * MA_STATE_REBALANCE - Indicate a rebalance during bulk insert * * MA_STATE_PREALLOC - Preallocated nodes, WARN_ON allocation */ #define MA_STATE_BULK 1 #define MA_STATE_REBALANCE 2 #define MA_STATE_PREALLOC 4 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) #define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) #define ma_mnode_ptr(x) ((struct maple_node *)(x)) #define ma_enode_ptr(x) ((struct maple_enode *)(x)) static struct kmem_cache *maple_node_cache; #ifdef CONFIG_DEBUG_MAPLE_TREE static const unsigned long mt_max[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = ULONG_MAX, [maple_range_64] = ULONG_MAX, [maple_arange_64] = ULONG_MAX, }; #define mt_node_max(x) mt_max[mte_node_type(x)] #endif static const unsigned char mt_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = MAPLE_RANGE64_SLOTS, [maple_range_64] = MAPLE_RANGE64_SLOTS, [maple_arange_64] = MAPLE_ARANGE64_SLOTS, }; #define mt_slot_count(x) mt_slots[mte_node_type(x)] static const unsigned char mt_pivots[] = { [maple_dense] = 0, [maple_leaf_64] = MAPLE_RANGE64_SLOTS - 1, [maple_range_64] = MAPLE_RANGE64_SLOTS - 1, [maple_arange_64] = MAPLE_ARANGE64_SLOTS - 1, }; #define mt_pivot_count(x) mt_pivots[mte_node_type(x)] static const unsigned char mt_min_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS / 2, [maple_leaf_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_range_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_arange_64] = (MAPLE_ARANGE64_SLOTS / 2) - 1, }; #define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)] #define MAPLE_BIG_NODE_SLOTS (MAPLE_RANGE64_SLOTS * 2 + 2) #define MAPLE_BIG_NODE_GAPS (MAPLE_ARANGE64_SLOTS * 2 + 1) struct maple_big_node { unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1]; union { struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS]; struct { unsigned long padding[MAPLE_BIG_NODE_GAPS]; unsigned long gap[MAPLE_BIG_NODE_GAPS]; }; }; unsigned char b_end; enum maple_type type; }; /* * The maple_subtree_state is used to build a tree to replace a segment of an * existing tree in a more atomic way. Any walkers of the older tree will hit a * dead node and restart on updates. */ struct maple_subtree_state { struct ma_state *orig_l; /* Original left side of subtree */ struct ma_state *orig_r; /* Original right side of subtree */ struct ma_state *l; /* New left side of subtree */ struct ma_state *m; /* New middle of subtree (rare) */ struct ma_state *r; /* New right side of subtree */ struct ma_topiary *free; /* nodes to be freed */ struct ma_topiary *destroy; /* Nodes to be destroyed (walked and freed) */ struct maple_big_node *bn; }; #ifdef CONFIG_KASAN_STACK /* Prevent mas_wr_bnode() from exceeding the stack frame limit */ #define noinline_for_kasan noinline_for_stack #else #define noinline_for_kasan inline #endif /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { return kmem_cache_alloc(maple_node_cache, gfp); } static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) { return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } static inline void mt_free_one(struct maple_node *node) { kmem_cache_free(maple_node_cache, node); } static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } static void mt_free_rcu(struct rcu_head *head) { struct maple_node *node = container_of(head, struct maple_node, rcu); kmem_cache_free(maple_node_cache, node); } /* * ma_free_rcu() - Use rcu callback to free a maple node * @node: The node to free * * The maple tree uses the parent pointer to indicate this node is no longer in * use and will be freed. */ static void ma_free_rcu(struct maple_node *node) { WARN_ON(node->parent != ma_parent_ptr(node)); call_rcu(&node->rcu, mt_free_rcu); } static void mas_set_height(struct ma_state *mas) { unsigned int new_flags = mas->tree->ma_flags; new_flags &= ~MT_FLAGS_HEIGHT_MASK; MAS_BUG_ON(mas, mas->depth > MAPLE_HEIGHT_MAX); new_flags |= mas->depth << MT_FLAGS_HEIGHT_OFFSET; mas->tree->ma_flags = new_flags; } static unsigned int mas_mt_height(struct ma_state *mas) { return mt_height(mas->tree); } static inline unsigned int mt_attr(struct maple_tree *mt) { return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; } static __always_inline enum maple_type mte_node_type( const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & MAPLE_NODE_TYPE_MASK; } static __always_inline bool ma_is_dense(const enum maple_type type) { return type < maple_leaf_64; } static __always_inline bool ma_is_leaf(const enum maple_type type) { return type < maple_range_64; } static __always_inline bool mte_is_leaf(const struct maple_enode *entry) { return ma_is_leaf(mte_node_type(entry)); } /* * We also reserve values with the bottom two bits set to '10' which are * below 4096 */ static __always_inline bool mt_is_reserved(const void *entry) { return ((unsigned long)entry < MAPLE_RESERVED_RANGE) && xa_is_internal(entry); } static __always_inline void mas_set_err(struct ma_state *mas, long err) { mas->node = MA_ERROR(err); mas->status = ma_error; } static __always_inline bool mas_is_ptr(const struct ma_state *mas) { return mas->status == ma_root; } static __always_inline bool mas_is_start(const struct ma_state *mas) { return mas->status == ma_start; } static __always_inline bool mas_is_none(const struct ma_state *mas) { return mas->status == ma_none; } static __always_inline bool mas_is_paused(const struct ma_state *mas) { return mas->status == ma_pause; } static __always_inline bool mas_is_overflow(struct ma_state *mas) { return mas->status == ma_overflow; } static inline bool mas_is_underflow(struct ma_state *mas) { return mas->status == ma_underflow; } static __always_inline struct maple_node *mte_to_node( const struct maple_enode *entry) { return (struct maple_node *)((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mte_to_mat() - Convert a maple encoded node to a maple topiary node. * @entry: The maple encoded node * * Return: a maple topiary pointer */ static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry) { return (struct maple_topiary *) ((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mas_mn() - Get the maple state node. * @mas: The maple state * * Return: the maple node (not encoded - bare pointer). */ static inline struct maple_node *mas_mn(const struct ma_state *mas) { return mte_to_node(mas->node); } /* * mte_set_node_dead() - Set a maple encoded node as dead. * @mn: The maple encoded node. */ static inline void mte_set_node_dead(struct maple_enode *mn) { mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn)); smp_wmb(); /* Needed for RCU */ } /* Bit 1 indicates the root is a node */ #define MAPLE_ROOT_NODE 0x02 /* maple_type stored bit 3-6 */ #define MAPLE_ENODE_TYPE_SHIFT 0x03 /* Bit 2 means a NULL somewhere below */ #define MAPLE_ENODE_NULL 0x04 static inline struct maple_enode *mt_mk_node(const struct maple_node *node, enum maple_type type) { return (void *)((unsigned long)node | (type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL); } static inline void *mte_mk_root(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ROOT_NODE); } static inline void *mte_safe_root(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); } static inline void __maybe_unused *mte_set_full(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); } static inline void __maybe_unused *mte_clear_full(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ENODE_NULL); } static inline bool __maybe_unused mte_has_null(const struct maple_enode *node) { return (unsigned long)node & MAPLE_ENODE_NULL; } static __always_inline bool ma_is_root(struct maple_node *node) { return ((unsigned long)node->parent & MA_ROOT_PARENT); } static __always_inline bool mte_is_root(const struct maple_enode *node) { return ma_is_root(mte_to_node(node)); } static inline bool mas_is_root_limits(const struct ma_state *mas) { return !mas->min && mas->max == ULONG_MAX; } static __always_inline bool mt_is_alloc(struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE); } /* * The Parent Pointer * Excluding root, the parent pointer is 256B aligned like all other tree nodes. * When storing a 32 or 64 bit values, the offset can fit into 5 bits. The 16 * bit values need an extra bit to store the offset. This extra bit comes from * a reuse of the last bit in the node type. This is possible by using bit 1 to * indicate if bit 2 is part of the type or the slot. * * Note types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and alignment * 0b??1 : Root * 0b?00 : 16 bit values, type in 0-1, slot in 2-7 * 0b010 : 32 bit values, type in 0-2, slot in 3-7 * 0b110 : 64 bit values, type in 0-2, slot in 3-7 */ #define MAPLE_PARENT_ROOT 0x01 #define MAPLE_PARENT_SLOT_SHIFT 0x03 #define MAPLE_PARENT_SLOT_MASK 0xF8 #define MAPLE_PARENT_16B_SLOT_SHIFT 0x02 #define MAPLE_PARENT_16B_SLOT_MASK 0xFC #define MAPLE_PARENT_RANGE64 0x06 #define MAPLE_PARENT_RANGE32 0x04 #define MAPLE_PARENT_NOT_RANGE16 0x02 /* * mte_parent_shift() - Get the parent shift for the slot storage. * @parent: The parent pointer cast as an unsigned long * Return: The shift into that pointer to the star to of the slot */ static inline unsigned long mte_parent_shift(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_SHIFT; return MAPLE_PARENT_16B_SLOT_SHIFT; } /* * mte_parent_slot_mask() - Get the slot mask for the parent. * @parent: The parent pointer cast as an unsigned long. * Return: The slot mask for that parent. */ static inline unsigned long mte_parent_slot_mask(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_MASK; return MAPLE_PARENT_16B_SLOT_MASK; } /* * mas_parent_type() - Return the maple_type of the parent from the stored * parent type. * @mas: The maple state * @enode: The maple_enode to extract the parent's enum * Return: The node->parent maple_type */ static inline enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode) { unsigned long p_type; p_type = (unsigned long)mte_to_node(enode)->parent; if (WARN_ON(p_type & MAPLE_PARENT_ROOT)) return 0; p_type &= MAPLE_NODE_MASK; p_type &= ~mte_parent_slot_mask(p_type); switch (p_type) { case MAPLE_PARENT_RANGE64: /* or MAPLE_PARENT_ARANGE64 */ if (mt_is_alloc(mas->tree)) return maple_arange_64; return maple_range_64; } return 0; } /* * mas_set_parent() - Set the parent node and encode the slot * @mas: The maple state * @enode: The encoded maple node. * @parent: The encoded maple node that is the parent of @enode. * @slot: The slot that @enode resides in @parent. * * Slot number is encoded in the enode->parent bit 3-6 or 2-6, depending on the * parent type. */ static inline void mas_set_parent(struct ma_state *mas, struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); MAS_BUG_ON(mas, p_type == maple_dense); MAS_BUG_ON(mas, p_type == maple_leaf_64); switch (p_type) { case maple_range_64: case maple_arange_64: shift = MAPLE_PARENT_SLOT_SHIFT; type = MAPLE_PARENT_RANGE64; break; default: case maple_dense: case maple_leaf_64: shift = type = 0; break; } val &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */ val |= (slot << shift) | type; mte_to_node(enode)->parent = ma_parent_ptr(val); } /* * mte_parent_slot() - get the parent slot of @enode. * @enode: The encoded maple node. * * Return: The slot in the parent node where @enode resides. */ static __always_inline unsigned int mte_parent_slot(const struct maple_enode *enode) { unsigned long val = (unsigned long)mte_to_node(enode)->parent; if (unlikely(val & MA_ROOT_PARENT)) return 0; /* * Okay to use MAPLE_PARENT_16B_SLOT_MASK as the last bit will be lost * by shift if the parent shift is MAPLE_PARENT_SLOT_SHIFT */ return (val & MAPLE_PARENT_16B_SLOT_MASK) >> mte_parent_shift(val); } /* * mte_parent() - Get the parent of @node. * @enode: The encoded maple node. * * Return: The parent maple node. */ static __always_inline struct maple_node *mte_parent(const struct maple_enode *enode) { return (void *)((unsigned long) (mte_to_node(enode)->parent) & ~MAPLE_NODE_MASK); } /* * ma_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool ma_dead_node(const struct maple_node *node) { struct maple_node *parent; /* Do not reorder reads from the node prior to the parent check */ smp_rmb(); parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK); return (parent == node); } /* * mte_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool mte_dead_node(const struct maple_enode *enode) { struct maple_node *parent, *node; node = mte_to_node(enode); /* Do not reorder reads from the node prior to the parent check */ smp_rmb(); parent = mte_parent(enode); return (parent == node); } /* * mas_allocated() - Get the number of nodes allocated in a maple state. * @mas: The maple state * * The ma_state alloc member is overloaded to hold a pointer to the first * allocated node or to the number of requested nodes to allocate. If bit 0 is * set, then the alloc contains the number of requested nodes. If there is an * allocated node, then the total allocated nodes is in that node. * * Return: The total number of nodes allocated */ static inline unsigned long mas_allocated(const struct ma_state *mas) { if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) return 0; return mas->alloc->total; } /* * mas_set_alloc_req() - Set the requested number of allocations. * @mas: the maple state * @count: the number of allocations. * * The requested number of allocations is either in the first allocated node, * located in @mas->alloc->request_count, or directly in @mas->alloc if there is * no allocated node. Set the request either in the node or do the necessary * encoding to store in @mas->alloc directly. */ static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count) { if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) { if (!count) mas->alloc = NULL; else mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U); return; } mas->alloc->request_count = count; } /* * mas_alloc_req() - get the requested number of allocations. * @mas: The maple state * * The alloc count is either stored directly in @mas, or in * @mas->alloc->request_count if there is at least one node allocated. Decode * the request count if it's stored directly in @mas->alloc. * * Return: The allocation request count. */ static inline unsigned int mas_alloc_req(const struct ma_state *mas) { if ((unsigned long)mas->alloc & 0x1) return (unsigned long)(mas->alloc) >> 1; else if (mas->alloc) return mas->alloc->request_count; return 0; } /* * ma_pivots() - Get a pointer to the maple node pivots. * @node: the maple node * @type: the node type * * In the event of a dead node, this array may be %NULL * * Return: A pointer to the maple node pivots */ static inline unsigned long *ma_pivots(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.pivot; case maple_range_64: case maple_leaf_64: return node->mr64.pivot; case maple_dense: return NULL; } return NULL; } /* * ma_gaps() - Get a pointer to the maple node gaps. * @node: the maple node * @type: the node type * * Return: A pointer to the maple node gaps */ static inline unsigned long *ma_gaps(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.gap; case maple_range_64: case maple_leaf_64: case maple_dense: return NULL; } return NULL; } /* * mas_safe_pivot() - get the pivot at @piv or mas->max. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @piv: The pivot to fetch * @type: The maple node type * * Return: The pivot at @piv within the limit of the @pivots array, @mas->max * otherwise. */ static __always_inline unsigned long mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots, unsigned char piv, enum maple_type type) { if (piv >= mt_pivots[type]) return mas->max; return pivots[piv]; } /* * mas_safe_min() - Return the minimum for a given offset. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @offset: The offset into the pivot array * * Return: The minimum range value that is contained in @offset. */ static inline unsigned long mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset) { if (likely(offset)) return pivots[offset - 1] + 1; return mas->min; } /* * mte_set_pivot() - Set a pivot to a value in an encoded maple node. * @mn: The encoded maple node * @piv: The pivot offset * @val: The value of the pivot */ static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv, unsigned long val) { struct maple_node *node = mte_to_node(mn); enum maple_type type = mte_node_type(mn); BUG_ON(piv >= mt_pivots[type]); switch (type) { case maple_range_64: case maple_leaf_64: node->mr64.pivot[piv] = val; break; case maple_arange_64: node->ma64.pivot[piv] = val; break; case maple_dense: break; } } /* * ma_slots() - Get a pointer to the maple node slots. * @mn: The maple node * @mt: The maple node type * * Return: A pointer to the maple node slots */ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return mn->ma64.slot; case maple_range_64: case maple_leaf_64: return mn->mr64.slot; case maple_dense: return mn->slot; } return NULL; } static inline bool mt_write_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_write_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline bool mt_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline void *mt_slot(const struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_check(slots[offset], mt_locked(mt)); } static __always_inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_protected(slots[offset], mt_write_locked(mt)); } /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset. */ static __always_inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot_locked(mas->tree, slots, offset); } /* * mas_slot() - Get the slot value when not holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset */ static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot(mas->tree, slots, offset); } /* * mas_root() - Get the maple tree root. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static __always_inline void *mas_root(struct ma_state *mas) { return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree)); } static inline void *mt_root_locked(struct maple_tree *mt) { return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt)); } /* * mas_root_locked() - Get the maple tree root when holding the maple tree lock. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static inline void *mas_root_locked(struct ma_state *mas) { return mt_root_locked(mas->tree); } static inline struct maple_metadata *ma_meta(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return &mn->ma64.meta; default: return &mn->mr64.meta; } } /* * ma_set_meta() - Set the metadata information of a node. * @mn: The maple node * @mt: The maple node type * @offset: The offset of the highest sub-gap in this node. * @end: The end of the data in this node. */ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, unsigned char offset, unsigned char end) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; meta->end = end; } /* * mt_clear_meta() - clear the metadata information of a node, if it exists * @mt: The maple tree * @mn: The maple node * @type: The maple node type */ static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, enum maple_type type) { struct maple_metadata *meta; unsigned long *pivots; void __rcu **slots; void *next; switch (type) { case maple_range_64: pivots = mn->mr64.pivot; if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { slots = mn->mr64.slot; next = mt_slot_locked(mt, slots, MAPLE_RANGE64_SLOTS - 1); if (unlikely((mte_to_node(next) && mte_node_type(next)))) return; /* no metadata, could be node */ } fallthrough; case maple_arange_64: meta = ma_meta(mn, type); break; default: return; } meta->gap = 0; meta->end = 0; } /* * ma_meta_end() - Get the data end of a node from the metadata * @mn: The maple node * @mt: The maple node type */ static inline unsigned char ma_meta_end(struct maple_node *mn, enum maple_type mt) { struct maple_metadata *meta = ma_meta(mn, mt); return meta->end; } /* * ma_meta_gap() - Get the largest gap location of a node from the metadata * @mn: The maple node */ static inline unsigned char ma_meta_gap(struct maple_node *mn) { return mn->ma64.meta.gap; } /* * ma_set_meta_gap() - Set the largest gap location in a nodes metadata * @mn: The maple node * @mt: The maple node type * @offset: The location of the largest gap. */ static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, unsigned char offset) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; } /* * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes. * @mat: the ma_topiary, a linked list of dead nodes. * @dead_enode: the node to be marked as dead and added to the tail of the list * * Add the @dead_enode to the linked list in @mat. */ static inline void mat_add(struct ma_topiary *mat, struct maple_enode *dead_enode) { mte_set_node_dead(dead_enode); mte_to_mat(dead_enode)->next = NULL; if (!mat->tail) { mat->tail = mat->head = dead_enode; return; } mte_to_mat(mat->tail)->next = dead_enode; mat->tail = dead_enode; } static void mt_free_walk(struct rcu_head *head); static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free); /* * mas_mat_destroy() - Free all nodes and subtrees in a dead list. * @mas: the maple state * @mat: the ma_topiary linked list of dead nodes to free. * * Destroy walk a dead list. */ static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) { struct maple_enode *next; struct maple_node *node; bool in_rcu = mt_in_rcu(mas->tree); while (mat->head) { next = mte_to_mat(mat->head)->next; node = mte_to_node(mat->head); mt_destroy_walk(mat->head, mas->tree, !in_rcu); if (in_rcu) call_rcu(&node->rcu, mt_free_walk); mat->head = next; } } /* * mas_descend() - Descend into the slot stored in the ma_state. * @mas: the maple state. * * Note: Not RCU safe, only use in write side or debug code. */ static inline void mas_descend(struct ma_state *mas) { enum maple_type type; unsigned long *pivots; struct maple_node *node; void __rcu **slots; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); slots = ma_slots(node, type); if (mas->offset) mas->min = pivots[mas->offset - 1] + 1; mas->max = mas_safe_pivot(mas, pivots, mas->offset, type); mas->node = mas_slot(mas, slots, mas->offset); } /* * mte_set_gap() - Set a maple node gap. * @mn: The encoded maple node * @gap: The offset of the gap to set * @val: The gap value */ static inline void mte_set_gap(const struct maple_enode *mn, unsigned char gap, unsigned long val) { switch (mte_node_type(mn)) { default: break; case maple_arange_64: mte_to_node(mn)->ma64.gap[gap] = val; break; } } /* * mas_ascend() - Walk up a level of the tree. * @mas: The maple state * * Sets the @mas->max and @mas->min to the correct values when walking up. This * may cause several levels of walking up to find the correct min and max. * May find a dead node which will cause a premature return. * Return: 1 on dead node, 0 otherwise */ static int mas_ascend(struct ma_state *mas) { struct maple_enode *p_enode; /* parent enode. */ struct maple_enode *a_enode; /* ancestor enode. */ struct maple_node *a_node; /* ancestor node. */ struct maple_node *p_node; /* parent node. */ unsigned char a_slot; enum maple_type a_type; unsigned long min, max; unsigned long *pivots; bool set_max = false, set_min = false; a_node = mas_mn(mas); if (ma_is_root(a_node)) { mas->offset = 0; return 0; } p_node = mte_parent(mas->node); if (unlikely(a_node == p_node)) return 1; a_type = mas_parent_type(mas, mas->node); mas->offset = mte_parent_slot(mas->node); a_enode = mt_mk_node(p_node, a_type); /* Check to make sure all parent information is still accurate */ if (p_node != mte_parent(mas->node)) return 1; mas->node = a_enode; if (mte_is_root(a_enode)) { mas->max = ULONG_MAX; mas->min = 0; return 0; } min = 0; max = ULONG_MAX; if (!mas->offset) { min = mas->min; set_min = true; } if (mas->max == ULONG_MAX) set_max = true; do { p_enode = a_enode; a_type = mas_parent_type(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); a_enode = mt_mk_node(a_node, a_type); pivots = ma_pivots(a_node, a_type); if (unlikely(ma_dead_node(a_node))) return 1; if (!set_min && a_slot) { set_min = true; min = pivots[a_slot - 1] + 1; } if (!set_max && a_slot < mt_pivots[a_type]) { set_max = true; max = pivots[a_slot]; } if (unlikely(ma_dead_node(a_node))) return 1; if (unlikely(ma_is_root(a_node))) break; } while (!set_min || !set_max); mas->max = max; mas->min = min; return 0; } /* * mas_pop_node() - Get a previously allocated maple node from the maple state. * @mas: The maple state * * Return: A pointer to a maple node. */ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); unsigned int req = mas_alloc_req(mas); /* nothing or a request pending. */ if (WARN_ON(!total)) return NULL; if (total == 1) { /* single allocation in this ma_state */ mas->alloc = NULL; ret = node; goto single_node; } if (node->node_count == 1) { /* Single allocation in this node. */ mas->alloc = node->slot[0]; mas->alloc->total = node->total - 1; ret = node; goto new_head; } node->total--; ret = node->slot[--node->node_count]; node->slot[node->node_count] = NULL; single_node: new_head: if (req) { req++; mas_set_alloc_req(mas, req); } memset(ret, 0, sizeof(*ret)); return (struct maple_node *)ret; } /* * mas_push_node() - Push a node back on the maple state allocation. * @mas: The maple state * @used: The used maple node * * Stores the maple node back into @mas->alloc for reuse. Updates allocated and * requested node count as necessary. */ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) { struct maple_alloc *reuse = (struct maple_alloc *)used; struct maple_alloc *head = mas->alloc; unsigned long count; unsigned int requested = mas_alloc_req(mas); count = mas_allocated(mas); reuse->request_count = 0; reuse->node_count = 0; if (count) { if (head->node_count < MAPLE_ALLOC_SLOTS) { head->slot[head->node_count++] = reuse; head->total++; goto done; } reuse->slot[0] = head; reuse->node_count = 1; } reuse->total = count + 1; mas->alloc = reuse; done: if (requested > 1) mas_set_alloc_req(mas, requested - 1); } /* * mas_alloc_nodes() - Allocate nodes into a maple state * @mas: The maple state * @gfp: The GFP Flags */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; unsigned int max_req = 0; if (!requested) return; mas_set_alloc_req(mas, 0); if (mas->mas_flags & MA_STATE_PREALLOC) { if (allocated) return; BUG_ON(!allocated); WARN_ON(!allocated); } if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { node = (struct maple_alloc *)mt_alloc_one(gfp); if (!node) goto nomem_one; if (allocated) { node->slot[0] = mas->alloc; node->node_count = 1; } else { node->node_count = 0; } mas->alloc = node; node->total = ++allocated; node->request_count = 0; requested--; } node = mas->alloc; while (requested) { max_req = MAPLE_ALLOC_SLOTS - node->node_count; slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; if (node->node_count == 0) { node->slot[0]->node_count = 0; node->slot[0]->request_count = 0; } node->node_count += count; allocated += count; /* find a non-full node*/ do { node = node->slot[0]; } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS)); requested -= count; } mas->alloc->total = allocated; return; nomem_bulk: /* Clean up potential freed allocations on bulk failure */ memset(slots, 0, max_req * sizeof(unsigned long)); mas->alloc->total = allocated; nomem_one: mas_set_alloc_req(mas, requested); mas_set_err(mas, -ENOMEM); } /* * mas_free() - Free an encoded maple node * @mas: The maple state * @used: The encoded maple node to free. * * Uses rcu free if necessary, pushes @used back on the maple state allocations * otherwise. */ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) { struct maple_node *tmp = mte_to_node(used); if (mt_in_rcu(mas->tree)) ma_free_rcu(tmp); else mas_push_node(mas, tmp); } /* * mas_node_count_gfp() - Check if enough nodes are allocated and request more * if there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * @gfp: the gfp flags */ static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) { unsigned long allocated = mas_allocated(mas); if (allocated < count) { mas_set_alloc_req(mas, count - allocated); mas_alloc_nodes(mas, gfp); } } /* * mas_node_count() - Check if enough nodes are allocated and request more if * there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags. */ static void mas_node_count(struct ma_state *mas, int count) { return mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN); } /* * mas_start() - Sets up maple state for operations. * @mas: The maple state. * * If mas->status == mas_start, then set the min, max and depth to * defaults. * * Return: * - If mas->node is an error or not mas_start, return NULL. * - If it's an empty tree: NULL & mas->status == ma_none * - If it's a single entry: The entry & mas->status == ma_root * - If it's a tree: NULL & mas->status == ma_active */ static inline struct maple_enode *mas_start(struct ma_state *mas) { if (likely(mas_is_start(mas))) { struct maple_enode *root; mas->min = 0; mas->max = ULONG_MAX; retry: mas->depth = 0; root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; if (mte_dead_node(mas->node)) goto retry; return NULL; } mas->node = NULL; /* empty tree */ if (unlikely(!root)) { mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; } /* Single entry tree */ mas->status = ma_root; mas->offset = MAPLE_NODE_SLOTS; /* Single entry tree. */ if (mas->index > 0) return NULL; return root; } return NULL; } /* * ma_data_end() - Find the end of the data in a node. * @node: The maple node * @type: The maple node type * @pivots: The array of pivots in the node * @max: The maximum value in the node * * Uses metadata to find the end of the data when possible. * Return: The zero indexed last slot with data (may be null). */ static __always_inline unsigned char ma_data_end(struct maple_node *node, enum maple_type type, unsigned long *pivots, unsigned long max) { unsigned char offset; if (!pivots) return 0; if (type == maple_arange_64) return ma_meta_end(node, type); offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == max)) return offset; return mt_pivots[type]; } /* * mas_data_end() - Find the end of the data (slot). * @mas: the maple state * * This method is optimized to check the metadata of a node if the node type * supports data end metadata. * * Return: The zero indexed last slot with data (may be null). */ static inline unsigned char mas_data_end(struct ma_state *mas) { enum maple_type type; struct maple_node *node; unsigned char offset; unsigned long *pivots; type = mte_node_type(mas->node); node = mas_mn(mas); if (type == maple_arange_64) return ma_meta_end(node, type); pivots = ma_pivots(node, type); if (unlikely(ma_dead_node(node))) return 0; offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == mas->max)) return offset; return mt_pivots[type]; } /* * mas_leaf_max_gap() - Returns the largest gap in a leaf node * @mas: the maple state * * Return: The maximum gap in the leaf. */ static unsigned long mas_leaf_max_gap(struct ma_state *mas) { enum maple_type mt; unsigned long pstart, gap, max_gap; struct maple_node *mn; unsigned long *pivots; void __rcu **slots; unsigned char i; unsigned char max_piv; mt = mte_node_type(mas->node); mn = mas_mn(mas); slots = ma_slots(mn, mt); max_gap = 0; if (unlikely(ma_is_dense(mt))) { gap = 0; for (i = 0; i < mt_slots[mt]; i++) { if (slots[i]) { if (gap > max_gap) max_gap = gap; gap = 0; } else { gap++; } } if (gap > max_gap) max_gap = gap; return max_gap; } /* * Check the first implied pivot optimizes the loop below and slot 1 may * be skipped if there is a gap in slot 0. */ pivots = ma_pivots(mn, mt); if (likely(!slots[0])) { max_gap = pivots[0] - mas->min + 1; i = 2; } else { i = 1; } /* reduce max_piv as the special case is checked before the loop */ max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1; /* * Check end implied pivot which can only be a gap on the right most * node. */ if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) { gap = ULONG_MAX - pivots[max_piv]; if (gap > max_gap) max_gap = gap; if (max_gap > pivots[max_piv] - mas->min) return max_gap; } for (; i <= max_piv; i++) { /* data == no gap. */ if (likely(slots[i])) continue; pstart = pivots[i - 1]; gap = pivots[i] - pstart; if (gap > max_gap) max_gap = gap; /* There cannot be two gaps in a row. */ i++; } return max_gap; } /* * ma_max_gap() - Get the maximum gap in a maple node (non-leaf) * @node: The maple node * @gaps: The pointer to the gaps * @mt: The maple node type * @off: Pointer to store the offset location of the gap. * * Uses the metadata data end to scan backwards across set gaps. * * Return: The maximum gap value */ static inline unsigned long ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt, unsigned char *off) { unsigned char offset, i; unsigned long max_gap = 0; i = offset = ma_meta_end(node, mt); do { if (gaps[i] > max_gap) { max_gap = gaps[i]; offset = i; } } while (i--); *off = offset; return max_gap; } /* * mas_max_gap() - find the largest gap in a non-leaf node and set the slot. * @mas: The maple state. * * Return: The gap value. */ static inline unsigned long mas_max_gap(struct ma_state *mas) { unsigned long *gaps; unsigned char offset; enum maple_type mt; struct maple_node *node; mt = mte_node_type(mas->node); if (ma_is_leaf(mt)) return mas_leaf_max_gap(mas); node = mas_mn(mas); MAS_BUG_ON(mas, mt != maple_arange_64); offset = ma_meta_gap(node); gaps = ma_gaps(node, mt); return gaps[offset]; } /* * mas_parent_gap() - Set the parent gap and any gaps above, as needed * @mas: The maple state * @offset: The gap offset in the parent to set * @new: The new gap value. * * Set the parent gap then continue to set the gap upwards, using the metadata * of the parent to see if it is necessary to check the node above. */ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, unsigned long new) { unsigned long meta_gap = 0; struct maple_node *pnode; struct maple_enode *penode; unsigned long *pgaps; unsigned char meta_offset; enum maple_type pmt; pnode = mte_parent(mas->node); pmt = mas_parent_type(mas, mas->node); penode = mt_mk_node(pnode, pmt); pgaps = ma_gaps(pnode, pmt); ascend: MAS_BUG_ON(mas, pmt != maple_arange_64); meta_offset = ma_meta_gap(pnode); meta_gap = pgaps[meta_offset]; pgaps[offset] = new; if (meta_gap == new) return; if (offset != meta_offset) { if (meta_gap > new) return; ma_set_meta_gap(pnode, pmt, offset); } else if (new < meta_gap) { new = ma_max_gap(pnode, pgaps, pmt, &meta_offset); ma_set_meta_gap(pnode, pmt, meta_offset); } if (ma_is_root(pnode)) return; /* Go to the parent node. */ pnode = mte_parent(penode); pmt = mas_parent_type(mas, penode); pgaps = ma_gaps(pnode, pmt); offset = mte_parent_slot(penode); penode = mt_mk_node(pnode, pmt); goto ascend; } /* * mas_update_gap() - Update a nodes gaps and propagate up if necessary. * @mas: the maple state. */ static inline void mas_update_gap(struct ma_state *mas) { unsigned char pslot; unsigned long p_gap; unsigned long max_gap; if (!mt_is_alloc(mas->tree)) return; if (mte_is_root(mas->node)) return; max_gap = mas_max_gap(mas); pslot = mte_parent_slot(mas->node); p_gap = ma_gaps(mte_parent(mas->node), mas_parent_type(mas, mas->node))[pslot]; if (p_gap != max_gap) mas_parent_gap(mas, pslot, max_gap); } /* * mas_adopt_children() - Set the parent pointer of all nodes in @parent to * @parent with the slot encoded. * @mas: the maple state (for the tree) * @parent: the maple encoded node containing the children. */ static inline void mas_adopt_children(struct ma_state *mas, struct maple_enode *parent) { enum maple_type type = mte_node_type(parent); struct maple_node *node = mte_to_node(parent); void __rcu **slots = ma_slots(node, type); unsigned long *pivots = ma_pivots(node, type); struct maple_enode *child; unsigned char offset; offset = ma_data_end(node, type, pivots, mas->max); do { child = mas_slot_locked(mas, slots, offset); mas_set_parent(mas, child, parent, offset); } while (offset--); } /* * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old * node as dead. * @mas: the maple state with the new node * @old_enode: The old maple encoded node to replace. */ static inline void mas_put_in_tree(struct ma_state *mas, struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { unsigned char offset; void __rcu **slots; if (mte_is_root(mas->node)) { mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); mas_set_height(mas); } else { offset = mte_parent_slot(mas->node); slots = ma_slots(mte_parent(mas->node), mas_parent_type(mas, mas->node)); rcu_assign_pointer(slots[offset], mas->node); } mte_set_node_dead(old_enode); } /* * mas_replace_node() - Replace a node by putting it in the tree, marking it * dead, and freeing it. * the parent encoding to locate the maple node in the tree. * @mas: the ma_state with @mas->node pointing to the new node. * @old_enode: The old maple encoded node. */ static inline void mas_replace_node(struct ma_state *mas, struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { mas_put_in_tree(mas, old_enode); mas_free(mas, old_enode); } /* * mas_find_child() - Find a child who has the parent @mas->node. * @mas: the maple state with the parent. * @child: the maple state to store the child. */ static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child) __must_hold(mas->tree->ma_lock) { enum maple_type mt; unsigned char offset; unsigned char end; unsigned long *pivots; struct maple_enode *entry; struct maple_node *node; void __rcu **slots; mt = mte_node_type(mas->node); node = mas_mn(mas); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); end = ma_data_end(node, mt, pivots, mas->max); for (offset = mas->offset; offset <= end; offset++) { entry = mas_slot_locked(mas, slots, offset); if (mte_parent(entry) == node) { *child = *mas; mas->offset = offset + 1; child->offset = offset; mas_descend(child); child->offset = 0; return true; } } return false; } /* * mab_shift_right() - Shift the data in mab right. Note, does not clean out the * old data or set b_node->b_end. * @b_node: the maple_big_node * @shift: the shift count */ static inline void mab_shift_right(struct maple_big_node *b_node, unsigned char shift) { unsigned long size = b_node->b_end * sizeof(unsigned long); memmove(b_node->pivot + shift, b_node->pivot, size); memmove(b_node->slot + shift, b_node->slot, size); if (b_node->type == maple_arange_64) memmove(b_node->gap + shift, b_node->gap, size); } /* * mab_middle_node() - Check if a middle node is needed (unlikely) * @b_node: the maple_big_node that contains the data. * @split: the potential split location * @slot_count: the size that can be stored in a single node being considered. * * Return: true if a middle node is required. */ static inline bool mab_middle_node(struct maple_big_node *b_node, int split, unsigned char slot_count) { unsigned char size = b_node->b_end; if (size >= 2 * slot_count) return true; if (!b_node->slot[split] && (size >= 2 * slot_count - 1)) return true; return false; } /* * mab_no_null_split() - ensure the split doesn't fall on a NULL * @b_node: the maple_big_node with the data * @split: the suggested split location * @slot_count: the number of slots in the node being considered. * * Return: the split location. */ static inline int mab_no_null_split(struct maple_big_node *b_node, unsigned char split, unsigned char slot_count) { if (!b_node->slot[split]) { /* * If the split is less than the max slot && the right side will * still be sufficient, then increment the split on NULL. */ if ((split < slot_count - 1) && (b_node->b_end - split) > (mt_min_slots[b_node->type])) split++; else split--; } return split; } /* * mab_calc_split() - Calculate the split location and if there needs to be two * splits. * @mas: The maple state * @bn: The maple_big_node with the data * @mid_split: The second split, if required. 0 otherwise. * * Return: The first split location. The middle split is set in @mid_split. */ static inline int mab_calc_split(struct ma_state *mas, struct maple_big_node *bn, unsigned char *mid_split) { unsigned char b_end = bn->b_end; int split = b_end / 2; /* Assume equal split. */ unsigned char slot_count = mt_slots[bn->type]; /* * To support gap tracking, all NULL entries are kept together and a node cannot * end on a NULL entry, with the exception of the left-most leaf. The * limitation means that the split of a node must be checked for this condition * and be able to put more data in one direction or the other. */ if (unlikely((mas->mas_flags & MA_STATE_BULK))) { *mid_split = 0; split = b_end - mt_min_slots[bn->type]; if (!ma_is_leaf(bn->type)) return split; mas->mas_flags |= MA_STATE_REBALANCE; if (!bn->slot[split]) split--; return split; } /* * Although extremely rare, it is possible to enter what is known as the 3-way * split scenario. The 3-way split comes about by means of a store of a range * that overwrites the end and beginning of two full nodes. The result is a set * of entries that cannot be stored in 2 nodes. Sometimes, these two nodes can * also be located in different parent nodes which are also full. This can * carry upwards all the way to the root in the worst case. */ if (unlikely(mab_middle_node(bn, split, slot_count))) { split = b_end / 3; *mid_split = split * 2; } else { *mid_split = 0; } /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); if (unlikely(*mid_split)) *mid_split = mab_no_null_split(bn, *mid_split, slot_count); return split; } /* * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node * and set @b_node->b_end to the next free slot. * @mas: The maple state * @mas_start: The starting slot to copy * @mas_end: The end slot to copy (inclusively) * @b_node: The maple_big_node to place the data * @mab_start: The starting location in maple_big_node to store the data. */ static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start, unsigned char mas_end, struct maple_big_node *b_node, unsigned char mab_start) { enum maple_type mt; struct maple_node *node; void __rcu **slots; unsigned long *pivots, *gaps; int i = mas_start, j = mab_start; unsigned char piv_end; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); if (!i) { b_node->pivot[j] = pivots[i++]; if (unlikely(i > mas_end)) goto complete; j++; } piv_end = min(mas_end, mt_pivots[mt]); for (; i < piv_end; i++, j++) { b_node->pivot[j] = pivots[i]; if (unlikely(!b_node->pivot[j])) goto complete; if (unlikely(mas->max == b_node->pivot[j])) goto complete; } b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt); complete: b_node->b_end = ++j; j -= mab_start; slots = ma_slots(node, mt); memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j); if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) { gaps = ma_gaps(node, mt); memcpy(b_node->gap + mab_start, gaps + mas_start, sizeof(unsigned long) * j); } } /* * mas_leaf_set_meta() - Set the metadata of a leaf if possible. * @node: The maple node * @mt: The maple type * @end: The node end */ static inline void mas_leaf_set_meta(struct maple_node *node, enum maple_type mt, unsigned char end) { if (end < mt_slots[mt] - 1) ma_set_meta(node, mt, 0, end); } /* * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node. * @b_node: the maple_big_node that has the data * @mab_start: the start location in @b_node. * @mab_end: The end location in @b_node (inclusively) * @mas: The maple state with the maple encoded node. */ static inline void mab_mas_cp(struct maple_big_node *b_node, unsigned char mab_start, unsigned char mab_end, struct ma_state *mas, bool new_max) { int i, j = 0; enum maple_type mt = mte_node_type(mas->node); struct maple_node *node = mte_to_node(mas->node); void __rcu **slots = ma_slots(node, mt); unsigned long *pivots = ma_pivots(node, mt); unsigned long *gaps = NULL; unsigned char end; if (mab_end - mab_start > mt_pivots[mt]) mab_end--; if (!pivots[mt_pivots[mt] - 1]) slots[mt_pivots[mt]] = NULL; i = mab_start; do { pivots[j++] = b_node->pivot[i++]; } while (i <= mab_end && likely(b_node->pivot[i])); memcpy(slots, b_node->slot + mab_start, sizeof(void *) * (i - mab_start)); if (new_max) mas->max = b_node->pivot[i - 1]; end = j - 1; if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) { unsigned long max_gap = 0; unsigned char offset = 0; gaps = ma_gaps(node, mt); do { gaps[--j] = b_node->gap[--i]; if (gaps[j] > max_gap) { offset = j; max_gap = gaps[j]; } } while (j); ma_set_meta(node, mt, offset, end); } else { mas_leaf_set_meta(node, mt, end); } } /* * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert. * @mas: The maple state * @end: The maple node end * @mt: The maple node type */ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, enum maple_type mt) { if (!(mas->mas_flags & MA_STATE_BULK)) return; if (mte_is_root(mas->node)) return; if (end > mt_min_slots[mt]) { mas->mas_flags &= ~MA_STATE_REBALANCE; return; } } /* * mas_store_b_node() - Store an @entry into the b_node while also copying the * data from a maple encoded node. * @wr_mas: the maple write state * @b_node: the maple_big_node to fill with data * @offset_end: the offset to end copying * * Return: The actual end of the data stored in @b_node */ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char offset_end) { unsigned char slot; unsigned char b_end; /* Possible underflow of piv will wrap back to 0 before use. */ unsigned long piv; struct ma_state *mas = wr_mas->mas; b_node->type = wr_mas->type; b_end = 0; slot = mas->offset; if (slot) { /* Copy start data up to insert. */ mas_mab_cp(mas, 0, slot - 1, b_node, 0); b_end = b_node->b_end; piv = b_node->pivot[b_end - 1]; } else piv = mas->min - 1; if (piv + 1 < mas->index) { /* Handle range starting after old range */ b_node->slot[b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = mas->index - 1 - piv; b_node->pivot[b_end++] = mas->index - 1; } /* Store the new entry. */ mas->offset = b_end; b_node->slot[b_end] = wr_mas->entry; b_node->pivot[b_end] = mas->last; /* Appended. */ if (mas->last >= mas->max) goto b_end; /* Handle new range ending before old range ends */ piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); if (piv > mas->last) { if (piv == ULONG_MAX) mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type); if (offset_end != slot) wr_mas->content = mas_slot_locked(mas, wr_mas->slots, offset_end); b_node->slot[++b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = piv - mas->last + 1; b_node->pivot[b_end] = piv; } slot = offset_end + 1; if (slot > mas->end) goto b_end; /* Copy end data to the end of the node. */ mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end); b_node->b_end--; return; b_end: b_node->b_end = b_end; } /* * mas_prev_sibling() - Find the previous node with the same parent. * @mas: the maple state * * Return: True if there is a previous sibling, false otherwise. */ static inline bool mas_prev_sibling(struct ma_state *mas) { unsigned int p_slot = mte_parent_slot(mas->node); /* For root node, p_slot is set to 0 by mte_parent_slot(). */ if (!p_slot) return false; mas_ascend(mas); mas->offset = p_slot - 1; mas_descend(mas); return true; } /* * mas_next_sibling() - Find the next node with the same parent. * @mas: the maple state * * Return: true if there is a next sibling, false otherwise. */ static inline bool mas_next_sibling(struct ma_state *mas) { MA_STATE(parent, mas->tree, mas->index, mas->last); if (mte_is_root(mas->node)) return false; parent = *mas; mas_ascend(&parent); parent.offset = mte_parent_slot(mas->node) + 1; if (parent.offset > mas_data_end(&parent)) return false; *mas = parent; mas_descend(mas); return true; } /* * mas_node_or_none() - Set the enode and state. * @mas: the maple state * @enode: The encoded maple node. * * Set the node to the enode and the status. */ static inline void mas_node_or_none(struct ma_state *mas, struct maple_enode *enode) { if (enode) { mas->node = enode; mas->status = ma_active; } else { mas->node = NULL; mas->status = ma_none; } } /* * mas_wr_node_walk() - Find the correct offset for the index in the @mas. * If @mas->index cannot be found within the containing * node, we traverse to the last entry in the node. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. */ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char count, offset; if (unlikely(ma_is_dense(wr_mas->type))) { wr_mas->r_max = wr_mas->r_min = mas->index; mas->offset = mas->index = mas->min; return; } wr_mas->node = mas_mn(wr_mas->mas); wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); count = mas->end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, mas->max); offset = mas->offset; while (offset < count && mas->index > wr_mas->pivots[offset]) offset++; wr_mas->r_max = offset < count ? wr_mas->pivots[offset] : mas->max; wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset); wr_mas->offset_end = mas->offset = offset; } /* * mast_rebalance_next() - Rebalance against the next node * @mast: The maple subtree state */ static inline void mast_rebalance_next(struct maple_subtree_state *mast) { unsigned char b_end = mast->bn->b_end; mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node), mast->bn, b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_rebalance_prev() - Rebalance against the previous node * @mast: The maple subtree state */ static inline void mast_rebalance_prev(struct maple_subtree_state *mast) { unsigned char end = mas_data_end(mast->orig_l) + 1; unsigned char b_end = mast->bn->b_end; mab_shift_right(mast->bn, end); mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0); mast->l->min = mast->orig_l->min; mast->orig_l->index = mast->orig_l->min; mast->bn->b_end = end + b_end; mast->l->offset += end; } /* * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring * the node to the right. Checking the nodes to the right then the left at each * level upwards until root is reached. * Data is copied into the @mast->bn. * @mast: The maple_subtree_state. */ static inline bool mast_spanning_rebalance(struct maple_subtree_state *mast) { struct ma_state r_tmp = *mast->orig_r; struct ma_state l_tmp = *mast->orig_l; unsigned char depth = 0; do { mas_ascend(mast->orig_r); mas_ascend(mast->orig_l); depth++; if (mast->orig_r->offset < mas_data_end(mast->orig_r)) { mast->orig_r->offset++; do { mas_descend(mast->orig_r); mast->orig_r->offset = 0; } while (--depth); mast_rebalance_next(mast); *mast->orig_l = l_tmp; return true; } else if (mast->orig_l->offset != 0) { mast->orig_l->offset--; do { mas_descend(mast->orig_l); mast->orig_l->offset = mas_data_end(mast->orig_l); } while (--depth); mast_rebalance_prev(mast); *mast->orig_r = r_tmp; return true; } } while (!mte_is_root(mast->orig_r->node)); *mast->orig_r = r_tmp; *mast->orig_l = l_tmp; return false; } /* * mast_ascend() - Ascend the original left and right maple states. * @mast: the maple subtree state. * * Ascend the original left and right sides. Set the offsets to point to the * data already in the new tree (@mast->l and @mast->r). */ static inline void mast_ascend(struct maple_subtree_state *mast) { MA_WR_STATE(wr_mas, mast->orig_r, NULL); mas_ascend(mast->orig_l); mas_ascend(mast->orig_r); mast->orig_r->offset = 0; mast->orig_r->index = mast->r->max; /* last should be larger than or equal to index */ if (mast->orig_r->last < mast->orig_r->index) mast->orig_r->last = mast->orig_r->index; wr_mas.type = mte_node_type(mast->orig_r->node); mas_wr_node_walk(&wr_mas); /* Set up the left side of things */ mast->orig_l->offset = 0; mast->orig_l->index = mast->l->min; wr_mas.mas = mast->orig_l; wr_mas.type = mte_node_type(mast->orig_l->node); mas_wr_node_walk(&wr_mas); mast->bn->type = wr_mas.type; } /* * mas_new_ma_node() - Create and return a new maple node. Helper function. * @mas: the maple state with the allocations. * @b_node: the maple_big_node with the type encoding. * * Use the node type from the maple_big_node to allocate a new node from the * ma_state. This function exists mainly for code readability. * * Return: A new maple encoded node */ static inline struct maple_enode *mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node) { return mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), b_node->type); } /* * mas_mab_to_node() - Set up right and middle nodes * * @mas: the maple state that contains the allocations. * @b_node: the node which contains the data. * @left: The pointer which will have the left node * @right: The pointer which may have the right node * @middle: the pointer which may have the middle node (rare) * @mid_split: the split location for the middle node * * Return: the split of left. */ static inline unsigned char mas_mab_to_node(struct ma_state *mas, struct maple_big_node *b_node, struct maple_enode **left, struct maple_enode **right, struct maple_enode **middle, unsigned char *mid_split) { unsigned char split = 0; unsigned char slot_count = mt_slots[b_node->type]; *left = mas_new_ma_node(mas, b_node); *right = NULL; *middle = NULL; *mid_split = 0; if (b_node->b_end < slot_count) { split = b_node->b_end; } else { split = mab_calc_split(mas, b_node, mid_split); *right = mas_new_ma_node(mas, b_node); } if (*mid_split) *middle = mas_new_ma_node(mas, b_node); return split; } /* * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end * pointer. * @b_node: the big node to add the entry * @mas: the maple state to get the pivot (mas->max) * @entry: the entry to add, if NULL nothing happens. */ static inline void mab_set_b_end(struct maple_big_node *b_node, struct ma_state *mas, void *entry) { if (!entry) return; b_node->slot[b_node->b_end] = entry; if (mt_is_alloc(mas->tree)) b_node->gap[b_node->b_end] = mas_max_gap(mas); b_node->pivot[b_node->b_end++] = mas->max; } /* * mas_set_split_parent() - combine_then_separate helper function. Sets the parent * of @mas->node to either @left or @right, depending on @slot and @split * * @mas: the maple state with the node that needs a parent * @left: possible parent 1 * @right: possible parent 2 * @slot: the slot the mas->node was placed * @split: the split location between @left and @right */ static inline void mas_set_split_parent(struct ma_state *mas, struct maple_enode *left, struct maple_enode *right, unsigned char *slot, unsigned char split) { if (mas_is_none(mas)) return; if ((*slot) <= split) mas_set_parent(mas, mas->node, left, *slot); else if (right) mas_set_parent(mas, mas->node, right, (*slot) - split - 1); (*slot)++; } /* * mte_mid_split_check() - Check if the next node passes the mid-split * @l: Pointer to left encoded maple node. * @m: Pointer to middle encoded maple node. * @r: Pointer to right encoded maple node. * @slot: The offset * @split: The split location. * @mid_split: The middle split. */ static inline void mte_mid_split_check(struct maple_enode **l, struct maple_enode **r, struct maple_enode *right, unsigned char slot, unsigned char *split, unsigned char mid_split) { if (*r == right) return; if (slot < mid_split) return; *l = *r; *r = right; *split = mid_split; } /* * mast_set_split_parents() - Helper function to set three nodes parents. Slot * is taken from @mast->l. * @mast: the maple subtree state * @left: the left node * @right: the right node * @split: the split location. */ static inline void mast_set_split_parents(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { unsigned char slot; struct maple_enode *l = left; struct maple_enode *r = right; if (mas_is_none(mast->l)) return; if (middle) r = middle; slot = mast->l->offset; mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->l, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->m, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->r, l, r, &slot, split); } /* * mas_topiary_node() - Dispose of a single node * @mas: The maple state for pushing nodes * @in_rcu: If the tree is in rcu mode * * The node will either be RCU freed or pushed back on the maple state. */ static inline void mas_topiary_node(struct ma_state *mas, struct ma_state *tmp_mas, bool in_rcu) { struct maple_node *tmp; struct maple_enode *enode; if (mas_is_none(tmp_mas)) return; enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); if (in_rcu) ma_free_rcu(tmp); else mas_push_node(mas, tmp); } /* * mas_topiary_replace() - Replace the data with new data, then repair the * parent links within the new tree. Iterate over the dead sub-tree and collect * the dead subtrees and topiary the nodes that are no longer of use. * * The new tree will have up to three children with the correct parent. Keep * track of the new entries as they need to be followed to find the next level * of new entries. * * The old tree will have up to three children with the old parent. Keep track * of the old entries as they may have more nodes below replaced. Nodes within * [index, last] are dead subtrees, others need to be freed and followed. * * @mas: The maple state pointing at the new data * @old_enode: The maple encoded node being replaced * */ static inline void mas_topiary_replace(struct ma_state *mas, struct maple_enode *old_enode) { struct ma_state tmp[3], tmp_next[3]; MA_TOPIARY(subtrees, mas->tree); bool in_rcu; int i, n; /* Place data in tree & then mark node as old */ mas_put_in_tree(mas, old_enode); /* Update the parent pointers in the tree */ tmp[0] = *mas; tmp[0].offset = 0; tmp[1].status = ma_none; tmp[2].status = ma_none; while (!mte_is_leaf(tmp[0].node)) { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; n++; } mas_adopt_children(&tmp[i], tmp[i].node); } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) tmp[i] = tmp_next[i]; } /* Collect the old nodes that need to be discarded */ if (mte_is_leaf(old_enode)) return mas_free(mas, old_enode); tmp[0] = *mas; tmp[0].offset = 0; tmp[0].node = old_enode; tmp[1].status = ma_none; tmp[2].status = ma_none; in_rcu = mt_in_rcu(mas->tree); do { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; if ((tmp_next[n].min >= tmp_next->index) && (tmp_next[n].max <= tmp_next->last)) { mat_add(&subtrees, tmp_next[n].node); tmp_next[n].status = ma_none; } else { n++; } } } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) { mas_topiary_node(mas, &tmp[i], in_rcu); tmp[i] = tmp_next[i]; } } while (!mte_is_leaf(tmp[0].node)); for (i = 0; i < 3; i++) mas_topiary_node(mas, &tmp[i], in_rcu); mas_mat_destroy(mas, &subtrees); } /* * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state * @old_enode: The old maple encoded node that is being replaced. * * Updates gap as necessary. */ static inline void mas_wmb_replace(struct ma_state *mas, struct maple_enode *old_enode) { /* Insert the new data in the tree */ mas_topiary_replace(mas, old_enode); if (mte_is_leaf(mas->node)) return; mas_update_gap(mas); } /* * mast_cp_to_nodes() - Copy data out to nodes. * @mast: The maple subtree state * @left: The left encoded maple node * @middle: The middle encoded maple node * @right: The right encoded maple node * @split: The location to split between left and (middle ? middle : right) * @mid_split: The location to split between middle and right. */ static inline void mast_cp_to_nodes(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { bool new_lmax = true; mas_node_or_none(mast->l, left); mas_node_or_none(mast->m, middle); mas_node_or_none(mast->r, right); mast->l->min = mast->orig_l->min; if (split == mast->bn->b_end) { mast->l->max = mast->orig_r->max; new_lmax = false; } mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax); if (middle) { mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true); mast->m->min = mast->bn->pivot[split] + 1; split = mid_split; } mast->r->max = mast->orig_r->max; if (right) { mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false); mast->r->min = mast->bn->pivot[split] + 1; } } /* * mast_combine_cp_left - Copy in the original left side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_left(struct maple_subtree_state *mast) { unsigned char l_slot = mast->orig_l->offset; if (!l_slot) return; mas_mab_cp(mast->orig_l, 0, l_slot - 1, mast->bn, 0); } /* * mast_combine_cp_right: Copy in the original right side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_right(struct maple_subtree_state *mast) { if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max) return; mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1, mt_slot_count(mast->orig_r->node), mast->bn, mast->bn->b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_sufficient: Check if the maple subtree state has enough data in the big * node to create at least one sufficient node * @mast: the maple subtree state */ static inline bool mast_sufficient(struct maple_subtree_state *mast) { if (mast->bn->b_end > mt_min_slot_count(mast->orig_l->node)) return true; return false; } /* * mast_overflow: Check if there is too much data in the subtree state for a * single node. * @mast: The maple subtree state */ static inline bool mast_overflow(struct maple_subtree_state *mast) { if (mast->bn->b_end >= mt_slot_count(mast->orig_l->node)) return true; return false; } static inline void *mtree_range_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next, *last; enum maple_type type; void __rcu **slots; unsigned char end; unsigned long max, min; unsigned long prev_max, prev_min; next = mas->node; min = mas->min; max = mas->max; do { last = next; node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = ma_data_end(node, type, pivots, max); prev_min = min; prev_max = max; if (pivots[0] >= mas->index) { offset = 0; max = pivots[0]; goto next; } offset = 1; while (offset < end) { if (pivots[offset] >= mas->index) { max = pivots[offset]; break; } offset++; } min = pivots[offset - 1] + 1; next: slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); mas->end = end; mas->offset = offset; mas->index = min; mas->last = max; mas->min = prev_min; mas->max = prev_max; mas->node = last; return (void *)next; dead_node: mas_reset(mas); return NULL; } /* * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers. * @mas: The starting maple state * @mast: The maple_subtree_state, keeps track of 4 maple states. * @count: The estimated count of iterations needed. * * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root * is hit. First @b_node is split into two entries which are inserted into the * next iteration of the loop. @b_node is returned populated with the final * iteration. @mas is used to obtain allocations. orig_l_mas keeps track of the * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last * to account of what has been copied into the new sub-tree. The update of * orig_l_mas->last is used in mas_consume to find the slots that will need to * be either freed or destroyed. orig_l_mas->depth keeps track of the height of * the new sub-tree in case the sub-tree becomes the full tree. */ static void mas_spanning_rebalance(struct ma_state *mas, struct maple_subtree_state *mast, unsigned char count) { unsigned char split, mid_split; unsigned char slot = 0; struct maple_enode *left = NULL, *middle = NULL, *right = NULL; struct maple_enode *old_enode; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); /* * The tree needs to be rebalanced and leaves need to be kept at the same level. * Rebalancing is done by use of the ``struct maple_topiary``. */ mast->l = &l_mas; mast->m = &m_mas; mast->r = &r_mas; l_mas.status = r_mas.status = m_mas.status = ma_none; /* Check if this is not root and has sufficient data. */ if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) mast_spanning_rebalance(mast); l_mas.depth = 0; /* * Each level of the tree is examined and balanced, pushing data to the left or * right, or rebalancing against left or right nodes is employed to avoid * rippling up the tree to limit the amount of churn. Once a new sub-section of * the tree is created, there may be a mix of new and old nodes. The old nodes * will have the incorrect parent pointers and currently be in two trees: the * original tree and the partially new tree. To remedy the parent pointers in * the old tree, the new data is swapped into the active tree and a walk down * the tree is performed and the parent pointers are updated. * See mas_topiary_replace() for more information. */ while (count--) { mast->bn->b_end--; mast->bn->type = mte_node_type(mast->orig_l->node); split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle, &mid_split); mast_set_split_parents(mast, left, middle, right, split, mid_split); mast_cp_to_nodes(mast, left, middle, right, split, mid_split); /* * Copy data from next level in the tree to mast->bn from next * iteration */ memset(mast->bn, 0, sizeof(struct maple_big_node)); mast->bn->type = mte_node_type(left); l_mas.depth++; /* Root already stored in l->node. */ if (mas_is_root_limits(mast->l)) goto new_root; mast_ascend(mast); mast_combine_cp_left(mast); l_mas.offset = mast->bn->b_end; mab_set_b_end(mast->bn, &l_mas, left); mab_set_b_end(mast->bn, &m_mas, middle); mab_set_b_end(mast->bn, &r_mas, right); /* Copy anything necessary out of the right node. */ mast_combine_cp_right(mast); mast->orig_l->last = mast->orig_l->max; if (mast_sufficient(mast)) continue; if (mast_overflow(mast)) continue; /* May be a new root stored in mast->bn */ if (mas_is_root_limits(mast->orig_l)) break; mast_spanning_rebalance(mast); /* rebalancing from other nodes may require another loop. */ if (!count) count++; } l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); l_mas.depth++; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); mas_set_parent(mas, left, l_mas.node, slot); if (middle) mas_set_parent(mas, middle, l_mas.node, ++slot); if (right) mas_set_parent(mas, right, l_mas.node, ++slot); if (mas_is_root_limits(mast->l)) { new_root: mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); while (!mte_is_root(mast->orig_l->node)) mast_ascend(mast); } else { mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent; } old_enode = mast->orig_l->node; mas->depth = l_mas.depth; mas->node = l_mas.node; mas->min = l_mas.min; mas->max = l_mas.max; mas->offset = l_mas.offset; mas_wmb_replace(mas, old_enode); mtree_range_walk(mas); return; } /* * mas_rebalance() - Rebalance a given node. * @mas: The maple state * @b_node: The big maple node. * * Rebalance two nodes into a single node or two new nodes that are sufficient. * Continue upwards until tree is sufficient. */ static inline void mas_rebalance(struct ma_state *mas, struct maple_big_node *b_node) { char empty_count = mas_mt_height(mas); struct maple_subtree_state mast; unsigned char shift, b_end = ++b_node->b_end; MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); /* * Rebalancing occurs if a node is insufficient. Data is rebalanced * against the node to the right if it exists, otherwise the node to the * left of this node is rebalanced against this node. If rebalancing * causes just one node to be produced instead of two, then the parent * is also examined and rebalanced if it is insufficient. Every level * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ mast.orig_l = &l_mas; mast.orig_r = &r_mas; mast.bn = b_node; mast.bn->type = mte_node_type(mas->node); l_mas = r_mas = *mas; if (mas_next_sibling(&r_mas)) { mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node, b_end); r_mas.last = r_mas.index = r_mas.max; } else { mas_prev_sibling(&l_mas); shift = mas_data_end(&l_mas) + 1; mab_shift_right(b_node, shift); mas->offset += shift; mas_mab_cp(&l_mas, 0, shift - 1, b_node, 0); b_node->b_end = shift + b_end; l_mas.index = l_mas.last = l_mas.min; } return mas_spanning_rebalance(mas, &mast, empty_count); } /* * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple * state. * @mas: The maple state * @end: The end of the left-most node. * * During a mass-insert event (such as forking), it may be necessary to * rebalance the left-most node when it is not sufficient. */ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end) { enum maple_type mt = mte_node_type(mas->node); struct maple_node reuse, *newnode, *parent, *new_left, *left, *node; struct maple_enode *eparent, *old_eparent; unsigned char offset, tmp, split = mt_slots[mt] / 2; void __rcu **l_slots, **slots; unsigned long *l_pivs, *pivs, gap; bool in_rcu = mt_in_rcu(mas->tree); MA_STATE(l_mas, mas->tree, mas->index, mas->last); l_mas = *mas; mas_prev_sibling(&l_mas); /* set up node. */ if (in_rcu) { newnode = mas_pop_node(mas); } else { newnode = &reuse; } node = mas_mn(mas); newnode->parent = node->parent; slots = ma_slots(newnode, mt); pivs = ma_pivots(newnode, mt); left = mas_mn(&l_mas); l_slots = ma_slots(left, mt); l_pivs = ma_pivots(left, mt); if (!l_slots[split]) split++; tmp = mas_data_end(&l_mas) - split; memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp); memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp); pivs[tmp] = l_mas.max; memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end); memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end); l_mas.max = l_pivs[split]; mas->min = l_mas.max + 1; old_eparent = mt_mk_node(mte_parent(l_mas.node), mas_parent_type(&l_mas, l_mas.node)); tmp += end; if (!in_rcu) { unsigned char max_p = mt_pivots[mt]; unsigned char max_s = mt_slots[mt]; if (tmp < max_p) memset(pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); if (tmp < mt_slots[mt]) memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp)); memcpy(node, newnode, sizeof(struct maple_node)); ma_set_meta(node, mt, 0, tmp - 1); mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node), l_pivs[split]); /* Remove data from l_pivs. */ tmp = split + 1; memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp)); ma_set_meta(left, mt, 0, split); eparent = old_eparent; goto done; } /* RCU requires replacing both l_mas, mas, and parent. */ mas->node = mt_mk_node(newnode, mt); ma_set_meta(newnode, mt, 0, tmp); new_left = mas_pop_node(mas); new_left->parent = left->parent; mt = mte_node_type(l_mas.node); slots = ma_slots(new_left, mt); pivs = ma_pivots(new_left, mt); memcpy(slots, l_slots, sizeof(void *) * split); memcpy(pivs, l_pivs, sizeof(unsigned long) * split); ma_set_meta(new_left, mt, 0, split); l_mas.node = mt_mk_node(new_left, mt); /* replace parent. */ offset = mte_parent_slot(mas->node); mt = mas_parent_type(&l_mas, l_mas.node); parent = mas_pop_node(mas); slots = ma_slots(parent, mt); pivs = ma_pivots(parent, mt); memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node)); rcu_assign_pointer(slots[offset], mas->node); rcu_assign_pointer(slots[offset - 1], l_mas.node); pivs[offset - 1] = l_mas.max; eparent = mt_mk_node(parent, mt); done: gap = mas_leaf_max_gap(mas); mte_set_gap(eparent, mte_parent_slot(mas->node), gap); gap = mas_leaf_max_gap(&l_mas); mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap); mas_ascend(mas); if (in_rcu) { mas_replace_node(mas, old_eparent); mas_adopt_children(mas, mas->node); } mas_update_gap(mas); } /* * mas_split_final_node() - Split the final node in a subtree operation. * @mast: the maple subtree state * @mas: The maple state * @height: The height of the tree in case it's a new root. */ static inline void mas_split_final_node(struct maple_subtree_state *mast, struct ma_state *mas, int height) { struct maple_enode *ancestor; if (mte_is_root(mas->node)) { if (mt_is_alloc(mas->tree)) mast->bn->type = maple_arange_64; else mast->bn->type = maple_range_64; mas->depth = height; } /* * Only a single node is used here, could be root. * The Big_node data should just fit in a single node. */ ancestor = mas_new_ma_node(mas, mast->bn); mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset); mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset); mte_to_node(ancestor)->parent = mas_mn(mas)->parent; mast->l->node = ancestor; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true); mas->offset = mast->bn->b_end - 1; } /* * mast_fill_bnode() - Copy data into the big node in the subtree state * @mast: The maple subtree state * @mas: the maple state * @skip: The number of entries to skip for new nodes insertion. */ static inline void mast_fill_bnode(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char skip) { bool cp = true; unsigned char split; memset(mast->bn, 0, sizeof(struct maple_big_node)); if (mte_is_root(mas->node)) { cp = false; } else { mas_ascend(mas); mas->offset = mte_parent_slot(mas->node); } if (cp && mast->l->offset) mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0); split = mast->bn->b_end; mab_set_b_end(mast->bn, mast->l, mast->l->node); mast->r->offset = mast->bn->b_end; mab_set_b_end(mast->bn, mast->r, mast->r->node); if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max) cp = false; if (cp) mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1, mast->bn, mast->bn->b_end); mast->bn->b_end--; mast->bn->type = mte_node_type(mas->node); } /* * mast_split_data() - Split the data in the subtree state big node into regular * nodes. * @mast: The maple subtree state * @mas: The maple state * @split: The location to split the big node */ static inline void mast_split_data(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char split) { unsigned char p_slot; mab_mas_cp(mast->bn, 0, split, mast->l, true); mte_set_pivot(mast->r->node, 0, mast->r->max); mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false); mast->l->offset = mte_parent_slot(mas->node); mast->l->max = mast->bn->pivot[split]; mast->r->min = mast->l->max + 1; if (mte_is_leaf(mas->node)) return; p_slot = mast->orig_l->offset; mas_set_split_parent(mast->orig_l, mast->l->node, mast->r->node, &p_slot, split); mas_set_split_parent(mast->orig_r, mast->l->node, mast->r->node, &p_slot, split); } /* * mas_push_data() - Instead of splitting a node, it is beneficial to push the * data to the right or left node if there is room. * @mas: The maple state * @height: The current height of the maple state * @mast: The maple subtree state * @left: Push left or not. * * Keeping the height of the tree low means faster lookups. * * Return: True if pushed, false otherwise. */ static inline bool mas_push_data(struct ma_state *mas, int height, struct maple_subtree_state *mast, bool left) { unsigned char slot_total = mast->bn->b_end; unsigned char end, space, split; MA_STATE(tmp_mas, mas->tree, mas->index, mas->last); tmp_mas = *mas; tmp_mas.depth = mast->l->depth; if (left && !mas_prev_sibling(&tmp_mas)) return false; else if (!left && !mas_next_sibling(&tmp_mas)) return false; end = mas_data_end(&tmp_mas); slot_total += end; space = 2 * mt_slot_count(mas->node) - 2; /* -2 instead of -1 to ensure there isn't a triple split */ if (ma_is_leaf(mast->bn->type)) space--; if (mas->max == ULONG_MAX) space--; if (slot_total >= space) return false; /* Get the data; Fill mast->bn */ mast->bn->b_end++; if (left) { mab_shift_right(mast->bn, end + 1); mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0); mast->bn->b_end = slot_total + 1; } else { mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end); } /* Configure mast for splitting of mast->bn */ split = mt_slots[mast->bn->type] - 2; if (left) { /* Switch mas to prev node */ *mas = tmp_mas; /* Start using mast->l for the left side. */ tmp_mas.node = mast->l->node; *mast->l = tmp_mas; } else { tmp_mas.node = mast->r->node; *mast->r = tmp_mas; split = slot_total - split; } split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]); /* Update parent slot for split calculation. */ if (left) mast->orig_l->offset += end + 1; mast_split_data(mast, mas, split); mast_fill_bnode(mast, mas, 2); mas_split_final_node(mast, mas, height + 1); return true; } /* * mas_split() - Split data that is too big for one node into two. * @mas: The maple state * @b_node: The maple big node */ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) { struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; struct maple_enode *old; /* * Splitting is handled differently from any other B-tree; the Maple * Tree splits upwards. Splitting up means that the split operation * occurs when the walk of the tree hits the leaves and not on the way * down. The reason for splitting up is that it is impossible to know * how much space will be needed until the leaf is (or leaves are) * reached. Since overwriting data is allowed and a range could * overwrite more than one range or result in changing one entry into 3 * entries, it is impossible to know if a split is required until the * data is examined. * * Splitting is a balancing act between keeping allocations to a minimum * and avoiding a 'jitter' event where a tree is expanded to make room * for an entry followed by a contraction when the entry is removed. To * accomplish the balance, there are empty slots remaining in both left * and right nodes after a split. */ MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); mas->depth = mas_mt_height(mas); mast.l = &l_mas; mast.r = &r_mas; mast.orig_l = &prev_l_mas; mast.orig_r = &prev_r_mas; mast.bn = b_node; while (height++ <= mas->depth) { if (mt_slots[b_node->type] > b_node->b_end) { mas_split_final_node(&mast, mas, height); break; } l_mas = r_mas = *mas; l_mas.node = mas_new_ma_node(mas, b_node); r_mas.node = mas_new_ma_node(mas, b_node); /* * Another way that 'jitter' is avoided is to terminate a split up early if the * left or right node has space to spare. This is referred to as "pushing left" * or "pushing right" and is similar to the B* tree, except the nodes left or * right can rarely be reused due to RCU, but the ripple upwards is halted which * is a significant savings. */ /* Try to push left. */ if (mas_push_data(mas, height, &mast, true)) break; /* Try to push right. */ if (mas_push_data(mas, height, &mast, false)) break; split = mab_calc_split(mas, b_node, &mid_split); mast_split_data(&mast, mas, split); /* * Usually correct, mab_mas_cp in the above call overwrites * r->max. */ mast.r->max = mas->max; mast_fill_bnode(&mast, mas, 1); prev_l_mas = *mast.l; prev_r_mas = *mast.r; } /* Set the original node as dead */ old = mas->node; mas->node = l_mas.node; mas_wmb_replace(mas, old); mtree_range_walk(mas); return; } /* * mas_commit_b_node() - Commit the big node into the tree. * @wr_mas: The maple write state * @b_node: The maple big node */ static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node) { enum store_type type = wr_mas->mas->store_type; WARN_ON_ONCE(type != wr_rebalance && type != wr_split_store); if (type == wr_rebalance) return mas_rebalance(wr_mas->mas, b_node); return mas_split(wr_mas->mas, b_node); } /* * mas_root_expand() - Expand a root to a node * @mas: The maple state * @entry: The entry to store into the tree */ static inline void mas_root_expand(struct ma_state *mas, void *entry) { void *contents = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; int slot = 0; node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; if (mas->index) { if (contents) { rcu_assign_pointer(slots[slot], contents); if (likely(mas->index > 1)) slot++; } pivots[slot++] = mas->index - 1; } rcu_assign_pointer(slots[slot], entry); mas->offset = slot; pivots[slot] = mas->last; if (mas->last != ULONG_MAX) pivots[++slot] = ULONG_MAX; mas->depth = 1; mas_set_height(mas); ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); return; } /* * mas_store_root() - Storing value into root. * @mas: The maple state * @entry: The entry to store. * * There is no root node now and we are storing a value into the root - this * function either assigns the pointer or expands into a node. */ static inline void mas_store_root(struct ma_state *mas, void *entry) { if (!entry) { if (!mas->index) rcu_assign_pointer(mas->tree->ma_root, NULL); } else if (likely((mas->last != 0) || (mas->index != 0))) mas_root_expand(mas, entry); else if (((unsigned long) (entry) & 3) == 2) mas_root_expand(mas, entry); else { rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; } } /* * mas_is_span_wr() - Check if the write needs to be treated as a write that * spans the node. * @wr_mas: The maple write state * * Spanning writes are writes that start in one node and end in another OR if * the write of a %NULL will cause the node to end with a %NULL. * * Return: True if this is a spanning write, false otherwise. */ static bool mas_is_span_wr(struct ma_wr_state *wr_mas) { unsigned long max = wr_mas->r_max; unsigned long last = wr_mas->mas->last; enum maple_type type = wr_mas->type; void *entry = wr_mas->entry; /* Contained in this pivot, fast path */ if (last < max) return false; if (ma_is_leaf(type)) { max = wr_mas->mas->max; if (last < max) return false; } if (last == max) { /* * The last entry of leaf node cannot be NULL unless it is the * rightmost node (writing ULONG_MAX), otherwise it spans slots. */ if (entry || last == ULONG_MAX) return false; } trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry); return true; } static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas) { wr_mas->type = mte_node_type(wr_mas->mas->node); mas_wr_node_walk(wr_mas); wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type); } static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas) { wr_mas->mas->max = wr_mas->r_max; wr_mas->mas->min = wr_mas->r_min; wr_mas->mas->node = wr_mas->content; wr_mas->mas->offset = 0; wr_mas->mas->depth++; } /* * mas_wr_walk() - Walk the tree for a write. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. * * Return: True if it's contained in a node, false on spanning write. */ static bool mas_wr_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); if (unlikely(mas_is_span_wr(wr_mas))) return false; wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return true; mas_wr_walk_traverse(wr_mas); } return true; } static void mas_wr_walk_index(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return; mas_wr_walk_traverse(wr_mas); } } /* * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs. * @l_wr_mas: The left maple write state * @r_wr_mas: The right maple write state */ static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { struct ma_state *r_mas = r_wr_mas->mas; struct ma_state *l_mas = l_wr_mas->mas; unsigned char l_slot; l_slot = l_mas->offset; if (!l_wr_mas->content) l_mas->index = l_wr_mas->r_min; if ((l_mas->index == l_wr_mas->r_min) && (l_slot && !mas_slot_locked(l_mas, l_wr_mas->slots, l_slot - 1))) { if (l_slot > 1) l_mas->index = l_wr_mas->pivots[l_slot - 2] + 1; else l_mas->index = l_mas->min; l_mas->offset = l_slot - 1; } if (!r_wr_mas->content) { if (r_mas->last < r_wr_mas->r_max) r_mas->last = r_wr_mas->r_max; r_mas->offset++; } else if ((r_mas->last == r_wr_mas->r_max) && (r_mas->last < r_mas->max) && !mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1)) { r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots, r_wr_mas->type, r_mas->offset + 1); r_mas->offset++; } } static inline void *mas_state_walk(struct ma_state *mas) { void *entry; entry = mas_start(mas); if (mas_is_none(mas)) return NULL; if (mas_is_ptr(mas)) return entry; return mtree_range_walk(mas); } /* * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up * to date. * * @mas: The maple state. * * Note: Leaves mas in undesirable state. * Return: The entry for @mas->index or %NULL on dead node. */ static inline void *mtree_lookup_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next; enum maple_type type; void __rcu **slots; unsigned char end; next = mas->node; do { node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = mt_pivots[type]; offset = 0; do { if (pivots[offset] >= mas->index) break; } while (++offset < end); slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); return (void *)next; dead_node: mas_reset(mas); return NULL; } static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); /* * mas_new_root() - Create a new root node that only contains the entry passed * in. * @mas: The maple state * @entry: The entry to store. * * Only valid when the index == 0 and the last == ULONG_MAX */ static inline void mas_new_root(struct ma_state *mas, void *entry) { struct maple_enode *root = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX); if (!entry) { mas->depth = 0; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; goto done; } node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; mas->depth = 1; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); done: if (xa_is_node(root)) mte_destroy_walk(root, mas->tree); return; } /* * mas_wr_spanning_store() - Create a subtree with the store operation completed * and new nodes where necessary, then place the sub-tree in the actual tree. * Note that mas is expected to point to the node which caused the store to * span. * @wr_mas: The maple write state */ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_subtree_state mast; struct maple_big_node b_node; struct ma_state *mas; unsigned char height; /* Left and Right side of spanning store */ MA_STATE(l_mas, NULL, 0, 0); MA_STATE(r_mas, NULL, 0, 0); MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry); MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry); /* * A store operation that spans multiple nodes is called a spanning * store and is handled early in the store call stack by the function * mas_is_span_wr(). When a spanning store is identified, the maple * state is duplicated. The first maple state walks the left tree path * to ``index``, the duplicate walks the right tree path to ``last``. * The data in the two nodes are combined into a single node, two nodes, * or possibly three nodes (see the 3-way split above). A ``NULL`` * written to the last entry of a node is considered a spanning store as * a rebalance is required for the operation to complete and an overflow * of data may happen. */ mas = wr_mas->mas; trace_ma_op(__func__, mas); if (unlikely(!mas->index && mas->last == ULONG_MAX)) return mas_new_root(mas, wr_mas->entry); /* * Node rebalancing may occur due to this store, so there may be three new * entries per level plus a new root. */ height = mas_mt_height(mas); /* * Set up right side. Need to get to the next offset after the spanning * store to ensure it's not NULL and to combine both the next node and * the node with the start together. */ r_mas = *mas; /* Avoid overflow, walk to next slot in the tree. */ if (r_mas.last + 1) r_mas.last++; r_mas.index = r_mas.last; mas_wr_walk_index(&r_wr_mas); r_mas.last = r_mas.index = mas->last; /* Set up left side. */ l_mas = *mas; mas_wr_walk_index(&l_wr_mas); if (!wr_mas->entry) { mas_extend_spanning_null(&l_wr_mas, &r_wr_mas); mas->offset = l_mas.offset; mas->index = l_mas.index; mas->last = l_mas.last = r_mas.last; } /* expanding NULLs may make this cover the entire range */ if (!l_mas.index && r_mas.last == ULONG_MAX) { mas_set_range(mas, 0, ULONG_MAX); return mas_new_root(mas, wr_mas->entry); } memset(&b_node, 0, sizeof(struct maple_big_node)); /* Copy l_mas and store the value in b_node. */ mas_store_b_node(&l_wr_mas, &b_node, l_mas.end); /* Copy r_mas into b_node if there is anything to copy. */ if (r_mas.max > r_mas.last) mas_mab_cp(&r_mas, r_mas.offset, r_mas.end, &b_node, b_node.b_end + 1); else b_node.b_end++; /* Stop spanning searches by searching for just index. */ l_mas.index = l_mas.last = mas->index; mast.bn = &b_node; mast.orig_l = &l_mas; mast.orig_r = &r_mas; /* Combine l_mas and r_mas and split them up evenly again. */ return mas_spanning_rebalance(mas, &mast, height + 1); } /* * mas_wr_node_store() - Attempt to store the value in a node * @wr_mas: The maple write state * * Attempts to reuse the node, but may allocate. */ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **dst_slots; unsigned long *dst_pivots; unsigned char dst_offset, offset_end = wr_mas->offset_end; struct maple_node reuse, *newnode; unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type]; bool in_rcu = mt_in_rcu(mas->tree); if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ else if (unlikely(wr_mas->r_max == ULONG_MAX)) mas_bulk_rebalance(mas, mas->end, wr_mas->type); /* set up node. */ if (in_rcu) { newnode = mas_pop_node(mas); } else { memset(&reuse, 0, sizeof(struct maple_node)); newnode = &reuse; } newnode->parent = mas_mn(mas)->parent; dst_pivots = ma_pivots(newnode, wr_mas->type); dst_slots = ma_slots(newnode, wr_mas->type); /* Copy from start to insert point */ memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset); memcpy(dst_slots, wr_mas->slots, sizeof(void *) * mas->offset); /* Handle insert of new range starting after old range */ if (wr_mas->r_min < mas->index) { rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content); dst_pivots[mas->offset++] = mas->index - 1; } /* Store the new entry and range end. */ if (mas->offset < node_pivots) dst_pivots[mas->offset] = mas->last; rcu_assign_pointer(dst_slots[mas->offset], wr_mas->entry); /* * this range wrote to the end of the node or it overwrote the rest of * the data */ if (offset_end > mas->end) goto done; dst_offset = mas->offset + 1; /* Copy to the end of node if necessary. */ copy_size = mas->end - offset_end + 1; memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end, sizeof(void *) * copy_size); memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end, sizeof(unsigned long) * (copy_size - 1)); if (new_end < node_pivots) dst_pivots[new_end] = mas->max; done: mas_leaf_set_meta(newnode, maple_leaf_64, new_end); if (in_rcu) { struct maple_enode *old_enode = mas->node; mas->node = mt_mk_node(newnode, wr_mas->type); mas_replace_node(mas, old_enode); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } trace_ma_write(__func__, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; return; } /* * mas_wr_slot_store: Attempt to store a value in a slot. * @wr_mas: the maple write state */ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; void __rcu **slots = wr_mas->slots; bool gap = false; gap |= !mt_slot_locked(mas->tree, slots, offset); gap |= !mt_slot_locked(mas->tree, slots, offset + 1); if (wr_mas->offset_end - offset == 1) { if (mas->index == wr_mas->r_min) { /* Overwriting the range and a part of the next one */ rcu_assign_pointer(slots[offset], wr_mas->entry); wr_mas->pivots[offset] = mas->last; } else { /* Overwriting a part of the range and the next one */ rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; mas->offset++; /* Keep mas accurate. */ } } else { WARN_ON_ONCE(mt_in_rcu(mas->tree)); /* * Expand the range, only partially overwriting the previous and * next ranges */ gap |= !mt_slot_locked(mas->tree, slots, offset + 2); rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ } trace_ma_write(__func__, mas, 0, wr_mas->entry); /* * Only update gap when the new entry is empty or there is an empty * entry in the original two ranges. */ if (!wr_mas->entry || gap) mas_update_gap(mas); return; } static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!wr_mas->slots[wr_mas->offset_end]) { /* If this one is null, the next and prev are not */ mas->last = wr_mas->end_piv; } else { /* Check next slot(s) if we are overwriting the end */ if ((mas->last == wr_mas->end_piv) && (mas->end != wr_mas->offset_end) && !wr_mas->slots[wr_mas->offset_end + 1]) { wr_mas->offset_end++; if (wr_mas->offset_end == mas->end) mas->last = mas->max; else mas->last = wr_mas->pivots[wr_mas->offset_end]; wr_mas->end_piv = mas->last; } } if (!wr_mas->content) { /* If this one is null, the next and prev are not */ mas->index = wr_mas->r_min; } else { /* Check prev slot if we are overwriting the start */ if (mas->index == wr_mas->r_min && mas->offset && !wr_mas->slots[mas->offset - 1]) { mas->offset--; wr_mas->r_min = mas->index = mas_safe_min(mas, wr_mas->pivots, mas->offset); wr_mas->r_max = wr_mas->pivots[mas->offset]; } } } static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) { while ((wr_mas->offset_end < wr_mas->mas->end) && (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) wr_mas->offset_end++; if (wr_mas->offset_end < wr_mas->mas->end) wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; else wr_mas->end_piv = wr_mas->mas->max; } static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas->end + 2; new_end -= wr_mas->offset_end - mas->offset; if (wr_mas->r_min == mas->index) new_end--; if (wr_mas->end_piv == mas->last) new_end--; return new_end; } /* * mas_wr_append: Attempt to append * @wr_mas: the maple write state * @new_end: The end of the node after the modification * * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in * inaccurate information. */ static inline void mas_wr_append(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **slots; unsigned char end = mas->end; if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end); } slots = wr_mas->slots; if (new_end == end + 1) { if (mas->last == wr_mas->r_max) { /* Append to end of range */ rcu_assign_pointer(slots[new_end], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = new_end; } else { /* Append to start of range */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end] = mas->last; rcu_assign_pointer(slots[end], wr_mas->entry); } } else { /* Append to the range without touching any boundaries. */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end + 1] = mas->last; rcu_assign_pointer(slots[end + 1], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = end + 1; } if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); mas->end = new_end; trace_ma_write(__func__, mas, new_end, wr_mas->entry); return; } /* * mas_wr_bnode() - Slow path for a modification. * @wr_mas: The write maple state * * This is where split, rebalance end up. */ static void mas_wr_bnode(struct ma_wr_state *wr_mas) { struct maple_big_node b_node; trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); mas_commit_b_node(wr_mas, &b_node); } /* * mas_wr_store_entry() - Internal call to store a value * @wr_mas: The maple write state */ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas_wr_new_end(wr_mas); switch (mas->store_type) { case wr_invalid: MT_BUG_ON(mas->tree, 1); return; case wr_new_root: mas_new_root(mas, wr_mas->entry); break; case wr_store_root: mas_store_root(mas, wr_mas->entry); break; case wr_exact_fit: rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); if (!!wr_mas->entry ^ !!wr_mas->content) mas_update_gap(mas); break; case wr_append: mas_wr_append(wr_mas, new_end); break; case wr_slot_store: mas_wr_slot_store(wr_mas); break; case wr_node_store: mas_wr_node_store(wr_mas, new_end); break; case wr_spanning_store: mas_wr_spanning_store(wr_mas); break; case wr_split_store: case wr_rebalance: mas_wr_bnode(wr_mas); break; } return; } static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!mas_is_active(mas)) { if (mas_is_start(mas)) goto set_content; if (unlikely(mas_is_paused(mas))) goto reset; if (unlikely(mas_is_none(mas))) goto reset; if (unlikely(mas_is_overflow(mas))) goto reset; if (unlikely(mas_is_underflow(mas))) goto reset; } /* * A less strict version of mas_is_span_wr() where we allow spanning * writes within this node. This is to stop partial walks in * mas_prealloc() from being reset. */ if (mas->last > mas->max) goto reset; if (wr_mas->entry) goto set_content; if (mte_is_leaf(mas->node) && mas->last == mas->max) goto reset; goto set_content; reset: mas_reset(mas); set_content: wr_mas->content = mas_start(mas); } /** * mas_prealloc_calc() - Calculate number of nodes needed for a * given store oepration * @mas: The maple state * @entry: The entry to store into the tree * * Return: Number of nodes required for preallocation. */ static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) { int ret = mas_mt_height(mas) * 3 + 1; switch (mas->store_type) { case wr_invalid: WARN_ON_ONCE(1); break; case wr_new_root: ret = 1; break; case wr_store_root: if (likely((mas->last != 0) || (mas->index != 0))) ret = 1; else if (((unsigned long) (entry) & 3) == 2) ret = 1; else ret = 0; break; case wr_spanning_store: ret = mas_mt_height(mas) * 3 + 1; break; case wr_split_store: ret = mas_mt_height(mas) * 2 + 1; break; case wr_rebalance: ret = mas_mt_height(mas) * 2 - 1; break; case wr_node_store: ret = mt_in_rcu(mas->tree) ? 1 : 0; break; case wr_append: case wr_exact_fit: case wr_slot_store: ret = 0; } return ret; } /* * mas_wr_store_type() - Determine the store type for a given * store operation. * @wr_mas: The maple write state * * Return: the type of store needed for the operation */ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return wr_store_root; if (unlikely(!mas_wr_walk(wr_mas))) return wr_spanning_store; /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); if (!wr_mas->entry) mas_wr_extend_null(wr_mas); if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) return wr_exact_fit; if (unlikely(!mas->index && mas->last == ULONG_MAX)) return wr_new_root; new_end = mas_wr_new_end(wr_mas); /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) return wr_rebalance; return wr_node_store; } if (new_end >= mt_slots[wr_mas->type]) return wr_split_store; if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) return wr_append; if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || (wr_mas->offset_end - mas->offset == 1))) return wr_slot_store; return wr_node_store; } /** * mas_wr_preallocate() - Preallocate enough nodes for a store operation * @wr_mas: The maple write state * @entry: The entry that will be stored * */ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) { struct ma_state *mas = wr_mas->mas; int request; mas_wr_prealloc_setup(wr_mas); mas->store_type = mas_wr_store_type(wr_mas); request = mas_prealloc_calc(mas, entry); if (!request) return; mas_node_count(mas, request); } /** * mas_insert() - Internal call to insert a value * @mas: The maple state * @entry: The entry to store * * Return: %NULL or the contents that already exists at the requested index * otherwise. The maple state needs to be checked for error conditions. */ static inline void *mas_insert(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); /* * Inserting a new range inserts either 0, 1, or 2 pivots within the * tree. If the insert fits exactly into an existing gap with a value * of NULL, then the slot only needs to be written with the new value. * If the range being inserted is adjacent to another range, then only a * single pivot needs to be inserted (as well as writing the entry). If * the new range is within a gap but does not touch any other ranges, * then two pivots need to be inserted: the start - 1, and the end. As * usual, the entry must be written. Most operations require a new node * to be allocated and replace an existing node to ensure RCU safety, * when in RCU mode. The exception to requiring a newly allocated node * is when inserting at the end of a node (appending). When done * carefully, appending can reuse the node in place. */ wr_mas.content = mas_start(mas); if (wr_mas.content) goto exists; mas_wr_preallocate(&wr_mas, entry); if (mas_is_err(mas)) return NULL; /* spanning writes always overwrite something */ if (mas->store_type == wr_spanning_store) goto exists; /* At this point, we are at the leaf node that needs to be altered. */ if (mas->store_type != wr_new_root && mas->store_type != wr_store_root) { wr_mas.offset_end = mas->offset; wr_mas.end_piv = wr_mas.r_max; if (wr_mas.content || (mas->last > wr_mas.r_max)) goto exists; } mas_wr_store_entry(&wr_mas); return wr_mas.content; exists: mas_set_err(mas, -EEXIST); return wr_mas.content; } /** * mas_alloc_cyclic() - Internal call to find somewhere to store an entry * @mas: The maple state. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, or -EBUSY if there are no * free entries. */ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { unsigned long min = range_lo; int ret = 0; range_lo = max(min, *next); ret = mas_empty_area(mas, range_lo, range_hi, 1); if ((mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED) && ret == 0) { mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && range_lo > min) { mas_reset(mas); ret = mas_empty_area(mas, min, range_hi, 1); if (ret == 0) ret = 1; } if (ret < 0) return ret; do { mas_insert(mas, entry); } while (mas_nomem(mas, gfp)); if (mas_is_err(mas)) return xa_err(mas->node); *startp = mas->index; *next = *startp + 1; if (*next == 0) mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; mas_destroy(mas); return ret; } EXPORT_SYMBOL(mas_alloc_cyclic); static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index) { retry: mas_set(mas, index); mas_state_walk(mas); if (mas_is_start(mas)) goto retry; } static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas, struct maple_node *node, const unsigned long index) { if (unlikely(ma_dead_node(node))) { mas_rewalk(mas, index); return true; } return false; } /* * mas_prev_node() - Find the prev non-null entry at the same level in the * tree. The prev value will be mas->node[mas->offset] or the status will be * ma_none. * @mas: The maple state * @min: The lower limit to search * * The prev node value will be mas->node[mas->offset] or the status will be * ma_none. * Return: 1 if the node is dead, 0 otherwise. */ static int mas_prev_node(struct ma_state *mas, unsigned long min) { enum maple_type mt; int offset, level; void __rcu **slots; struct maple_node *node; unsigned long *pivots; unsigned long max; node = mas_mn(mas); if (!mas->min) goto no_entry; max = mas->min - 1; if (max < min) goto no_entry; level = 0; do { if (ma_is_root(node)) goto no_entry; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; offset = mas->offset; level++; node = mas_mn(mas); } while (!offset); offset--; mt = mte_node_type(mas->node); while (level > 1) { level--; slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); if (unlikely(ma_dead_node(node))) return 1; mt = mte_node_type(mas->node); node = mas_mn(mas); pivots = ma_pivots(node, mt); offset = ma_data_end(node, mt, pivots, max); if (unlikely(ma_dead_node(node))) return 1; } slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); pivots = ma_pivots(node, mt); if (unlikely(ma_dead_node(node))) return 1; if (likely(offset)) mas->min = pivots[offset - 1] + 1; mas->max = max; mas->offset = mas_data_end(mas); if (unlikely(mte_dead_node(mas->node))) return 1; mas->end = mas->offset; return 0; no_entry: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_underflow; return 0; } /* * mas_prev_slot() - Get the entry in the previous slot * * @mas: The maple state * @min: The minimum starting range * @empty: Can be empty * * Return: The entry in the previous slot which is possibly NULL */ static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) { void *entry; void __rcu **slots; unsigned long pivot; enum maple_type type; unsigned long *pivots; struct maple_node *node; unsigned long save_point = mas->index; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->min <= min) { pivot = mas_safe_min(mas, pivots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot <= min) goto underflow; } again: if (likely(mas->offset)) { mas->offset--; mas->last = mas->index - 1; mas->index = mas_safe_min(mas, pivots, mas->offset); } else { if (mas->index <= min) goto underflow; if (mas_prev_node(mas, min)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_underflow(mas))) return NULL; mas->last = mas->max; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->index = pivots[mas->offset - 1] + 1; } slots = ma_slots(node, type); entry = mas_slot(mas, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (likely(entry)) return entry; if (!empty) { if (mas->index <= min) { mas->status = ma_underflow; return NULL; } goto again; } return entry; underflow: mas->status = ma_underflow; return NULL; } /* * mas_next_node() - Get the next node at the same level in the tree. * @mas: The maple state * @node: The maple node * @max: The maximum pivot value to check. * * The next value will be mas->node[mas->offset] or the status will have * overflowed. * Return: 1 on dead node, 0 otherwise. */ static int mas_next_node(struct ma_state *mas, struct maple_node *node, unsigned long max) { unsigned long min; unsigned long *pivots; struct maple_enode *enode; struct maple_node *tmp; int level = 0; unsigned char node_end; enum maple_type mt; void __rcu **slots; if (mas->max >= max) goto overflow; min = mas->max + 1; level = 0; do { if (ma_is_root(node)) goto overflow; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; level++; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); node_end = ma_data_end(node, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; } while (unlikely(mas->offset == node_end)); slots = ma_slots(node, mt); mas->offset++; enode = mas_slot(mas, slots, mas->offset); if (unlikely(ma_dead_node(node))) return 1; if (level > 1) mas->offset = 0; while (unlikely(level > 1)) { level--; mas->node = enode; node = mas_mn(mas); mt = mte_node_type(mas->node); slots = ma_slots(node, mt); enode = mas_slot(mas, slots, 0); if (unlikely(ma_dead_node(node))) return 1; } if (!mas->offset) pivots = ma_pivots(node, mt); mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt); tmp = mte_to_node(enode); mt = mte_node_type(enode); pivots = ma_pivots(tmp, mt); mas->end = ma_data_end(tmp, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; mas->node = enode; mas->min = min; return 0; overflow: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_overflow; return 0; } /* * mas_next_slot() - Get the entry in the next slot * * @mas: The maple state * @max: The maximum starting range * @empty: Can be empty * * Return: The entry in the next slot which is possibly NULL */ static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty) { void __rcu **slots; unsigned long *pivots; unsigned long pivot; enum maple_type type; struct maple_node *node; unsigned long save_point = mas->last; void *entry; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->max >= max) { if (likely(mas->offset < mas->end)) pivot = pivots[mas->offset]; else pivot = mas->max; if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot >= max) { /* Was at the limit, next will extend beyond */ mas->status = ma_overflow; return NULL; } } if (likely(mas->offset < mas->end)) { mas->index = pivots[mas->offset] + 1; again: mas->offset++; if (likely(mas->offset < mas->end)) mas->last = pivots[mas->offset]; else mas->last = mas->max; } else { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } if (mas_next_node(mas, node, max)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_overflow(mas))) return NULL; mas->offset = 0; mas->index = mas->min; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->last = pivots[0]; } slots = ma_slots(node, type); entry = mt_slot(mas->tree, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (entry) return entry; if (!empty) { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } mas->index = mas->last + 1; goto again; } return entry; } /* * mas_rev_awalk() - Internal function. Reverse allocation walk. Find the * highest gap address of a given size in a given node and descend. * @mas: The maple state * @size: The needed size. * * Return: True if found in a leaf, false otherwise. * */ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); unsigned long *pivots, *gaps; void __rcu **slots; unsigned long gap = 0; unsigned long max, min; unsigned char offset; if (unlikely(mas_is_err(mas))) return true; if (ma_is_dense(type)) { /* dense nodes. */ mas->offset = (unsigned char)(mas->index - mas->min); return true; } pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); /* Skip out of bounds. */ while (mas->last < min) min = mas_safe_min(mas, pivots, --offset); max = mas_safe_pivot(mas, pivots, offset, type); while (mas->index <= max) { gap = 0; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = max - min + 1; if (gap) { if ((size <= gap) && (size <= mas->last - min + 1)) break; if (!gaps) { /* Skip the next slot, it cannot be a gap. */ if (offset < 2) goto ascend; offset -= 2; max = pivots[offset]; min = mas_safe_min(mas, pivots, offset); continue; } } if (!offset) goto ascend; offset--; max = min - 1; min = mas_safe_min(mas, pivots, offset); } if (unlikely((mas->index > max) || (size - 1 > max - mas->index))) goto no_space; if (unlikely(ma_is_leaf(type))) { mas->offset = offset; *gap_min = min; *gap_max = min + gap - 1; return true; } /* descend, only happens under lock. */ mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = max; mas->offset = mas_data_end(mas); return false; ascend: if (!mte_is_root(mas->node)) return false; no_space: mas_set_err(mas, -EBUSY); return false; } static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; unsigned char offset, data_end; unsigned long *gaps, *pivots; void __rcu **slots; struct maple_node *node; bool found = false; if (ma_is_dense(type)) { mas->offset = (unsigned char)(mas->index - mas->min); return true; } node = mas_mn(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); data_end = ma_data_end(node, type, pivots, mas->max); for (; offset <= data_end; offset++) { pivot = mas_safe_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) goto next_slot; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = min(pivot, mas->last) - max(mas->index, min) + 1; else goto next_slot; if (gap >= size) { if (ma_is_leaf(type)) { found = true; break; } mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = pivot; offset = 0; break; } next_slot: min = pivot + 1; if (mas->last <= pivot) { mas_set_err(mas, -EBUSY); return true; } } mas->offset = offset; return found; } /** * mas_walk() - Search for @mas->index in the tree. * @mas: The maple state. * * mas->index and mas->last will be set to the range if there is a value. If * mas->status is ma_none, reset to ma_start * * Return: the entry at the location or %NULL. */ void *mas_walk(struct ma_state *mas) { void *entry; if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; retry: entry = mas_state_walk(mas); if (mas_is_start(mas)) { goto retry; } else if (mas_is_none(mas)) { mas->index = 0; mas->last = ULONG_MAX; } else if (mas_is_ptr(mas)) { if (!mas->index) { mas->last = 0; return entry; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return NULL; } return entry; } EXPORT_SYMBOL_GPL(mas_walk); static inline bool mas_rewind_node(struct ma_state *mas) { unsigned char slot; do { if (mte_is_root(mas->node)) { slot = mas->offset; if (!slot) return false; } else { mas_ascend(mas); slot = mas->offset; } } while (!slot); mas->offset = --slot; return true; } /* * mas_skip_node() - Internal function. Skip over a node. * @mas: The maple state. * * Return: true if there is another node, false otherwise. */ static inline bool mas_skip_node(struct ma_state *mas) { if (mas_is_err(mas)) return false; do { if (mte_is_root(mas->node)) { if (mas->offset >= mas_data_end(mas)) { mas_set_err(mas, -EBUSY); return false; } } else { mas_ascend(mas); } } while (mas->offset >= mas_data_end(mas)); mas->offset++; return true; } /* * mas_awalk() - Allocation walk. Search from low address to high, for a gap of * @size * @mas: The maple state * @size: The size of the gap required * * Search between @mas->index and @mas->last for a gap of @size. */ static inline void mas_awalk(struct ma_state *mas, unsigned long size) { struct maple_enode *last = NULL; /* * There are 4 options: * go to child (descend) * go back to parent (ascend) * no gap found. (return, error == -EBUSY) * found the gap. (return) */ while (!mas_is_err(mas) && !mas_anode_descend(mas, size)) { if (last == mas->node) mas_skip_node(mas); else last = mas->node; } } /* * mas_sparse_area() - Internal function. Return upper or lower limit when * searching for a gap in an empty tree. * @mas: The maple state * @min: the minimum range * @max: The maximum range * @size: The size of the gap * @fwd: Searching forward or back */ static inline int mas_sparse_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size, bool fwd) { if (!unlikely(mas_is_none(mas)) && min == 0) { min++; /* * At this time, min is increased, we need to recheck whether * the size is satisfied. */ if (min > max || max - min + 1 < size) return -EBUSY; } /* mas_is_ptr */ if (fwd) { mas->index = min; mas->last = min + size - 1; } else { mas->last = max; mas->index = max - size + 1; } return 0; } /* * mas_empty_area() - Get the lowest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { unsigned char offset; unsigned long *pivots; enum maple_type mt; struct maple_node *node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if (mas->offset >= 2) mas->offset -= 2; else if (!mas_skip_node(mas)) return -EBUSY; /* Empty set */ if (mas_is_none(mas) || mas_is_ptr(mas)) return mas_sparse_area(mas, min, max, size, true); /* The start of the window can only be within these values */ mas->index = min; mas->last = max; mas_awalk(mas, size); if (unlikely(mas_is_err(mas))) return xa_err(mas->node); offset = mas->offset; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); min = mas_safe_min(mas, pivots, offset); if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; mas->end = ma_data_end(node, mt, pivots, mas->max); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); /* * mas_empty_area_rev() - Get the highest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { struct maple_enode *last = mas->node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if ((mas->offset < 2) && (!mas_rewind_node(mas))) return -EBUSY; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return mas_sparse_area(mas, min, max, size, false); else if (mas->offset >= 2) mas->offset -= 2; else mas->offset = mas_data_end(mas); /* The start of the window can only be within these values. */ mas->index = min; mas->last = max; while (!mas_rev_awalk(mas, size, &min, &max)) { if (last == mas->node) { if (!mas_rewind_node(mas)) return -EBUSY; } else { last = mas->node; } } if (mas_is_err(mas)) return xa_err(mas->node); if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) return -EBUSY; /* Trim the upper limit to the max. */ if (max < mas->last) mas->last = max; mas->index = mas->last - size + 1; mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area_rev); /* * mte_dead_leaves() - Mark all leaves of a node as dead. * @enode: the encoded node * @mt: the maple tree * @slots: Pointer to the slot array * * Must hold the write lock. * * Return: The number of leaves marked as dead. */ static inline unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt, void __rcu **slots) { struct maple_node *node; enum maple_type type; void *entry; int offset; for (offset = 0; offset < mt_slot_count(enode); offset++) { entry = mt_slot(mt, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); /* Use both node and type to catch LE & BE metadata */ if (!node || !type) break; mte_set_node_dead(entry); node->type = type; rcu_assign_pointer(slots[offset], node); } return offset; } /** * mte_dead_walk() - Walk down a dead tree to just before the leaves * @enode: The maple encoded node * @offset: The starting offset * * Note: This can only be used from the RCU callback context. */ static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset) { struct maple_node *node, *next; void __rcu **slots = NULL; next = mte_to_node(*enode); do { *enode = ma_enode_ptr(next); node = mte_to_node(*enode); slots = ma_slots(node, node->type); next = rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map)); offset = 0; } while (!ma_is_leaf(next->type)); return slots; } /** * mt_free_walk() - Walk & free a tree in the RCU callback context * @head: The RCU head that's within the node. * * Note: This can only be used from the RCU callback context. */ static void mt_free_walk(struct rcu_head *head) { void __rcu **slots; struct maple_node *node, *start; struct maple_enode *enode; unsigned char offset; enum maple_type type; node = container_of(head, struct maple_node, rcu); if (ma_is_leaf(node->type)) goto free_leaf; start = node; enode = mt_mk_node(node, node->type); slots = mte_dead_walk(&enode, 0); node = mte_to_node(enode); do { mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if ((offset < mt_slots[type]) && rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map))) slots = mte_dead_walk(&enode, offset); node = mte_to_node(enode); } while ((node != start) || (node->slot_len < offset)); slots = ma_slots(node, node->type); mt_free_bulk(node->slot_len, slots); free_leaf: mt_free_rcu(&node->rcu); } static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, struct maple_tree *mt, struct maple_enode *prev, unsigned char offset) { struct maple_node *node; struct maple_enode *next = *enode; void __rcu **slots = NULL; enum maple_type type; unsigned char next_offset = 0; do { *enode = next; node = mte_to_node(*enode); type = mte_node_type(*enode); slots = ma_slots(node, type); next = mt_slot_locked(mt, slots, next_offset); if ((mte_dead_node(next))) next = mt_slot_locked(mt, slots, ++next_offset); mte_set_node_dead(*enode); node->type = type; node->piv_parent = prev; node->parent_slot = offset; offset = next_offset; next_offset = 0; prev = *enode; } while (!mte_is_leaf(next)); return slots; } static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free) { void __rcu **slots; struct maple_node *node = mte_to_node(enode); struct maple_enode *start; if (mte_is_leaf(enode)) { node->type = mte_node_type(enode); goto free_leaf; } start = enode; slots = mte_destroy_descend(&enode, mt, start, 0); node = mte_to_node(enode); // Updated in the above call. do { enum maple_type type; unsigned char offset; struct maple_enode *parent, *tmp; node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if (offset >= mt_slots[type]) goto next; tmp = mt_slot_locked(mt, slots, offset); if (mte_node_type(tmp) && mte_to_node(tmp)) { parent = enode; enode = tmp; slots = mte_destroy_descend(&enode, mt, parent, offset); } next: node = mte_to_node(enode); } while (start != enode); node = mte_to_node(enode); node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); free_leaf: if (free) mt_free_rcu(&node->rcu); else mt_clear_meta(mt, node, node->type); } /* * mte_destroy_walk() - Free a tree or sub-tree. * @enode: the encoded maple node (maple_enode) to start * @mt: the tree to free - needed for node types. * * Must hold the write lock. */ static inline void mte_destroy_walk(struct maple_enode *enode, struct maple_tree *mt) { struct maple_node *node = mte_to_node(enode); if (mt_in_rcu(mt)) { mt_destroy_walk(enode, mt, false); call_rcu(&node->rcu, mt_free_walk); } else { mt_destroy_walk(enode, mt, true); } } /* Interface */ /** * mas_store() - Store an @entry. * @mas: The maple state. * @entry: The entry to store. * * The @mas->index and @mas->last is used to set the range for the @entry. * * Return: the first entry between mas->index and mas->last or %NULL. */ void *mas_store(struct ma_state *mas, void *entry) { int request; MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(__func__, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (MAS_WARN_ON(mas, mas->index > mas->last)) pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, entry); if (mas->index > mas->last) { mas_set_err(mas, -EINVAL); return NULL; } #endif /* * Storing is the same operation as insert with the added caveat that it * can overwrite entries. Although this seems simple enough, one may * want to examine what happens if a single store operation was to * overwrite multiple entries within a self-balancing B-Tree. */ mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); if (mas->mas_flags & MA_STATE_PREALLOC) { mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); return wr_mas.content; } request = mas_prealloc_calc(mas, entry); if (!request) goto store; mas_node_count(mas, request); if (mas_is_err(mas)) return NULL; store: mas_wr_store_entry(&wr_mas); mas_destroy(mas); return wr_mas.content; } EXPORT_SYMBOL_GPL(mas_store); /** * mas_store_gfp() - Store a value into the tree. * @mas: The maple state * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations if necessary. * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) { unsigned long index = mas->index; unsigned long last = mas->last; MA_WR_STATE(wr_mas, mas, entry); int ret = 0; retry: mas_wr_preallocate(&wr_mas, entry); if (unlikely(mas_nomem(mas, gfp))) { if (!entry) __mas_set_range(mas, index, last); goto retry; } if (mas_is_err(mas)) { ret = xa_err(mas->node); goto out; } mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_store_gfp); /** * mas_store_prealloc() - Store a value into the tree using memory * preallocated in the maple state. * @mas: The maple state * @entry: The entry to store. */ void mas_store_prealloc(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); if (mas->store_type == wr_store_root) { mas_wr_prealloc_setup(&wr_mas); goto store; } mas_wr_walk_descend(&wr_mas); if (mas->store_type != wr_spanning_store) { /* set wr_mas->content to current slot */ wr_mas.content = mas_slot_locked(mas, wr_mas.slots, mas->offset); mas_wr_end_piv(&wr_mas); } store: trace_ma_write(__func__, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); mas_destroy(mas); } EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); int ret = 0; int request; mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); request = mas_prealloc_calc(mas, entry); if (!request) return ret; mas_node_count_gfp(mas, request, gfp); if (mas_is_err(mas)) { mas_set_alloc_req(mas, 0); ret = xa_err(mas->node); mas_destroy(mas); mas_reset(mas); return ret; } mas->mas_flags |= MA_STATE_PREALLOC; return ret; } EXPORT_SYMBOL_GPL(mas_preallocate); /* * mas_destroy() - destroy a maple state. * @mas: The maple state * * Upon completion, check the left-most node and rebalance against the node to * the right if necessary. Frees any allocated nodes associated with this maple * state. */ void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; unsigned long total; /* * When using mas_for_each() to insert an expected number of elements, * it is possible that the number inserted is less than the expected * number. To fix an invalid final node, a check is performed here to * rebalance the previous node with the final node. */ if (mas->mas_flags & MA_STATE_REBALANCE) { unsigned char end; if (mas_is_err(mas)) mas_reset(mas); mas_start(mas); mtree_range_walk(mas); end = mas->end + 1; if (end < mt_min_slot_count(mas->node) - 1) mas_destroy_rebalance(mas, end); mas->mas_flags &= ~MA_STATE_REBALANCE; } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); total = mas_allocated(mas); while (total) { node = mas->alloc; mas->alloc = node->slot[0]; if (node->node_count > 1) { size_t count = node->node_count - 1; mt_free_bulk(count, (void __rcu **)&node->slot[1]); total -= count; } mt_free_one(ma_mnode_ptr(node)); total--; } mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); /* * mas_expected_entries() - Set the expected number of entries that will be inserted. * @mas: The maple state * @nr_entries: The number of expected entries. * * This will attempt to pre-allocate enough nodes to store the expected number * of entries. The allocations will occur using the bulk allocator interface * for speed. Please call mas_destroy() on the @mas after inserting the entries * to ensure any unused nodes are freed. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) { int nonleaf_cap = MAPLE_ARANGE64_SLOTS - 2; struct maple_enode *enode = mas->node; int nr_nodes; int ret; /* * Sometimes it is necessary to duplicate a tree to a new tree, such as * forking a process and duplicating the VMAs from one tree to a new * tree. When such a situation arises, it is known that the new tree is * not going to be used until the entire tree is populated. For * performance reasons, it is best to use a bulk load with RCU disabled. * This allows for optimistic splitting that favours the left and reuse * of nodes during the operation. */ /* Optimize splitting for bulk insert in-order */ mas->mas_flags |= MA_STATE_BULK; /* * Avoid overflow, assume a gap between each entry and a trailing null. * If this is wrong, it just means allocation can happen during * insertion of entries. */ nr_nodes = max(nr_entries, nr_entries * 2 + 1); if (!mt_is_alloc(mas->tree)) nonleaf_cap = MAPLE_RANGE64_SLOTS - 2; /* Leaves; reduce slots to keep space for expansion */ nr_nodes = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 2); /* Internal nodes */ nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap); /* Add working room for split (2 nodes) + new parents */ mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL); /* Detect if allocations run out */ mas->mas_flags |= MA_STATE_PREALLOC; if (!mas_is_err(mas)) return 0; ret = xa_err(mas->node); mas->node = enode; mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_expected_entries); static bool mas_next_setup(struct ma_state *mas, unsigned long max, void **entry) { bool was_none = mas_is_none(mas); if (unlikely(mas->last >= max)) { mas->status = ma_overflow; return true; } switch (mas->status) { case ma_active: return false; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; fallthrough; case ma_start: mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ break; case ma_overflow: /* Overflowed before, but the max changed */ mas->status = ma_active; break; case ma_underflow: /* The user expects the mas to be one before where it is */ mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (likely(mas_is_active(mas))) /* Fast path */ return false; if (mas_is_ptr(mas)) { *entry = NULL; if (was_none && mas->index == 0) { mas->index = mas->last = 0; return true; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return true; } if (mas_is_none(mas)) return true; return false; } /** * mas_next() - Get the next entry. * @mas: The maple state * @max: The maximum index to check. * * Returns the next entry after @mas->index. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_next); /** * mas_next_range() - Advance the maple state to the next range * @mas: The maple state * @max: The maximum index to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_next_range); /** * mt_next() - get the next value in the maple tree * @mt: The maple tree * @index: The start index * @max: The maximum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry higher than @index or %NULL if nothing is found. */ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_next(&mas, max); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_next); static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } switch (mas->status) { case ma_active: return false; case ma_start: break; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; break; case ma_underflow: /* underflowed before but the min changed */ mas->status = ma_active; break; case ma_overflow: /* User expects mas to be one after where it is */ mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) mas_walk(mas); if (unlikely(mas_is_ptr(mas))) { if (!mas->index) { mas->status = ma_none; return true; } mas->index = mas->last = 0; *entry = mas_root(mas); return true; } if (mas_is_none(mas)) { if (mas->index) { /* Walked to out-of-range pointer? */ mas->index = mas->last = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } return true; } return false; } /** * mas_prev() - Get the previous entry * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the status is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_prev); /** * mas_prev_range() - Advance to the previous range * @mas: The maple state * @min: The minimum value to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the node is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev_range(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_prev_range); /** * mt_prev() - get the previous value in the maple tree * @mt: The maple tree * @index: The start index * @min: The minimum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry before @index or %NULL if nothing is found. */ void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_prev(&mas, min); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_prev); /** * mas_pause() - Pause a mas_find/mas_for_each to drop the lock. * @mas: The maple state to pause * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @mas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call mas_pause(), the mt_for_each() * iterator may be more appropriate. * */ void mas_pause(struct ma_state *mas) { mas->status = ma_pause; mas->node = NULL; } EXPORT_SYMBOL_GPL(mas_pause); /** * mas_find_setup() - Internal function to set up mas_find*(). * @mas: The maple state * @max: The maximum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry) { switch (mas->status) { case ma_active: if (mas->last < max) return false; return true; case ma_start: break; case ma_pause: if (unlikely(mas->last >= max)) return true; mas->index = ++mas->last; mas->status = ma_start; break; case ma_none: if (unlikely(mas->last >= max)) return true; mas->index = mas->last; mas->status = ma_start; break; case ma_underflow: /* mas is pointing at entry before unable to go lower */ if (unlikely(mas->index >= max)) { mas->status = ma_overflow; return true; } mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_overflow: if (unlikely(mas->last >= max)) return true; mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index > max) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto ptr_out_of_range; if (unlikely(mas_is_none(mas))) return true; if (mas->index == max) return true; return false; ptr_out_of_range: mas->status = ma_none; mas->index = 1; mas->last = ULONG_MAX; return true; } /** * mas_find() - On the first call, find the entry at or after mas->index up to * %max. Otherwise, find the entry after mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ entry = mas_next_slot(mas, max, false); /* Ignore overflow */ mas->status = ma_active; return entry; } EXPORT_SYMBOL_GPL(mas_find); /** * mas_find_range() - On the first call, find the entry at or after * mas->index up to %max. Otherwise, advance to the next slot mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_find_range); /** * mas_find_rev_setup() - Internal function to set up mas_find_*_rev() * @mas: The maple state * @min: The minimum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, void **entry) { switch (mas->status) { case ma_active: goto active; case ma_start: break; case ma_pause: if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->last = --mas->index; mas->status = ma_start; break; case ma_none: if (mas->index <= min) goto none; mas->last = mas->index; mas->status = ma_start; break; case ma_overflow: /* user expects the mas to be one after where it is */ if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->status = ma_active; break; case ma_underflow: /* user expects the mas to be one before where it is */ if (unlikely(mas->index <= min)) return true; mas->status = ma_active; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index < min) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto none; if (unlikely(mas_is_none(mas))) { /* * Walked to the location, and there was nothing so the previous * location is 0. */ mas->last = mas->index = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } active: if (mas->index < min) return true; return false; none: mas->status = ma_none; return true; } /** * mas_find_rev: On the first call, find the first non-null entry at or below * mas->index down to %min. Otherwise find the first non-null entry below * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_find_rev); /** * mas_find_range_rev: On the first call, find the first non-null entry at or * below mas->index down to %min. Otherwise advance to the previous slot after * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_range_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_find_range_rev); /** * mas_erase() - Find the range in which index resides and erase the entire * range. * @mas: The maple state * * Must hold the write lock. * Searches for @mas->index, sets @mas->index and @mas->last to the range and * erases that range. * * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated. */ void *mas_erase(struct ma_state *mas) { void *entry; unsigned long index = mas->index; MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; write_retry: entry = mas_state_walk(mas); if (!entry) return NULL; /* Must reset to ensure spanning writes of last slot are detected */ mas_reset(mas); mas_wr_preallocate(&wr_mas, NULL); if (mas_nomem(mas, GFP_KERNEL)) { /* in case the range of entry changed when unlocked */ mas->index = mas->last = index; goto write_retry; } if (mas_is_err(mas)) goto out; mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return entry; } EXPORT_SYMBOL_GPL(mas_erase); /** * mas_nomem() - Check if there was an error allocating and do the allocation * if necessary If there are allocations, then free them. * @mas: The maple state * @gfp: The GFP_FLAGS to use for allocations * Return: true on allocation, false otherwise. */ bool mas_nomem(struct ma_state *mas, gfp_t gfp) __must_hold(mas->tree->ma_lock) { if (likely(mas->node != MA_ERROR(-ENOMEM))) return false; if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { mtree_unlock(mas->tree); mas_alloc_nodes(mas, gfp); mtree_lock(mas->tree); } else { mas_alloc_nodes(mas, gfp); } if (!mas_allocated(mas)) return false; mas->status = ma_start; return true; } void __init maple_tree_init(void) { maple_node_cache = kmem_cache_create("maple_node", sizeof(struct maple_node), sizeof(struct maple_node), SLAB_PANIC, NULL); } /** * mtree_load() - Load a value stored in a maple tree * @mt: The maple tree * @index: The index to load * * Return: the entry or %NULL */ void *mtree_load(struct maple_tree *mt, unsigned long index) { MA_STATE(mas, mt, index, index); void *entry; trace_ma_read(__func__, &mas); rcu_read_lock(); retry: entry = mas_start(&mas); if (unlikely(mas_is_none(&mas))) goto unlock; if (unlikely(mas_is_ptr(&mas))) { if (index) entry = NULL; goto unlock; } entry = mtree_lookup_walk(&mas); if (!entry && unlikely(mas_is_start(&mas))) goto retry; unlock: rcu_read_unlock(); if (xa_is_zero(entry)) return NULL; return entry; } EXPORT_SYMBOL(mtree_load); /** * mtree_store_range() - Store an entry at a given range. * @mt: The maple tree * @index: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store_range(struct maple_tree *mt, unsigned long index, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(mas, mt, index, last); int ret = 0; trace_ma_write(__func__, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (index > last) return -EINVAL; mtree_lock(mt); ret = mas_store_gfp(&mas, entry, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_store_range); /** * mtree_store() - Store an entry at a given index. * @mt: The maple tree * @index: The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_store_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_store); /** * mtree_insert_range() - Insert an entry at a given range if there is no value. * @mt: The maple tree * @first: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(ms, mt, first, last); int ret = 0; if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (first > last) return -EINVAL; mtree_lock(mt); retry: mas_insert(&ms, entry); if (mas_nomem(&ms, gfp)) goto retry; mtree_unlock(mt); if (mas_is_err(&ms)) ret = xa_err(ms.node); mas_destroy(&ms); return ret; } EXPORT_SYMBOL(mtree_insert_range); /** * mtree_insert() - Insert an entry at a given index if there is no value. * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_insert_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_insert); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_range); /** * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree. * @mt: The maple tree. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Finds an empty entry in @mt after @next, stores the new index into * the @id pointer, stores the entry at that index, then updates @next. * * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag. * * Context: Any context. Takes and releases the mt.lock. May sleep if * the @gfp flags permit. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no * free entries. */ int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { int ret; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); ret = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi, next, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_alloc_cyclic); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area_rev(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_rrange); /** * mtree_erase() - Find an index and erase the entire range. * @mt: The maple tree * @index: The index to erase * * Erasing is the same as a walk to an entry then a store of a NULL to that * ENTIRE range. In fact, it is implemented as such using the advanced API. * * Return: The entry stored at the @index or %NULL */ void *mtree_erase(struct maple_tree *mt, unsigned long index) { void *entry = NULL; MA_STATE(mas, mt, index, index); trace_ma_op(__func__, &mas); mtree_lock(mt); entry = mas_erase(&mas); mtree_unlock(mt); return entry; } EXPORT_SYMBOL(mtree_erase); /* * mas_dup_free() - Free an incomplete duplication of a tree. * @mas: The maple state of a incomplete tree. * * The parameter @mas->node passed in indicates that the allocation failed on * this node. This function frees all nodes starting from @mas->node in the * reverse order of mas_dup_build(). There is no need to hold the source tree * lock at this time. */ static void mas_dup_free(struct ma_state *mas) { struct maple_node *node; enum maple_type type; void __rcu **slots; unsigned char count, i; /* Maybe the first node allocation failed. */ if (mas_is_none(mas)) return; while (!mte_is_root(mas->node)) { mas_ascend(mas); if (mas->offset) { mas->offset--; do { mas_descend(mas); mas->offset = mas_data_end(mas); } while (!mte_is_leaf(mas->node)); mas_ascend(mas); } node = mte_to_node(mas->node); type = mte_node_type(mas->node); slots = ma_slots(node, type); count = mas_data_end(mas) + 1; for (i = 0; i < count; i++) ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; mt_free_bulk(count, slots); } node = mte_to_node(mas->node); mt_free_one(node); } /* * mas_copy_node() - Copy a maple node and replace the parent. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @parent: The parent of the new node. * * Copy @mas->node to @new_mas->node, set @parent to be the parent of * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, struct maple_pnode *parent) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); unsigned long val; /* Copy the node completely. */ memcpy(new_node, node, sizeof(struct maple_node)); /* Update the parent node pointer. */ val = (unsigned long)node->parent & MAPLE_NODE_MASK; new_node->parent = ma_parent_ptr(val | (unsigned long)parent); } /* * mas_dup_alloc() - Allocate child nodes for a maple node. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @gfp: The GFP_FLAGS to use for allocations. * * This function allocates child nodes for @new_mas->node during the duplication * process. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); enum maple_type type; unsigned char request, count, i; void __rcu **slots; void __rcu **new_slots; unsigned long val; /* Allocate memory for child nodes. */ type = mte_node_type(mas->node); new_slots = ma_slots(new_node, type); request = mas_data_end(mas) + 1; count = mt_alloc_bulk(gfp, request, (void **)new_slots); if (unlikely(count < request)) { memset(new_slots, 0, request * sizeof(void *)); mas_set_err(mas, -ENOMEM); return; } /* Restore node type information in slots. */ slots = ma_slots(node, type); for (i = 0; i < count; i++) { val = (unsigned long)mt_slot_locked(mas->tree, slots, i); val &= MAPLE_NODE_MASK; ((unsigned long *)new_slots)[i] |= val; } } /* * mas_dup_build() - Build a new maple tree from a source tree * @mas: The maple state of source tree, need to be in MAS_START state. * @new_mas: The maple state of new tree, need to be in MAS_START state. * @gfp: The GFP_FLAGS to use for allocations. * * This function builds a new tree in DFS preorder. If the memory allocation * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the * last node. mas_dup_free() will free the incomplete duplication of a tree. * * Note that the attributes of the two trees need to be exactly the same, and the * new tree needs to be empty, otherwise -EINVAL will be set in @mas. */ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node; struct maple_pnode *parent = NULL; struct maple_enode *root; enum maple_type type; if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || unlikely(!mtree_empty(new_mas->tree))) { mas_set_err(mas, -EINVAL); return; } root = mas_start(mas); if (mas_is_ptr(mas) || mas_is_none(mas)) goto set_new_tree; node = mt_alloc_one(gfp); if (!node) { new_mas->status = ma_none; mas_set_err(mas, -ENOMEM); return; } type = mte_node_type(mas->node); root = mt_mk_node(node, type); new_mas->node = root; new_mas->min = 0; new_mas->max = ULONG_MAX; root = mte_mk_root(root); while (1) { mas_copy_node(mas, new_mas, parent); if (!mte_is_leaf(mas->node)) { /* Only allocate child nodes for non-leaf nodes. */ mas_dup_alloc(mas, new_mas, gfp); if (unlikely(mas_is_err(mas))) return; } else { /* * This is the last leaf node and duplication is * completed. */ if (mas->max == ULONG_MAX) goto done; /* This is not the last leaf node and needs to go up. */ do { mas_ascend(mas); mas_ascend(new_mas); } while (mas->offset == mas_data_end(mas)); /* Move to the next subtree. */ mas->offset++; new_mas->offset++; } mas_descend(mas); parent = ma_parent_ptr(mte_to_node(new_mas->node)); mas_descend(new_mas); mas->offset = 0; new_mas->offset = 0; } done: /* Specially handle the parent of the root node. */ mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); set_new_tree: /* Make them the same height */ new_mas->tree->ma_flags = mas->tree->ma_flags; rcu_assign_pointer(new_mas->tree->ma_root, root); } /** * __mt_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * Note that the user needs to manually lock the source tree and the new tree. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_dup_build(&mas, &new_mas, gfp); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } return ret; } EXPORT_SYMBOL(__mt_dup); /** * mtree_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_lock(&new_mas); mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); mas_dup_build(&mas, &new_mas, gfp); mas_unlock(&mas); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } mas_unlock(&new_mas); return ret; } EXPORT_SYMBOL(mtree_dup); /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree * * Note: Does not handle locking. */ void __mt_destroy(struct maple_tree *mt) { void *root = mt_root_locked(mt); rcu_assign_pointer(mt->ma_root, NULL); if (xa_is_node(root)) mte_destroy_walk(root, mt); mt->ma_flags = mt_attr(mt); } EXPORT_SYMBOL_GPL(__mt_destroy); /** * mtree_destroy() - Destroy a maple tree * @mt: The maple tree * * Frees all resources used by the tree. Handles locking. */ void mtree_destroy(struct maple_tree *mt) { mtree_lock(mt); __mt_destroy(mt); mtree_unlock(mt); } EXPORT_SYMBOL(mtree_destroy); /** * mt_find() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value of the search range * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * In case that an entry is found @index is updated to point to the next * possible entry independent whether the found entry is occupying a * single index or a range if indices. * * Return: The entry at or after the @index or %NULL */ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max) { MA_STATE(mas, mt, *index, *index); void *entry; #ifdef CONFIG_DEBUG_MAPLE_TREE unsigned long copy = *index; #endif trace_ma_read(__func__, &mas); if ((*index) > max) return NULL; rcu_read_lock(); retry: entry = mas_state_walk(&mas); if (mas_is_start(&mas)) goto retry; if (unlikely(xa_is_zero(entry))) entry = NULL; if (entry) goto unlock; while (mas_is_active(&mas) && (mas.last < max)) { entry = mas_next_slot(&mas, max, false); if (likely(entry && !xa_is_zero(entry))) break; } if (unlikely(xa_is_zero(entry))) entry = NULL; unlock: rcu_read_unlock(); if (likely(entry)) { *index = mas.last + 1; #ifdef CONFIG_DEBUG_MAPLE_TREE if (MT_WARN_ON(mt, (*index) && ((*index) <= copy))) pr_err("index not increased! %lx <= %lx\n", *index, copy); #endif } return entry; } EXPORT_SYMBOL(mt_find); /** * mt_find_after() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value to check * * Same as mt_find() except that it checks @index for 0 before * searching. If @index == 0, the search is aborted. This covers a wrap * around of @index to 0 in an iterator loop. * * Return: The entry at or after the @index or %NULL */ void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max) { if (!(*index)) return NULL; return mt_find(mt, index, max); } EXPORT_SYMBOL(mt_find_after); #ifdef CONFIG_DEBUG_MAPLE_TREE atomic_t maple_tree_tests_run; EXPORT_SYMBOL_GPL(maple_tree_tests_run); atomic_t maple_tree_tests_passed; EXPORT_SYMBOL_GPL(maple_tree_tests_passed); #ifndef __KERNEL__ extern void kmem_cache_set_non_kernel(struct kmem_cache *, unsigned int); void mt_set_non_kernel(unsigned int val) { kmem_cache_set_non_kernel(maple_node_cache, val); } extern void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)); void mt_set_callback(void (*callback)(void *)) { kmem_cache_set_callback(maple_node_cache, callback); } extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private); void mt_set_private(void *private) { kmem_cache_set_private(maple_node_cache, private); } extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); unsigned long mt_get_alloc_size(void) { return kmem_cache_get_alloc(maple_node_cache); } extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *); void mt_zero_nr_tallocated(void) { kmem_cache_zero_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *); unsigned int mt_nr_tallocated(void) { return kmem_cache_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *); unsigned int mt_nr_allocated(void) { return kmem_cache_nr_allocated(maple_node_cache); } void mt_cache_shrink(void) { } #else /* * mt_cache_shrink() - For testing, don't use this. * * Certain testcases can trigger an OOM when combined with other memory * debugging configuration options. This function is used to reduce the * possibility of an out of memory even due to kmem_cache objects remaining * around for longer than usual. */ void mt_cache_shrink(void) { kmem_cache_shrink(maple_node_cache); } EXPORT_SYMBOL_GPL(mt_cache_shrink); #endif /* not defined __KERNEL__ */ /* * mas_get_slot() - Get the entry in the maple state node stored at @offset. * @mas: The maple state * @offset: The offset into the slot array to fetch. * * Return: The entry stored at @offset. */ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, unsigned char offset) { return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)), offset); } /* Depth first search, post-order */ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { struct maple_enode *p, *mn = mas->node; unsigned long p_min, p_max; mas_next_node(mas, mas_mn(mas), max); if (!mas_is_overflow(mas)) return; if (mte_is_root(mn)) return; mas->node = mn; mas_ascend(mas); do { p = mas->node; p_min = mas->min; p_max = mas->max; mas_prev_node(mas, 0); } while (!mas_is_underflow(mas)); mas->node = p; mas->max = p_max; mas->min = p_min; } /* Tree validations */ static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format); static void mt_dump_range(unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { static const char spaces[] = " "; switch(format) { case mt_dump_hex: if (min == max) pr_info("%.*s%lx: ", depth * 2, spaces, min); else pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max); break; case mt_dump_dec: if (min == max) pr_info("%.*s%lu: ", depth * 2, spaces, min); else pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); } } static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { mt_dump_range(min, max, depth, format); if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [" PTR_FMT "]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else if (mt_is_reserved(entry)) pr_cont("UNKNOWN ENTRY (" PTR_FMT ")\n", entry); else pr_cont(PTR_FMT "\n", entry); } static void mt_dump_range64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_range_64 *node = &mte_to_node(entry)->mr64; bool leaf = mte_is_leaf(entry); unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { switch(format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i] && max != mt_node_max(entry)) break; if (last == 0 && i > 0) break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), first, last, depth + 1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_arange64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_arange_64 *node = &mte_to_node(entry)->ma64; unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { switch (format) { case mt_dump_hex: pr_cont("%lx ", node->gap[i]); break; case mt_dump_dec: pr_cont("%lu ", node->gap[i]); } } pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap); for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { switch (format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_ARANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i]) break; if (last == 0 && i > 0) break; if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_node *node = mte_to_node(entry); unsigned int type = mte_node_type(entry); unsigned int i; mt_dump_range(min, max, depth, format); pr_cont("node " PTR_FMT " depth %d type %d parent " PTR_FMT, node, depth, type, node ? node->parent : NULL); switch (type) { case maple_dense: pr_cont("\n"); for (i = 0; i < MAPLE_NODE_SLOTS; i++) { if (min + i > max) pr_cont("OUT OF RANGE: "); mt_dump_entry(mt_slot(mt, node->slot, i), min + i, min + i, depth, format); } break; case maple_leaf_64: case maple_range_64: mt_dump_range64(mt, entry, min, max, depth, format); break; case maple_arange_64: mt_dump_arange64(mt, entry, min, max, depth, format); break; default: pr_cont(" UNKNOWN TYPE\n"); } } void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) { void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n", mt, mt->ma_flags, mt_height(mt), entry); if (xa_is_node(entry)) mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format); else if (entry) mt_dump_entry(entry, 0, 0, 0, format); else pr_info("(empty)\n"); } EXPORT_SYMBOL_GPL(mt_dump); /* * Calculate the maximum gap in a node and check if that's what is reported in * the parent (unless root). */ static void mas_validate_gaps(struct ma_state *mas) { struct maple_enode *mte = mas->node; struct maple_node *p_mn, *node = mte_to_node(mte); enum maple_type mt = mte_node_type(mas->node); unsigned long gap = 0, max_gap = 0; unsigned long p_end, p_start = mas->min; unsigned char p_slot, offset; unsigned long *gaps = NULL; unsigned long *pivots = ma_pivots(node, mt); unsigned int i; if (ma_is_dense(mt)) { for (i = 0; i < mt_slot_count(mte); i++) { if (mas_get_slot(mas, i)) { if (gap > max_gap) max_gap = gap; gap = 0; continue; } gap++; } goto counted; } gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { p_end = mas_safe_pivot(mas, pivots, i, mt); if (!gaps) { if (!mas_get_slot(mas, i)) gap = p_end - p_start + 1; } else { void *entry = mas_get_slot(mas, i); gap = gaps[i]; MT_BUG_ON(mas->tree, !entry); if (gap > p_end - p_start + 1) { pr_err(PTR_FMT "[%u] %lu >= %lu - %lu + 1 (%lu)\n", mas_mn(mas), i, gap, p_end, p_start, p_end - p_start + 1); MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); } } if (gap > max_gap) max_gap = gap; p_start = p_end + 1; if (p_end >= mas->max) break; } counted: if (mt == maple_arange_64) { MT_BUG_ON(mas->tree, !gaps); offset = ma_meta_gap(node); if (offset > i) { pr_err("gap offset " PTR_FMT "[%u] is invalid\n", node, offset); MT_BUG_ON(mas->tree, 1); } if (gaps[offset] != max_gap) { pr_err("gap " PTR_FMT "[%u] is not the largest gap %lu\n", node, offset, max_gap); MT_BUG_ON(mas->tree, 1); } for (i++ ; i < mt_slot_count(mte); i++) { if (gaps[i] != 0) { pr_err("gap " PTR_FMT "[%u] beyond node limit != 0\n", node, i); MT_BUG_ON(mas->tree, 1); } } } if (mte_is_root(mte)) return; p_slot = mte_parent_slot(mas->node); p_mn = mte_parent(mte); MT_BUG_ON(mas->tree, max_gap > mas->max); if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap " PTR_FMT "[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); MT_BUG_ON(mas->tree, 1); } } static void mas_validate_parent_slot(struct ma_state *mas) { struct maple_node *parent; struct maple_enode *node; enum maple_type p_type; unsigned char p_slot; void __rcu **slots; int i; if (mte_is_root(mas->node)) return; p_slot = mte_parent_slot(mas->node); p_type = mas_parent_type(mas, mas->node); parent = mte_parent(mas->node); slots = ma_slots(parent, p_type); MT_BUG_ON(mas->tree, mas_mn(mas) == parent); /* Check prev/next parent slot for duplicate node entry */ for (i = 0; i < mt_slots[p_type]; i++) { node = mas_slot(mas, slots, i); if (i == p_slot) { if (node != mas->node) pr_err("parent " PTR_FMT "[%u] does not have " PTR_FMT "\n", parent, i, mas_mn(mas)); MT_BUG_ON(mas->tree, node != mas->node); } else if (node == mas->node) { pr_err("Invalid child " PTR_FMT " at parent " PTR_FMT "[%u] p_slot %u\n", mas_mn(mas), parent, i, p_slot); MT_BUG_ON(mas->tree, node == mas->node); } } } static void mas_validate_child_slot(struct ma_state *mas) { enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mte_to_node(mas->node), type); struct maple_enode *child; unsigned char i; if (mte_is_leaf(mas->node)) return; for (i = 0; i < mt_slots[type]; i++) { child = mas_slot(mas, slots, i); if (!child) { pr_err("Non-leaf node lacks child at " PTR_FMT "[%u]\n", mas_mn(mas), i); MT_BUG_ON(mas->tree, 1); } if (mte_parent_slot(child) != i) { pr_err("Slot error at " PTR_FMT "[%u]: child " PTR_FMT " has pslot %u\n", mas_mn(mas), i, mte_to_node(child), mte_parent_slot(child)); MT_BUG_ON(mas->tree, 1); } if (mte_parent(child) != mte_to_node(mas->node)) { pr_err("child " PTR_FMT " has parent " PTR_FMT " not " PTR_FMT "\n", mte_to_node(child), mte_parent(child), mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); } if (i < mt_pivots[type] && pivots[i] == mas->max) break; } } /* * Validate all pivots are within mas->min and mas->max, check metadata ends * where the maximum ends and ensure there is no slots or pivots set outside of * the end of the data. */ static void mas_validate_limits(struct ma_state *mas) { int i; unsigned long prev_piv = 0; enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mas_mn(mas), type); for (i = 0; i < mt_slots[type]; i++) { unsigned long piv; piv = mas_safe_pivot(mas, pivots, i, type); if (!piv && (i != 0)) { pr_err("Missing node limit pivot at " PTR_FMT "[%u]", mas_mn(mas), i); MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { pr_err(PTR_FMT "[%u] piv %lu < prev_piv %lu\n", mas_mn(mas), i, piv, prev_piv); MAS_WARN_ON(mas, piv < prev_piv); } if (piv < mas->min) { pr_err(PTR_FMT "[%u] %lu < %lu\n", mas_mn(mas), i, piv, mas->min); MAS_WARN_ON(mas, piv < mas->min); } if (piv > mas->max) { pr_err(PTR_FMT "[%u] %lu > %lu\n", mas_mn(mas), i, piv, mas->max); MAS_WARN_ON(mas, piv > mas->max); } prev_piv = piv; if (piv == mas->max) break; } if (mas_data_end(mas) != i) { pr_err("node" PTR_FMT ": data_end %u != the last slot offset %u\n", mas_mn(mas), mas_data_end(mas), i); MT_BUG_ON(mas->tree, 1); } for (i += 1; i < mt_slots[type]; i++) { void *entry = mas_slot(mas, slots, i); if (entry && (i != mt_slots[type] - 1)) { pr_err(PTR_FMT "[%u] should not have entry " PTR_FMT "\n", mas_mn(mas), i, entry); MT_BUG_ON(mas->tree, entry != NULL); } if (i < mt_pivots[type]) { unsigned long piv = pivots[i]; if (!piv) continue; pr_err(PTR_FMT "[%u] should not have piv %lu\n", mas_mn(mas), i, piv); MAS_WARN_ON(mas, i < mt_pivots[type] - 1); } } } static void mt_validate_nulls(struct maple_tree *mt) { void *entry, *last = (void *)1; unsigned char offset = 0; void __rcu **slots; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (mas_is_none(&mas) || (mas_is_ptr(&mas))) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); do { entry = mas_slot(&mas, slots, offset); if (!last && !entry) { pr_err("Sequential nulls end at " PTR_FMT "[%u]\n", mas_mn(&mas), offset); } MT_BUG_ON(mt, !last && !entry); last = entry; if (offset == mas_data_end(&mas)) { mas_next_node(&mas, mas_mn(&mas), ULONG_MAX); if (mas_is_overflow(&mas)) return; offset = 0; slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); } else { offset++; } } while (!mas_is_overflow(&mas)); } /* * validate a maple tree by checking: * 1. The limits (pivots are within mas->min to mas->max) * 2. The gap is correctly set in the parents */ void mt_validate(struct maple_tree *mt) __must_hold(mas->tree->ma_lock) { unsigned char end; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (!mas_is_active(&mas)) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); while (!mas_is_overflow(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && (!mte_is_root(mas.node)))) { pr_err("Invalid size %u of " PTR_FMT "\n", end, mas_mn(&mas)); } mas_validate_parent_slot(&mas); mas_validate_limits(&mas); mas_validate_child_slot(&mas); if (mt_is_alloc(mt)) mas_validate_gaps(&mas); mas_dfs_postorder(&mas, ULONG_MAX); } mt_validate_nulls(mt); } EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { pr_err("MAS: tree=" PTR_FMT " enode=" PTR_FMT " ", mas->tree, mas->node); switch (mas->status) { case ma_active: pr_err("(ma_active)"); break; case ma_none: pr_err("(ma_none)"); break; case ma_root: pr_err("(ma_root)"); break; case ma_start: pr_err("(ma_start) "); break; case ma_pause: pr_err("(ma_pause) "); break; case ma_overflow: pr_err("(ma_overflow) "); break; case ma_underflow: pr_err("(ma_underflow) "); break; case ma_error: pr_err("(ma_error) "); break; } pr_err("Store Type: "); switch (mas->store_type) { case wr_invalid: pr_err("invalid store type\n"); break; case wr_new_root: pr_err("new_root\n"); break; case wr_store_root: pr_err("store_root\n"); break; case wr_exact_fit: pr_err("exact_fit\n"); break; case wr_split_store: pr_err("split_store\n"); break; case wr_slot_store: pr_err("slot_store\n"); break; case wr_append: pr_err("append\n"); break; case wr_node_store: pr_err("node_store\n"); break; case wr_spanning_store: pr_err("spanning_store\n"); break; case wr_rebalance: pr_err("rebalance\n"); break; } pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n", mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); } EXPORT_SYMBOL_GPL(mas_dump); void mas_wr_dump(const struct ma_wr_state *wr_mas) { pr_err("WR_MAS: node=" PTR_FMT " r_min=%lx r_max=%lx\n", wr_mas->node, wr_mas->r_min, wr_mas->r_max); pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", wr_mas->type, wr_mas->offset_end, wr_mas->mas->end, wr_mas->end_piv); } EXPORT_SYMBOL_GPL(mas_wr_dump); #endif /* CONFIG_DEBUG_MAPLE_TREE */ |
| 760 759 760 567 576 238 238 380 380 381 576 6 538 540 13 99 99 338 339 26 104 104 11 252 250 180 563 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 | #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H #include <linux/lockdep.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/rwsem.h> #include <linux/tracepoint-defs.h> #include <linux/types.h> #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), DECLARE_TRACEPOINT(mmap_lock_start_locking); DECLARE_TRACEPOINT(mmap_lock_acquire_returned); DECLARE_TRACEPOINT(mmap_lock_released); #ifdef CONFIG_TRACING void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write); void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, bool success); void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write); static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, bool write) { if (tracepoint_enabled(mmap_lock_start_locking)) __mmap_lock_do_trace_start_locking(mm, write); } static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { if (tracepoint_enabled(mmap_lock_acquire_returned)) __mmap_lock_do_trace_acquire_returned(mm, write, success); } static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) { if (tracepoint_enabled(mmap_lock_released)) __mmap_lock_do_trace_released(mm, write); } #else /* !CONFIG_TRACING */ static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, bool write) { } static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { } static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) { } #endif /* CONFIG_TRACING */ static inline void mmap_assert_locked(const struct mm_struct *mm) { rwsem_assert_held(&mm->mmap_lock); } static inline void mmap_assert_write_locked(const struct mm_struct *mm) { rwsem_assert_held_write(&mm->mmap_lock); } #ifdef CONFIG_PER_VMA_LOCK static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); } static inline void mm_lock_seqcount_begin(struct mm_struct *mm) { do_raw_write_seqcount_begin(&mm->mm_lock_seq); } static inline void mm_lock_seqcount_end(struct mm_struct *mm) { ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); do_raw_write_seqcount_end(&mm->mm_lock_seq); } static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) { /* * Since mmap_lock is a sleeping lock, and waiting for it to become * unlocked is more or less equivalent with taking it ourselves, don't * bother with the speculative path if mmap_lock is already write-locked * and take the slow path, which takes the lock. */ return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); } static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) { return read_seqcount_retry(&mm->mm_lock_seq, seq); } #else /* CONFIG_PER_VMA_LOCK */ static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) { return false; } static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) { return true; } #endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); mm_lock_seqcount_init(mm); } static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } static inline int mmap_write_lock_killable(struct mm_struct *mm) { int ret; __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); if (!ret) mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } /* * Drop all currently-held per-VMA locks. * This is called from the mmap_lock implementation directly before releasing * a write-locked mmap_lock (or downgrading it to read-locked). * This should normally NOT be called manually from other places. * If you want to call this manually anyway, keep in mind that this will release * *all* VMA write locks, including ones from further up the stack. */ static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); mm_lock_seqcount_end(mm); } static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); vma_end_write_all(mm); up_write(&mm->mmap_lock); } static inline void mmap_write_downgrade(struct mm_struct *mm) { __mmap_lock_trace_acquire_returned(mm, false, true); vma_end_write_all(mm); downgrade_write(&mm->mmap_lock); } static inline void mmap_read_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, false); down_read(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, true); } static inline int mmap_read_lock_killable(struct mm_struct *mm) { int ret; __mmap_lock_trace_start_locking(mm, false); ret = down_read_killable(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, ret == 0); return ret; } static inline bool mmap_read_trylock(struct mm_struct *mm) { bool ret; __mmap_lock_trace_start_locking(mm, false); ret = down_read_trylock(&mm->mmap_lock) != 0; __mmap_lock_trace_acquire_returned(mm, false, ret); return ret; } static inline void mmap_read_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); up_read(&mm->mmap_lock); } static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); up_read_non_owner(&mm->mmap_lock); } static inline int mmap_lock_is_contended(struct mm_struct *mm) { return rwsem_is_contended(&mm->mmap_lock); } #endif /* _LINUX_MMAP_LOCK_H */ |
| 146 164 165 165 164 172 172 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #include <linux/dcache.h> #include <linux/fs.h> #include <linux/gfp.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/srcu.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" /* * Clear all of the marks on an inode when it is being evicted from core */ void __fsnotify_inode_delete(struct inode *inode) { fsnotify_clear_marks_by_inode(inode); } EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); void __fsnotify_vfsmount_delete(struct vfsmount *mnt) { fsnotify_clear_marks_by_mount(mnt); } /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ static void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *iput_inode = NULL; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { spin_unlock(&inode->i_lock); continue; } /* * If i_count is zero, the inode cannot have any watches and * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. * However, we should have been called /after/ evict_inodes * removed all zero refcount inodes, in any case. Test to * be sure. */ if (!atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); /* for each watch, send FS_UNMOUNT and then remove it */ fsnotify_inode(inode, FS_UNMOUNT); fsnotify_inode_delete(inode); iput_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); } void fsnotify_sb_delete(struct super_block *sb) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); /* Were any marks ever added to any object on this sb? */ if (!sbinfo) return; fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ wait_var_event(fsnotify_sb_watched_objects(sb), !atomic_long_read(fsnotify_sb_watched_objects(sb))); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT)); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT)); } void fsnotify_sb_free(struct super_block *sb) { kfree(sb->s_fsnotify_info); } /* * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. */ void fsnotify_set_children_dentry_flags(struct inode *inode) { struct dentry *alias; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { struct dentry *child; /* run all of the children of the original inode and fix their * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); hlist_for_each_entry(child, &alias->d_children, d_sib) { if (!child->d_inode) continue; spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); } /* * Lazily clear false positive PARENT_WATCHED flag for child whose parent had * stopped watching children. */ static void fsnotify_clear_child_dentry_flag(struct inode *pinode, struct dentry *dentry) { spin_lock(&dentry->d_lock); /* * d_lock is a sufficient barrier to prevent observing a non-watched * parent state from before the fsnotify_set_children_dentry_flags() * or fsnotify_update_flags() call that had set PARENT_WATCHED. */ if (!fsnotify_inode_watches_children(pinode)) dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&dentry->d_lock); } /* Are inode/sb/mount interested in parent and name info with this event? */ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = 0; /* We only send parent/name to inode/sb/mount for events on non-dir */ if (mask & FS_ISDIR) return false; /* * All events that are possible on child can also may be reported with * parent/name info to inode/sb/mount. Otherwise, a watching parent * could result in events reported with unexpected name info to sb/mount. */ BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT); /* Did either inode/sb/mount subscribe for events with parent/name? */ marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_sb->s_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask(mnt_mask); /* Did they subscribe for this event with parent/name info? */ return mask & marks_mask; } /* Are there any inode/mount/sb objects that watch for these events? */ static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask | READ_ONCE(inode->i_sb->s_fsnotify_mask); return mask & marks_mask & ALL_FSNOTIFY_EVENTS; } /* Report pre-content event with optional range info */ int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count) { struct file_range range; /* Report page aligned range only when pos is known */ if (!ppos) return fsnotify_path(path, FS_PRE_ACCESS); range.path = path; range.pos = PAGE_ALIGN_DOWN(*ppos); range.count = PAGE_ALIGN(*ppos + count) - range.pos; return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range, FSNOTIFY_EVENT_FILE_RANGE); } /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with * parent and name info. * * Notify only the child without name info if parent is not watching and * inode/sb/mount are not interested in events with parent and name info. */ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { const struct path *path = fsnotify_data_path(data, data_type); __u32 mnt_mask = path ? READ_ONCE(real_mount(path->mnt)->mnt_fsnotify_mask) : 0; struct inode *inode = d_inode(dentry); struct dentry *parent; bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; bool parent_needed, parent_interested; __u32 p_mask; struct inode *p_inode = NULL; struct name_snapshot name; struct qstr *file_name = NULL; int ret = 0; /* Optimize the likely case of nobody watching this path */ if (likely(!parent_watched && !fsnotify_object_watched(inode, mnt_mask, mask))) return 0; parent = NULL; parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask); if (!parent_watched && !parent_needed) goto notify; /* Does parent inode care about events on children? */ parent = dget_parent(dentry); p_inode = parent->d_inode; p_mask = fsnotify_inode_watches_children(p_inode); if (unlikely(parent_watched && !p_mask)) fsnotify_clear_child_dentry_flag(p_inode, dentry); /* * Include parent/name in notification either if some notification * groups require parent info or the parent is interested in this event. */ parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS; if (parent_needed || parent_interested) { /* When notifying parent, child should be passed as data */ WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type)); /* Notify both parent and child with child name info */ take_dentry_name_snapshot(&name, dentry); file_name = &name.name; if (parent_interested) mask |= FS_EVENT_ON_CHILD; } notify: ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0); if (file_name) release_dentry_name_snapshot(&name); dput(parent); return ret; } EXPORT_SYMBOL_GPL(__fsnotify_parent); static int fsnotify_handle_inode_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct inode *inode = fsnotify_data_inode(data, data_type); const struct fsnotify_ops *ops = group->ops; if (WARN_ON_ONCE(!ops->handle_inode_event)) return 0; if (WARN_ON_ONCE(!inode && !dir)) return 0; if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; /* Check interest of this mark in case event was sent with two marks */ if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)) return 0; return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); } static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info); int ret; if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; /* * For FS_RENAME, 'dir' is old dir and 'data' is new dentry. * The only ->handle_inode_event() backend that supports FS_RENAME is * dnotify, where it means file was renamed within same parent. */ if (mask & FS_RENAME) { struct dentry *moved = fsnotify_data_dentry(data, data_type); if (dir != moved->d_parent->d_inode) return 0; } if (parent_mark) { ret = fsnotify_handle_inode_event(group, parent_mark, mask, data, data_type, dir, name, 0); if (ret) return ret; } if (!inode_mark) return 0; /* * Some events can be sent on both parent dir and child marks (e.g. * FS_ATTRIB). If both parent dir and child are watching, report the * event once to parent dir with name (if interested) and once to child * without name (if interested). * * In any case regardless whether the parent is watching or not, the * child watcher is expecting an event without the FS_EVENT_ON_CHILD * flag. The file name is expected if and only if this is a directory * event. */ mask &= ~FS_EVENT_ON_CHILD; if (!(mask & ALL_FSNOTIFY_DIRENT_EVENTS)) { dir = NULL; name = NULL; } return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, dir, name, cookie); } static int send_to_group(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; __u32 marks_ignore_mask = 0; bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; if (!iter_info->report_mask) return 0; /* clear ignored on inode modification */ if (mask & FS_MODIFY) { fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mark->ignore_mask = 0; } } /* Are any of the group marks interested in this event? */ fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; marks_ignore_mask |= fsnotify_effective_ignore_mask(mark, is_dir, type); } pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { return group->ops->handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } return fsnotify_handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp) { struct fsnotify_mark_connector *conn; struct hlist_node *node = NULL; conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (conn) node = srcu_dereference(conn->list.first, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) { struct hlist_node *node = NULL; if (mark) node = srcu_dereference(mark->obj_list.next, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } /* * iter_info is a multi head priority queue of marks. * Pick a subset of marks from queue heads, all with the same group * and set the report_mask to a subset of the selected marks. * Returns false if there are no more groups to iterate. */ static bool fsnotify_iter_select_report_types( struct fsnotify_iter_info *iter_info) { struct fsnotify_group *max_prio_group = NULL; struct fsnotify_mark *mark; int type; /* Choose max prio group among groups of all queue heads */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && fsnotify_compare_groups(max_prio_group, mark->group) > 0) max_prio_group = mark->group; } if (!max_prio_group) return false; /* Set the report mask for marks from same group as max prio group */ iter_info->current_group = max_prio_group; iter_info->report_mask = 0; fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) { /* * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode * is watching children and interested in this event, * which is an event possible on child. * But is *this mark* watching children? */ if (type == FSNOTIFY_ITER_TYPE_PARENT && !(mark->mask & FS_EVENT_ON_CHILD) && !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); } } return true; } /* * Pop from iter_info multi head queue, the marks that belong to the group of * current iteration step. */ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *mark; int type; /* * We cannot use fsnotify_foreach_iter_mark_type() here because we * may need to advance a mark of type X that belongs to current_group * but was not selected for reporting. */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) iter_info->marks[type] = fsnotify_next_mark(iter_info->marks[type]); } } /* * fsnotify - This is the main call to fsnotify. * * The VFS calls into hook specific functions in linux/fsnotify.h. * Those functions then in turn call here. Here will call out to all of the * registered fsnotify_group. Those groups can then use the notification event * in whatever means they feel necessary. * * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @inode: optional inode associated with event - * If @dir and @inode are both non-NULL, event may be * reported to both. * @cookie: inotify rename cookie */ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, struct inode *inode, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; struct dentry *moved; int inode2_type; int ret = 0; __u32 test_mask, marks_mask; if (path) mnt = real_mount(path->mnt); if (!inode) { /* Dirent event - report on TYPE_INODE to dir */ inode = dir; /* For FS_RENAME, inode is old_dir and inode2 is new_dir */ if (mask & FS_RENAME) { moved = fsnotify_data_dentry(data, data_type); inode2 = moved->d_parent->d_inode; inode2_type = FSNOTIFY_ITER_TYPE_INODE2; } } else if (mask & FS_EVENT_ON_CHILD) { /* * Event on child - report on TYPE_PARENT to dir if it is * watching children and on TYPE_INODE to child. */ inode2 = dir; inode2_type = FSNOTIFY_ITER_TYPE_PARENT; } /* * Optimization: srcu_read_lock() has a memory barrier which can * be expensive. It protects walking the *_fsnotify_marks lists. * However, if we do not walk the lists, we do not have to do * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!inode2 || !inode2->i_fsnotify_marks)) return 0; marks_mask = READ_ONCE(sb->s_fsnotify_mask); if (mnt) marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask); if (inode) marks_mask |= READ_ONCE(inode->i_fsnotify_mask); if (inode2) marks_mask |= READ_ONCE(inode2->i_fsnotify_mask); /* * If this is a modify event we may need to clear some ignore masks. * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. */ test_mask = (mask & ALL_FSNOTIFY_EVENTS); if (!(test_mask & marks_mask)) return 0; iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); if (sbinfo) { iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = fsnotify_first_mark(&sbinfo->sb_marks); } if (mnt) { iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); } if (inode) { iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] = fsnotify_first_mark(&inode->i_fsnotify_marks); } if (inode2) { iter_info.marks[inode2_type] = fsnotify_first_mark(&inode2->i_fsnotify_marks); } /* * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark * ignore masks are properly reflected for mount/sb mark notifications. * That's why this traversal is so complicated... */ while (fsnotify_iter_select_report_types(&iter_info)) { ret = send_to_group(mask, data, data_type, dir, file_name, cookie, &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; fsnotify_iter_next(&iter_info); } ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx); return ret; } EXPORT_SYMBOL_GPL(fsnotify); #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * At open time we check fsnotify_sb_has_priority_watchers() and set the * FMODE_NONOTIFY_ mode bits accordignly. * Later, fsnotify permission hooks do not check if there are permission event * watches, but that there were permission event watches at open time. */ void file_set_fsnotify_mode_from_watchers(struct file *file) { struct dentry *dentry = file->f_path.dentry, *parent; struct super_block *sb = dentry->d_sb; __u32 mnt_mask, p_mask; /* Is it a file opened by fanotify? */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) return; /* * Permission events is a super set of pre-content events, so if there * are no permission event watchers, there are also no pre-content event * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit. */ if (likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT))) { file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return; } /* * If there are permission event watchers but no pre-content event * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. */ if ((!d_is_dir(dentry) && !d_is_reg(dentry)) || likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT))) { file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); return; } /* * OK, there are some pre-content watchers. Check if anybody is * watching for pre-content events on *this* file. */ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask, FSNOTIFY_PRE_CONTENT_EVENTS))) { /* Enable pre-content events */ file_set_fsnotify_mode(file, 0); return; } /* Is parent watching for pre-content events on this file? */ if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { parent = dget_parent(dentry); p_mask = fsnotify_inode_watches_children(d_inode(parent)); dput(parent); if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) { /* Enable pre-content events */ file_set_fsnotify_mode(file, 0); return; } } /* Nobody watching for pre-content events from this file */ file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); } #endif static __init int fsnotify_init(void) { int ret; BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) panic("initializing fsnotify_mark_srcu"); fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, SLAB_PANIC); return 0; } core_initcall(fsnotify_init); |
| 1200 598 432 362 703 402 1200 562 362 650 25 485 7 156 420 413 156 156 245 633 211 151 194 194 2 147 691 1206 230 171 1215 263 253 230 147 1152 214 1050 580 525 429 243 115 208 93 113 112 208 25 25 25 187 1032 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 | // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-fallback.sh // DO NOT MODIFY THIS FILE DIRECTLY #ifndef _LINUX_ATOMIC_FALLBACK_H #define _LINUX_ATOMIC_FALLBACK_H #include <linux/compiler.h> #if defined(arch_xchg) #define raw_xchg arch_xchg #elif defined(arch_xchg_relaxed) #define raw_xchg(...) \ __atomic_op_fence(arch_xchg, __VA_ARGS__) #else extern void raw_xchg_not_implemented(void); #define raw_xchg(...) raw_xchg_not_implemented() #endif #if defined(arch_xchg_acquire) #define raw_xchg_acquire arch_xchg_acquire #elif defined(arch_xchg_relaxed) #define raw_xchg_acquire(...) \ __atomic_op_acquire(arch_xchg, __VA_ARGS__) #elif defined(arch_xchg) #define raw_xchg_acquire arch_xchg #else extern void raw_xchg_acquire_not_implemented(void); #define raw_xchg_acquire(...) raw_xchg_acquire_not_implemented() #endif #if defined(arch_xchg_release) #define raw_xchg_release arch_xchg_release #elif defined(arch_xchg_relaxed) #define raw_xchg_release(...) \ __atomic_op_release(arch_xchg, __VA_ARGS__) #elif defined(arch_xchg) #define raw_xchg_release arch_xchg #else extern void raw_xchg_release_not_implemented(void); #define raw_xchg_release(...) raw_xchg_release_not_implemented() #endif #if defined(arch_xchg_relaxed) #define raw_xchg_relaxed arch_xchg_relaxed #elif defined(arch_xchg) #define raw_xchg_relaxed arch_xchg #else extern void raw_xchg_relaxed_not_implemented(void); #define raw_xchg_relaxed(...) raw_xchg_relaxed_not_implemented() #endif #if defined(arch_cmpxchg) #define raw_cmpxchg arch_cmpxchg #elif defined(arch_cmpxchg_relaxed) #define raw_cmpxchg(...) \ __atomic_op_fence(arch_cmpxchg, __VA_ARGS__) #else extern void raw_cmpxchg_not_implemented(void); #define raw_cmpxchg(...) raw_cmpxchg_not_implemented() #endif #if defined(arch_cmpxchg_acquire) #define raw_cmpxchg_acquire arch_cmpxchg_acquire #elif defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_acquire(...) \ __atomic_op_acquire(arch_cmpxchg, __VA_ARGS__) #elif defined(arch_cmpxchg) #define raw_cmpxchg_acquire arch_cmpxchg #else extern void raw_cmpxchg_acquire_not_implemented(void); #define raw_cmpxchg_acquire(...) raw_cmpxchg_acquire_not_implemented() #endif #if defined(arch_cmpxchg_release) #define raw_cmpxchg_release arch_cmpxchg_release #elif defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_release(...) \ __atomic_op_release(arch_cmpxchg, __VA_ARGS__) #elif defined(arch_cmpxchg) #define raw_cmpxchg_release arch_cmpxchg #else extern void raw_cmpxchg_release_not_implemented(void); #define raw_cmpxchg_release(...) raw_cmpxchg_release_not_implemented() #endif #if defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_relaxed arch_cmpxchg_relaxed #elif defined(arch_cmpxchg) #define raw_cmpxchg_relaxed arch_cmpxchg #else extern void raw_cmpxchg_relaxed_not_implemented(void); #define raw_cmpxchg_relaxed(...) raw_cmpxchg_relaxed_not_implemented() #endif #if defined(arch_cmpxchg64) #define raw_cmpxchg64 arch_cmpxchg64 #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64(...) \ __atomic_op_fence(arch_cmpxchg64, __VA_ARGS__) #else extern void raw_cmpxchg64_not_implemented(void); #define raw_cmpxchg64(...) raw_cmpxchg64_not_implemented() #endif #if defined(arch_cmpxchg64_acquire) #define raw_cmpxchg64_acquire arch_cmpxchg64_acquire #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_acquire(...) \ __atomic_op_acquire(arch_cmpxchg64, __VA_ARGS__) #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_acquire arch_cmpxchg64 #else extern void raw_cmpxchg64_acquire_not_implemented(void); #define raw_cmpxchg64_acquire(...) raw_cmpxchg64_acquire_not_implemented() #endif #if defined(arch_cmpxchg64_release) #define raw_cmpxchg64_release arch_cmpxchg64_release #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_release(...) \ __atomic_op_release(arch_cmpxchg64, __VA_ARGS__) #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_release arch_cmpxchg64 #else extern void raw_cmpxchg64_release_not_implemented(void); #define raw_cmpxchg64_release(...) raw_cmpxchg64_release_not_implemented() #endif #if defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_relaxed arch_cmpxchg64_relaxed #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_relaxed arch_cmpxchg64 #else extern void raw_cmpxchg64_relaxed_not_implemented(void); #define raw_cmpxchg64_relaxed(...) raw_cmpxchg64_relaxed_not_implemented() #endif #if defined(arch_cmpxchg128) #define raw_cmpxchg128 arch_cmpxchg128 #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128(...) \ __atomic_op_fence(arch_cmpxchg128, __VA_ARGS__) #else extern void raw_cmpxchg128_not_implemented(void); #define raw_cmpxchg128(...) raw_cmpxchg128_not_implemented() #endif #if defined(arch_cmpxchg128_acquire) #define raw_cmpxchg128_acquire arch_cmpxchg128_acquire #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_acquire(...) \ __atomic_op_acquire(arch_cmpxchg128, __VA_ARGS__) #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_acquire arch_cmpxchg128 #else extern void raw_cmpxchg128_acquire_not_implemented(void); #define raw_cmpxchg128_acquire(...) raw_cmpxchg128_acquire_not_implemented() #endif #if defined(arch_cmpxchg128_release) #define raw_cmpxchg128_release arch_cmpxchg128_release #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_release(...) \ __atomic_op_release(arch_cmpxchg128, __VA_ARGS__) #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_release arch_cmpxchg128 #else extern void raw_cmpxchg128_release_not_implemented(void); #define raw_cmpxchg128_release(...) raw_cmpxchg128_release_not_implemented() #endif #if defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_relaxed arch_cmpxchg128_relaxed #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_relaxed arch_cmpxchg128 #else extern void raw_cmpxchg128_relaxed_not_implemented(void); #define raw_cmpxchg128_relaxed(...) raw_cmpxchg128_relaxed_not_implemented() #endif #if defined(arch_try_cmpxchg) #define raw_try_cmpxchg arch_try_cmpxchg #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg(...) \ __atomic_op_fence(arch_try_cmpxchg, __VA_ARGS__) #else #define raw_try_cmpxchg(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_acquire) #define raw_try_cmpxchg_acquire arch_try_cmpxchg_acquire #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg, __VA_ARGS__) #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_acquire arch_try_cmpxchg #else #define raw_try_cmpxchg_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_release) #define raw_try_cmpxchg_release arch_try_cmpxchg_release #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_release(...) \ __atomic_op_release(arch_try_cmpxchg, __VA_ARGS__) #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_release arch_try_cmpxchg #else #define raw_try_cmpxchg_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_relaxed arch_try_cmpxchg_relaxed #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_relaxed arch_try_cmpxchg #else #define raw_try_cmpxchg_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64 arch_try_cmpxchg64 #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64(...) \ __atomic_op_fence(arch_try_cmpxchg64, __VA_ARGS__) #else #define raw_try_cmpxchg64(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_acquire) #define raw_try_cmpxchg64_acquire arch_try_cmpxchg64_acquire #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg64, __VA_ARGS__) #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_acquire arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_release) #define raw_try_cmpxchg64_release arch_try_cmpxchg64_release #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_release(...) \ __atomic_op_release(arch_try_cmpxchg64, __VA_ARGS__) #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_release arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64_relaxed #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128 arch_try_cmpxchg128 #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128(...) \ __atomic_op_fence(arch_try_cmpxchg128, __VA_ARGS__) #else #define raw_try_cmpxchg128(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_acquire) #define raw_try_cmpxchg128_acquire arch_try_cmpxchg128_acquire #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg128, __VA_ARGS__) #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_acquire arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_release) #define raw_try_cmpxchg128_release arch_try_cmpxchg128_release #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_release(...) \ __atomic_op_release(arch_try_cmpxchg128, __VA_ARGS__) #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_release arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128_relaxed #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg_local arch_cmpxchg_local #ifdef arch_try_cmpxchg_local #define raw_try_cmpxchg_local arch_try_cmpxchg_local #else #define raw_try_cmpxchg_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg64_local arch_cmpxchg64_local #ifdef arch_try_cmpxchg64_local #define raw_try_cmpxchg64_local arch_try_cmpxchg64_local #else #define raw_try_cmpxchg64_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg128_local arch_cmpxchg128_local #ifdef arch_try_cmpxchg128_local #define raw_try_cmpxchg128_local arch_try_cmpxchg128_local #else #define raw_try_cmpxchg128_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_sync_cmpxchg arch_sync_cmpxchg #ifdef arch_sync_try_cmpxchg #define raw_sync_try_cmpxchg arch_sync_try_cmpxchg #else #define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif /** * raw_atomic_read() - atomic load with relaxed ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline int raw_atomic_read(const atomic_t *v) { return arch_atomic_read(v); } /** * raw_atomic_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline int raw_atomic_read_acquire(const atomic_t *v) { #if defined(arch_atomic_read_acquire) return arch_atomic_read_acquire(v); #else int ret; if (__native_word(atomic_t)) { ret = smp_load_acquire(&(v)->counter); } else { ret = raw_atomic_read(v); __atomic_acquire_fence(); } return ret; #endif } /** * raw_atomic_set() - atomic set with relaxed ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_set(atomic_t *v, int i) { arch_atomic_set(v, i); } /** * raw_atomic_set_release() - atomic set with release ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_set_release(atomic_t *v, int i) { #if defined(arch_atomic_set_release) arch_atomic_set_release(v, i); #else if (__native_word(atomic_t)) { smp_store_release(&(v)->counter, i); } else { __atomic_release_fence(); raw_atomic_set(v, i); } #endif } /** * raw_atomic_add() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_add(int i, atomic_t *v) { arch_atomic_add(i, v); } /** * raw_atomic_add_return() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return(int i, atomic_t *v) { #if defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #elif defined(arch_atomic_add_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_add_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_add_return" #endif } /** * raw_atomic_add_return_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_acquire(int i, atomic_t *v) { #if defined(arch_atomic_add_return_acquire) return arch_atomic_add_return_acquire(i, v); #elif defined(arch_atomic_add_return_relaxed) int ret = arch_atomic_add_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_acquire" #endif } /** * raw_atomic_add_return_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_release(int i, atomic_t *v) { #if defined(arch_atomic_add_return_release) return arch_atomic_add_return_release(i, v); #elif defined(arch_atomic_add_return_relaxed) __atomic_release_fence(); return arch_atomic_add_return_relaxed(i, v); #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_release" #endif } /** * raw_atomic_add_return_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_add_return_relaxed) return arch_atomic_add_return_relaxed(i, v); #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_relaxed" #endif } /** * raw_atomic_fetch_add() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #elif defined(arch_atomic_fetch_add_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_add_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_add" #endif } /** * raw_atomic_fetch_add_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_acquire) return arch_atomic_fetch_add_acquire(i, v); #elif defined(arch_atomic_fetch_add_relaxed) int ret = arch_atomic_fetch_add_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_acquire" #endif } /** * raw_atomic_fetch_add_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_release) return arch_atomic_fetch_add_release(i, v); #elif defined(arch_atomic_fetch_add_relaxed) __atomic_release_fence(); return arch_atomic_fetch_add_relaxed(i, v); #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_release" #endif } /** * raw_atomic_fetch_add_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_relaxed) return arch_atomic_fetch_add_relaxed(i, v); #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_relaxed" #endif } /** * raw_atomic_sub() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_sub(int i, atomic_t *v) { arch_atomic_sub(i, v); } /** * raw_atomic_sub_return() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return(int i, atomic_t *v) { #if defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #elif defined(arch_atomic_sub_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_sub_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_sub_return" #endif } /** * raw_atomic_sub_return_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_acquire(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_acquire) return arch_atomic_sub_return_acquire(i, v); #elif defined(arch_atomic_sub_return_relaxed) int ret = arch_atomic_sub_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_acquire" #endif } /** * raw_atomic_sub_return_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_release(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_release) return arch_atomic_sub_return_release(i, v); #elif defined(arch_atomic_sub_return_relaxed) __atomic_release_fence(); return arch_atomic_sub_return_relaxed(i, v); #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_release" #endif } /** * raw_atomic_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_relaxed) return arch_atomic_sub_return_relaxed(i, v); #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_relaxed" #endif } /** * raw_atomic_fetch_sub() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_sub_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_sub" #endif } /** * raw_atomic_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_acquire) return arch_atomic_fetch_sub_acquire(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) int ret = arch_atomic_fetch_sub_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_acquire" #endif } /** * raw_atomic_fetch_sub_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_release) return arch_atomic_fetch_sub_release(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) __atomic_release_fence(); return arch_atomic_fetch_sub_relaxed(i, v); #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_release" #endif } /** * raw_atomic_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_relaxed) return arch_atomic_fetch_sub_relaxed(i, v); #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_relaxed" #endif } /** * raw_atomic_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_inc(atomic_t *v) { #if defined(arch_atomic_inc) arch_atomic_inc(v); #else raw_atomic_add(1, v); #endif } /** * raw_atomic_inc_return() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return(atomic_t *v) { #if defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #elif defined(arch_atomic_inc_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_inc_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_add_return(1, v); #endif } /** * raw_atomic_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_acquire(atomic_t *v) { #if defined(arch_atomic_inc_return_acquire) return arch_atomic_inc_return_acquire(v); #elif defined(arch_atomic_inc_return_relaxed) int ret = arch_atomic_inc_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_acquire(1, v); #endif } /** * raw_atomic_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_release(atomic_t *v) { #if defined(arch_atomic_inc_return_release) return arch_atomic_inc_return_release(v); #elif defined(arch_atomic_inc_return_relaxed) __atomic_release_fence(); return arch_atomic_inc_return_relaxed(v); #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_release(1, v); #endif } /** * raw_atomic_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_relaxed(atomic_t *v) { #if defined(arch_atomic_inc_return_relaxed) return arch_atomic_inc_return_relaxed(v); #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_relaxed(1, v); #endif } /** * raw_atomic_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc(atomic_t *v) { #if defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #elif defined(arch_atomic_fetch_inc_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_inc_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_add(1, v); #endif } /** * raw_atomic_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_acquire(atomic_t *v) { #if defined(arch_atomic_fetch_inc_acquire) return arch_atomic_fetch_inc_acquire(v); #elif defined(arch_atomic_fetch_inc_relaxed) int ret = arch_atomic_fetch_inc_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_acquire(1, v); #endif } /** * raw_atomic_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_release(atomic_t *v) { #if defined(arch_atomic_fetch_inc_release) return arch_atomic_fetch_inc_release(v); #elif defined(arch_atomic_fetch_inc_relaxed) __atomic_release_fence(); return arch_atomic_fetch_inc_relaxed(v); #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_release(1, v); #endif } /** * raw_atomic_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_relaxed(atomic_t *v) { #if defined(arch_atomic_fetch_inc_relaxed) return arch_atomic_fetch_inc_relaxed(v); #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_relaxed(1, v); #endif } /** * raw_atomic_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_dec(atomic_t *v) { #if defined(arch_atomic_dec) arch_atomic_dec(v); #else raw_atomic_sub(1, v); #endif } /** * raw_atomic_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return(atomic_t *v) { #if defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #elif defined(arch_atomic_dec_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_dec_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_sub_return(1, v); #endif } /** * raw_atomic_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_acquire(atomic_t *v) { #if defined(arch_atomic_dec_return_acquire) return arch_atomic_dec_return_acquire(v); #elif defined(arch_atomic_dec_return_relaxed) int ret = arch_atomic_dec_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_acquire(1, v); #endif } /** * raw_atomic_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_release(atomic_t *v) { #if defined(arch_atomic_dec_return_release) return arch_atomic_dec_return_release(v); #elif defined(arch_atomic_dec_return_relaxed) __atomic_release_fence(); return arch_atomic_dec_return_relaxed(v); #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_release(1, v); #endif } /** * raw_atomic_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_relaxed(atomic_t *v) { #if defined(arch_atomic_dec_return_relaxed) return arch_atomic_dec_return_relaxed(v); #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_relaxed(1, v); #endif } /** * raw_atomic_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec(atomic_t *v) { #if defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #elif defined(arch_atomic_fetch_dec_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_dec_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_sub(1, v); #endif } /** * raw_atomic_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_acquire(atomic_t *v) { #if defined(arch_atomic_fetch_dec_acquire) return arch_atomic_fetch_dec_acquire(v); #elif defined(arch_atomic_fetch_dec_relaxed) int ret = arch_atomic_fetch_dec_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_acquire(1, v); #endif } /** * raw_atomic_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_release(atomic_t *v) { #if defined(arch_atomic_fetch_dec_release) return arch_atomic_fetch_dec_release(v); #elif defined(arch_atomic_fetch_dec_relaxed) __atomic_release_fence(); return arch_atomic_fetch_dec_relaxed(v); #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_release(1, v); #endif } /** * raw_atomic_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_relaxed(atomic_t *v) { #if defined(arch_atomic_fetch_dec_relaxed) return arch_atomic_fetch_dec_relaxed(v); #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_relaxed(1, v); #endif } /** * raw_atomic_and() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_and(int i, atomic_t *v) { arch_atomic_and(i, v); } /** * raw_atomic_fetch_and() - atomic bitwise AND with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #elif defined(arch_atomic_fetch_and_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_and_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_and" #endif } /** * raw_atomic_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_acquire) return arch_atomic_fetch_and_acquire(i, v); #elif defined(arch_atomic_fetch_and_relaxed) int ret = arch_atomic_fetch_and_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_acquire" #endif } /** * raw_atomic_fetch_and_release() - atomic bitwise AND with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_release) return arch_atomic_fetch_and_release(i, v); #elif defined(arch_atomic_fetch_and_relaxed) __atomic_release_fence(); return arch_atomic_fetch_and_relaxed(i, v); #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_release" #endif } /** * raw_atomic_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_relaxed) return arch_atomic_fetch_and_relaxed(i, v); #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_relaxed" #endif } /** * raw_atomic_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_andnot(int i, atomic_t *v) { #if defined(arch_atomic_andnot) arch_atomic_andnot(i, v); #else raw_atomic_and(~i, v); #endif } /** * raw_atomic_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_andnot_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_and(~i, v); #endif } /** * raw_atomic_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_acquire) return arch_atomic_fetch_andnot_acquire(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) int ret = arch_atomic_fetch_andnot_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_acquire(~i, v); #endif } /** * raw_atomic_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_release) return arch_atomic_fetch_andnot_release(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) __atomic_release_fence(); return arch_atomic_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_release(~i, v); #endif } /** * raw_atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_relaxed) return arch_atomic_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_relaxed(~i, v); #endif } /** * raw_atomic_or() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_or(int i, atomic_t *v) { arch_atomic_or(i, v); } /** * raw_atomic_fetch_or() - atomic bitwise OR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #elif defined(arch_atomic_fetch_or_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_or_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_or" #endif } /** * raw_atomic_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_acquire) return arch_atomic_fetch_or_acquire(i, v); #elif defined(arch_atomic_fetch_or_relaxed) int ret = arch_atomic_fetch_or_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_acquire" #endif } /** * raw_atomic_fetch_or_release() - atomic bitwise OR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_release) return arch_atomic_fetch_or_release(i, v); #elif defined(arch_atomic_fetch_or_relaxed) __atomic_release_fence(); return arch_atomic_fetch_or_relaxed(i, v); #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_release" #endif } /** * raw_atomic_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_relaxed) return arch_atomic_fetch_or_relaxed(i, v); #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_relaxed" #endif } /** * raw_atomic_xor() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_xor(int i, atomic_t *v) { arch_atomic_xor(i, v); } /** * raw_atomic_fetch_xor() - atomic bitwise XOR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_xor_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_xor" #endif } /** * raw_atomic_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_acquire) return arch_atomic_fetch_xor_acquire(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) int ret = arch_atomic_fetch_xor_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_acquire" #endif } /** * raw_atomic_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_release) return arch_atomic_fetch_xor_release(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) __atomic_release_fence(); return arch_atomic_fetch_xor_relaxed(i, v); #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_release" #endif } /** * raw_atomic_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_relaxed) return arch_atomic_fetch_xor_relaxed(i, v); #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_relaxed" #endif } /** * raw_atomic_xchg() - atomic exchange with full ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg(atomic_t *v, int new) { #if defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #elif defined(arch_atomic_xchg_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_xchg_relaxed(v, new); __atomic_post_full_fence(); return ret; #else return raw_xchg(&v->counter, new); #endif } /** * raw_atomic_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_acquire(atomic_t *v, int new) { #if defined(arch_atomic_xchg_acquire) return arch_atomic_xchg_acquire(v, new); #elif defined(arch_atomic_xchg_relaxed) int ret = arch_atomic_xchg_relaxed(v, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_acquire(&v->counter, new); #endif } /** * raw_atomic_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_release(atomic_t *v, int new) { #if defined(arch_atomic_xchg_release) return arch_atomic_xchg_release(v, new); #elif defined(arch_atomic_xchg_relaxed) __atomic_release_fence(); return arch_atomic_xchg_relaxed(v, new); #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_release(&v->counter, new); #endif } /** * raw_atomic_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_relaxed(atomic_t *v, int new) { #if defined(arch_atomic_xchg_relaxed) return arch_atomic_xchg_relaxed(v, new); #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_relaxed(&v->counter, new); #endif } /** * raw_atomic_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else return raw_cmpxchg(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_acquire) return arch_atomic_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) int ret = arch_atomic_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_acquire(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_release(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_release) return arch_atomic_cmpxchg_release(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_release(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_relaxed) return arch_atomic_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_relaxed(&v->counter, old, new); #endif } /** * raw_atomic_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic_try_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else int r, o = *old; r = raw_atomic_cmpxchg(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_acquire) return arch_atomic_try_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) bool ret = arch_atomic_try_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_acquire(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_release) return arch_atomic_try_cmpxchg_release(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_release(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_relaxed) return arch_atomic_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_relaxed(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_sub_and_test(int i, atomic_t *v) { #if defined(arch_atomic_sub_and_test) return arch_atomic_sub_and_test(i, v); #else return raw_atomic_sub_return(i, v) == 0; #endif } /** * raw_atomic_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_dec_and_test(atomic_t *v) { #if defined(arch_atomic_dec_and_test) return arch_atomic_dec_and_test(v); #else return raw_atomic_dec_return(v) == 0; #endif } /** * raw_atomic_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_inc_and_test(atomic_t *v) { #if defined(arch_atomic_inc_and_test) return arch_atomic_inc_and_test(v); #else return raw_atomic_inc_return(v) == 0; #endif } /** * raw_atomic_add_negative() - atomic add and test if negative with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative(int i, atomic_t *v) { #if defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #elif defined(arch_atomic_add_negative_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic_add_negative_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic_add_return(i, v) < 0; #endif } /** * raw_atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_acquire(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_acquire) return arch_atomic_add_negative_acquire(i, v); #elif defined(arch_atomic_add_negative_relaxed) bool ret = arch_atomic_add_negative_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_acquire(i, v) < 0; #endif } /** * raw_atomic_add_negative_release() - atomic add and test if negative with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_release(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_release) return arch_atomic_add_negative_release(i, v); #elif defined(arch_atomic_add_negative_relaxed) __atomic_release_fence(); return arch_atomic_add_negative_relaxed(i, v); #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_release(i, v) < 0; #endif } /** * raw_atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_relaxed) return arch_atomic_add_negative_relaxed(i, v); #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_relaxed(i, v) < 0; #endif } /** * raw_atomic_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_unless(atomic_t *v, int a, int u) { #if defined(arch_atomic_fetch_add_unless) return arch_atomic_fetch_add_unless(v, a, u); #else int c = raw_atomic_read(v); do { if (unlikely(c == u)) break; } while (!raw_atomic_try_cmpxchg(v, &c, c + a)); return c; #endif } /** * raw_atomic_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_add_unless(atomic_t *v, int a, int u) { #if defined(arch_atomic_add_unless) return arch_atomic_add_unless(v, a, u); #else return raw_atomic_fetch_add_unless(v, a, u) != u; #endif } /** * raw_atomic_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_inc_not_zero(atomic_t *v) { #if defined(arch_atomic_inc_not_zero) return arch_atomic_inc_not_zero(v); #else return raw_atomic_add_unless(v, 1, 0); #endif } /** * raw_atomic_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_inc_unless_negative(atomic_t *v) { #if defined(arch_atomic_inc_unless_negative) return arch_atomic_inc_unless_negative(v); #else int c = raw_atomic_read(v); do { if (unlikely(c < 0)) return false; } while (!raw_atomic_try_cmpxchg(v, &c, c + 1)); return true; #endif } /** * raw_atomic_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_dec_unless_positive(atomic_t *v) { #if defined(arch_atomic_dec_unless_positive) return arch_atomic_dec_unless_positive(v); #else int c = raw_atomic_read(v); do { if (unlikely(c > 0)) return false; } while (!raw_atomic_try_cmpxchg(v, &c, c - 1)); return true; #endif } /** * raw_atomic_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline int raw_atomic_dec_if_positive(atomic_t *v) { #if defined(arch_atomic_dec_if_positive) return arch_atomic_dec_if_positive(v); #else int dec, c = raw_atomic_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; } while (!raw_atomic_try_cmpxchg(v, &c, dec)); return dec; #endif } #ifdef CONFIG_GENERIC_ATOMIC64 #include <asm-generic/atomic64.h> #endif /** * raw_atomic64_read() - atomic load with relaxed ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline s64 raw_atomic64_read(const atomic64_t *v) { return arch_atomic64_read(v); } /** * raw_atomic64_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline s64 raw_atomic64_read_acquire(const atomic64_t *v) { #if defined(arch_atomic64_read_acquire) return arch_atomic64_read_acquire(v); #else s64 ret; if (__native_word(atomic64_t)) { ret = smp_load_acquire(&(v)->counter); } else { ret = raw_atomic64_read(v); __atomic_acquire_fence(); } return ret; #endif } /** * raw_atomic64_set() - atomic set with relaxed ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_set(atomic64_t *v, s64 i) { arch_atomic64_set(v, i); } /** * raw_atomic64_set_release() - atomic set with release ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic64_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_set_release(atomic64_t *v, s64 i) { #if defined(arch_atomic64_set_release) arch_atomic64_set_release(v, i); #else if (__native_word(atomic64_t)) { smp_store_release(&(v)->counter, i); } else { __atomic_release_fence(); raw_atomic64_set(v, i); } #endif } /** * raw_atomic64_add() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_add(s64 i, atomic64_t *v) { arch_atomic64_add(i, v); } /** * raw_atomic64_add_return() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #elif defined(arch_atomic64_add_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_add_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_add_return" #endif } /** * raw_atomic64_add_return_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_acquire) return arch_atomic64_add_return_acquire(i, v); #elif defined(arch_atomic64_add_return_relaxed) s64 ret = arch_atomic64_add_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_acquire" #endif } /** * raw_atomic64_add_return_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_release) return arch_atomic64_add_return_release(i, v); #elif defined(arch_atomic64_add_return_relaxed) __atomic_release_fence(); return arch_atomic64_add_return_relaxed(i, v); #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_release" #endif } /** * raw_atomic64_add_return_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_relaxed) return arch_atomic64_add_return_relaxed(i, v); #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_relaxed" #endif } /** * raw_atomic64_fetch_add() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_add_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_add" #endif } /** * raw_atomic64_fetch_add_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_acquire) return arch_atomic64_fetch_add_acquire(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) s64 ret = arch_atomic64_fetch_add_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_acquire" #endif } /** * raw_atomic64_fetch_add_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_release) return arch_atomic64_fetch_add_release(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_add_relaxed(i, v); #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_release" #endif } /** * raw_atomic64_fetch_add_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_relaxed) return arch_atomic64_fetch_add_relaxed(i, v); #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_relaxed" #endif } /** * raw_atomic64_sub() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_sub(s64 i, atomic64_t *v) { arch_atomic64_sub(i, v); } /** * raw_atomic64_sub_return() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #elif defined(arch_atomic64_sub_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_sub_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_sub_return" #endif } /** * raw_atomic64_sub_return_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_acquire) return arch_atomic64_sub_return_acquire(i, v); #elif defined(arch_atomic64_sub_return_relaxed) s64 ret = arch_atomic64_sub_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_acquire" #endif } /** * raw_atomic64_sub_return_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_release) return arch_atomic64_sub_return_release(i, v); #elif defined(arch_atomic64_sub_return_relaxed) __atomic_release_fence(); return arch_atomic64_sub_return_relaxed(i, v); #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_release" #endif } /** * raw_atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_relaxed) return arch_atomic64_sub_return_relaxed(i, v); #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_relaxed" #endif } /** * raw_atomic64_fetch_sub() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_sub_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_sub" #endif } /** * raw_atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_acquire) return arch_atomic64_fetch_sub_acquire(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) s64 ret = arch_atomic64_fetch_sub_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_acquire" #endif } /** * raw_atomic64_fetch_sub_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_release) return arch_atomic64_fetch_sub_release(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_sub_relaxed(i, v); #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_release" #endif } /** * raw_atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_relaxed) return arch_atomic64_fetch_sub_relaxed(i, v); #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_relaxed" #endif } /** * raw_atomic64_inc() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_inc(atomic64_t *v) { #if defined(arch_atomic64_inc) arch_atomic64_inc(v); #else raw_atomic64_add(1, v); #endif } /** * raw_atomic64_inc_return() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return(atomic64_t *v) { #if defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #elif defined(arch_atomic64_inc_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_inc_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_add_return(1, v); #endif } /** * raw_atomic64_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_acquire(atomic64_t *v) { #if defined(arch_atomic64_inc_return_acquire) return arch_atomic64_inc_return_acquire(v); #elif defined(arch_atomic64_inc_return_relaxed) s64 ret = arch_atomic64_inc_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_acquire(1, v); #endif } /** * raw_atomic64_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_release(atomic64_t *v) { #if defined(arch_atomic64_inc_return_release) return arch_atomic64_inc_return_release(v); #elif defined(arch_atomic64_inc_return_relaxed) __atomic_release_fence(); return arch_atomic64_inc_return_relaxed(v); #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_release(1, v); #endif } /** * raw_atomic64_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_relaxed(atomic64_t *v) { #if defined(arch_atomic64_inc_return_relaxed) return arch_atomic64_inc_return_relaxed(v); #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_relaxed(1, v); #endif } /** * raw_atomic64_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #elif defined(arch_atomic64_fetch_inc_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_inc_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_add(1, v); #endif } /** * raw_atomic64_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_acquire(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_acquire) return arch_atomic64_fetch_inc_acquire(v); #elif defined(arch_atomic64_fetch_inc_relaxed) s64 ret = arch_atomic64_fetch_inc_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_acquire(1, v); #endif } /** * raw_atomic64_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_release(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_release) return arch_atomic64_fetch_inc_release(v); #elif defined(arch_atomic64_fetch_inc_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_inc_relaxed(v); #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_release(1, v); #endif } /** * raw_atomic64_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_relaxed(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_relaxed) return arch_atomic64_fetch_inc_relaxed(v); #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_relaxed(1, v); #endif } /** * raw_atomic64_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_dec(atomic64_t *v) { #if defined(arch_atomic64_dec) arch_atomic64_dec(v); #else raw_atomic64_sub(1, v); #endif } /** * raw_atomic64_dec_return() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return(atomic64_t *v) { #if defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #elif defined(arch_atomic64_dec_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_dec_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_sub_return(1, v); #endif } /** * raw_atomic64_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_acquire(atomic64_t *v) { #if defined(arch_atomic64_dec_return_acquire) return arch_atomic64_dec_return_acquire(v); #elif defined(arch_atomic64_dec_return_relaxed) s64 ret = arch_atomic64_dec_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_acquire(1, v); #endif } /** * raw_atomic64_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_release(atomic64_t *v) { #if defined(arch_atomic64_dec_return_release) return arch_atomic64_dec_return_release(v); #elif defined(arch_atomic64_dec_return_relaxed) __atomic_release_fence(); return arch_atomic64_dec_return_relaxed(v); #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_release(1, v); #endif } /** * raw_atomic64_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_relaxed(atomic64_t *v) { #if defined(arch_atomic64_dec_return_relaxed) return arch_atomic64_dec_return_relaxed(v); #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_relaxed(1, v); #endif } /** * raw_atomic64_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #elif defined(arch_atomic64_fetch_dec_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_dec_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_sub(1, v); #endif } /** * raw_atomic64_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_acquire(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_acquire) return arch_atomic64_fetch_dec_acquire(v); #elif defined(arch_atomic64_fetch_dec_relaxed) s64 ret = arch_atomic64_fetch_dec_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_acquire(1, v); #endif } /** * raw_atomic64_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_release(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_release) return arch_atomic64_fetch_dec_release(v); #elif defined(arch_atomic64_fetch_dec_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_dec_relaxed(v); #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_release(1, v); #endif } /** * raw_atomic64_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_relaxed(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_relaxed) return arch_atomic64_fetch_dec_relaxed(v); #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_relaxed(1, v); #endif } /** * raw_atomic64_and() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_and(s64 i, atomic64_t *v) { arch_atomic64_and(i, v); } /** * raw_atomic64_fetch_and() - atomic bitwise AND with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_and_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_and" #endif } /** * raw_atomic64_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_acquire) return arch_atomic64_fetch_and_acquire(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) s64 ret = arch_atomic64_fetch_and_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_acquire" #endif } /** * raw_atomic64_fetch_and_release() - atomic bitwise AND with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_release) return arch_atomic64_fetch_and_release(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_and_relaxed(i, v); #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_release" #endif } /** * raw_atomic64_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_relaxed) return arch_atomic64_fetch_and_relaxed(i, v); #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_relaxed" #endif } /** * raw_atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_andnot(s64 i, atomic64_t *v) { #if defined(arch_atomic64_andnot) arch_atomic64_andnot(i, v); #else raw_atomic64_and(~i, v); #endif } /** * raw_atomic64_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_andnot_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_and(~i, v); #endif } /** * raw_atomic64_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_acquire) return arch_atomic64_fetch_andnot_acquire(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) s64 ret = arch_atomic64_fetch_andnot_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_acquire(~i, v); #endif } /** * raw_atomic64_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_release) return arch_atomic64_fetch_andnot_release(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_release(~i, v); #endif } /** * raw_atomic64_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_relaxed) return arch_atomic64_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_relaxed(~i, v); #endif } /** * raw_atomic64_or() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_or(s64 i, atomic64_t *v) { arch_atomic64_or(i, v); } /** * raw_atomic64_fetch_or() - atomic bitwise OR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_or_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_or" #endif } /** * raw_atomic64_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_acquire) return arch_atomic64_fetch_or_acquire(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) s64 ret = arch_atomic64_fetch_or_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_acquire" #endif } /** * raw_atomic64_fetch_or_release() - atomic bitwise OR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_release) return arch_atomic64_fetch_or_release(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_or_relaxed(i, v); #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_release" #endif } /** * raw_atomic64_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_relaxed) return arch_atomic64_fetch_or_relaxed(i, v); #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_relaxed" #endif } /** * raw_atomic64_xor() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_xor(s64 i, atomic64_t *v) { arch_atomic64_xor(i, v); } /** * raw_atomic64_fetch_xor() - atomic bitwise XOR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_xor_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_xor" #endif } /** * raw_atomic64_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_acquire) return arch_atomic64_fetch_xor_acquire(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) s64 ret = arch_atomic64_fetch_xor_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_acquire" #endif } /** * raw_atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_release) return arch_atomic64_fetch_xor_release(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_xor_relaxed(i, v); #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_release" #endif } /** * raw_atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_relaxed) return arch_atomic64_fetch_xor_relaxed(i, v); #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_relaxed" #endif } /** * raw_atomic64_xchg() - atomic exchange with full ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic64_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #elif defined(arch_atomic64_xchg_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_xchg_relaxed(v, new); __atomic_post_full_fence(); return ret; #else return raw_xchg(&v->counter, new); #endif } /** * raw_atomic64_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_acquire(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_acquire) return arch_atomic64_xchg_acquire(v, new); #elif defined(arch_atomic64_xchg_relaxed) s64 ret = arch_atomic64_xchg_relaxed(v, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_acquire(&v->counter, new); #endif } /** * raw_atomic64_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_release(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_release) return arch_atomic64_xchg_release(v, new); #elif defined(arch_atomic64_xchg_relaxed) __atomic_release_fence(); return arch_atomic64_xchg_relaxed(v, new); #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_release(&v->counter, new); #endif } /** * raw_atomic64_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_relaxed(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_relaxed) return arch_atomic64_xchg_relaxed(v, new); #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_relaxed(&v->counter, new); #endif } /** * raw_atomic64_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else return raw_cmpxchg(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_acquire) return arch_atomic64_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) s64 ret = arch_atomic64_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_acquire(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_release) return arch_atomic64_cmpxchg_release(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic64_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_release(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_relaxed) return arch_atomic64_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_relaxed(&v->counter, old, new); #endif } /** * raw_atomic64_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic64_try_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else s64 r, o = *old; r = raw_atomic64_cmpxchg(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_acquire) return arch_atomic64_try_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) bool ret = arch_atomic64_try_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_acquire(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_release) return arch_atomic64_try_cmpxchg_release(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic64_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_release(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_relaxed) return arch_atomic64_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_relaxed(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_sub_and_test() - atomic subtract and test if zero with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_sub_and_test(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_and_test) return arch_atomic64_sub_and_test(i, v); #else return raw_atomic64_sub_return(i, v) == 0; #endif } /** * raw_atomic64_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_dec_and_test(atomic64_t *v) { #if defined(arch_atomic64_dec_and_test) return arch_atomic64_dec_and_test(v); #else return raw_atomic64_dec_return(v) == 0; #endif } /** * raw_atomic64_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_inc_and_test(atomic64_t *v) { #if defined(arch_atomic64_inc_and_test) return arch_atomic64_inc_and_test(v); #else return raw_atomic64_inc_return(v) == 0; #endif } /** * raw_atomic64_add_negative() - atomic add and test if negative with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #elif defined(arch_atomic64_add_negative_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic64_add_negative_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_add_return(i, v) < 0; #endif } /** * raw_atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_acquire) return arch_atomic64_add_negative_acquire(i, v); #elif defined(arch_atomic64_add_negative_relaxed) bool ret = arch_atomic64_add_negative_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_acquire(i, v) < 0; #endif } /** * raw_atomic64_add_negative_release() - atomic add and test if negative with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_release) return arch_atomic64_add_negative_release(i, v); #elif defined(arch_atomic64_add_negative_relaxed) __atomic_release_fence(); return arch_atomic64_add_negative_relaxed(i, v); #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_release(i, v) < 0; #endif } /** * raw_atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_relaxed) return arch_atomic64_add_negative_relaxed(i, v); #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_relaxed(i, v) < 0; #endif } /** * raw_atomic64_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { #if defined(arch_atomic64_fetch_add_unless) return arch_atomic64_fetch_add_unless(v, a, u); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c == u)) break; } while (!raw_atomic64_try_cmpxchg(v, &c, c + a)); return c; #endif } /** * raw_atomic64_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { #if defined(arch_atomic64_add_unless) return arch_atomic64_add_unless(v, a, u); #else return raw_atomic64_fetch_add_unless(v, a, u) != u; #endif } /** * raw_atomic64_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic64_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_inc_not_zero(atomic64_t *v) { #if defined(arch_atomic64_inc_not_zero) return arch_atomic64_inc_not_zero(v); #else return raw_atomic64_add_unless(v, 1, 0); #endif } /** * raw_atomic64_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic64_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_inc_unless_negative(atomic64_t *v) { #if defined(arch_atomic64_inc_unless_negative) return arch_atomic64_inc_unless_negative(v); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c < 0)) return false; } while (!raw_atomic64_try_cmpxchg(v, &c, c + 1)); return true; #endif } /** * raw_atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic64_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_dec_unless_positive(atomic64_t *v) { #if defined(arch_atomic64_dec_unless_positive) return arch_atomic64_dec_unless_positive(v); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c > 0)) return false; } while (!raw_atomic64_try_cmpxchg(v, &c, c - 1)); return true; #endif } /** * raw_atomic64_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic64_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline s64 raw_atomic64_dec_if_positive(atomic64_t *v) { #if defined(arch_atomic64_dec_if_positive) return arch_atomic64_dec_if_positive(v); #else s64 dec, c = raw_atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; } while (!raw_atomic64_try_cmpxchg(v, &c, dec)); return dec; #endif } #endif /* _LINUX_ATOMIC_FALLBACK_H */ // b565db590afeeff0d7c9485ccbca5bb6e155749f |
| 139 610 211 211 210 210 224 223 601 598 41 42 584 585 583 244 245 177 179 179 226 536 513 514 513 285 285 285 275 275 94 94 97 97 60 60 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 | // SPDX-License-Identifier: GPL-2.0-only /* * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * * NUMA policy allows the user to give hints in which node(s) memory should * be allocated. * * Support six policies per VMA and per process: * * The VMA policy has priority over the process policy for a page fault. * * interleave Allocate memory interleaved over a set of nodes, * with normal fallback if it fails. * For VMA based allocations this interleaves based on the * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. * * weighted interleave * Allocate memory interleaved over a set of nodes based on * a set of weights (per-node), with normal fallback if it * fails. Otherwise operates the same as interleave. * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated * on node 0 for every 1 page allocated on node 1. * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node * to the last. It would be better if bind would truly restrict * the allocation to memory nodes instead * * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. * * preferred many Try a set of nodes first before normal fallback. This is * similar to preferred without the special case. * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. * * The process policy is applied for most non interrupt memory allocations * in that process' context. Interrupts ignore the policies and always * try to allocate on the local CPU. The VMA policy is only applied for memory * allocations for a VMA in the VM. * * Currently there are a few corner cases in swapping where the policy * is not applied, but the majority should be handled. When process policy * is used it is not remembered over swap outs/swap ins. * * Only the highest zone in the zone hierarchy gets policied. Allocations * requesting a lower zone just use default policy. This implies that * on systems with highmem kernel lowmem allocation don't get policied. * Same with GFP_DMA allocations. * * For shmem/tmpfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. */ /* Notebook: fix mmap readahead to honour policy and enable policy for any page cache object statistics for bigpages global policy for page cache? currently it uses process policy. Requires first item above. handle mremap for shared memory (currently ignored for the policy) grows down? make bind policy root only? It can trigger oom much faster and the kernel is not always grateful with that. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mempolicy.h> #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/nodemask.h> #include <linux/cpuset.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ptrace.h> #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <linux/uaccess.h> #include "internal.h" /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ enum zone_type policy_zone = 0; /* * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_LOCAL, }; static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /* * iw_table is the sysfs-set interleave weight table, a value of 0 denotes * system-default value should be used. A NULL iw_table also denotes that * system-default values should be used. Until the system-default table * is implemented, the system-default is always 1. * * iw_table is RCU protected */ static u8 __rcu *iw_table; static DEFINE_MUTEX(iw_table_lock); static u8 get_il_weight(int node) { u8 *table; u8 weight; rcu_read_lock(); table = rcu_dereference(iw_table); /* if no iw_table, use system default */ weight = table ? table[node] : 1; /* if value in iw_table is 0, use system default */ weight = weight ? weight : 1; rcu_read_unlock(); return weight; } /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search * @state: State to filter the search * * Lookup the closest node by distance if @nid is not in state. * * Return: this @node if it is in state, otherwise the closest node by distance */ int numa_nearest_node(int node, unsigned int state) { int min_dist = INT_MAX, dist, n, min_node; if (state >= NR_NODE_STATES) return -EINVAL; if (node == NUMA_NO_NODE || node_state(node, state)) return node; min_node = node; for_each_node_state(n, state) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; min_node = n; } } return min_node; } EXPORT_SYMBOL_GPL(numa_nearest_node); struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; int node; if (pol) return pol; node = numa_node_id(); if (node != NUMA_NO_NODE) { pol = &preferred_node_policy[node]; /* preferred_node_policy is not initialised early in boot */ if (pol->mode) return pol; } return &default_policy; } static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, const nodemask_t *rel) { nodemask_t tmp; nodes_fold(tmp, *orig, nodes_weight(*rel)); nodes_onto(*ret, tmp, *rel); } static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->nodes = *nodes; return 0; } static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; nodes_clear(pol->nodes); node_set(first_node(*nodes), pol->nodes); return 0; } /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes * parameter with respect to the policy mode and flags. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. */ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) { int ret; /* * Default (pol==NULL) resp. local memory policies are not a * subject of any remapping. They also do not need any special * constructor. */ if (!pol || pol->mode == MPOL_LOCAL) return 0; /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); else nodes_and(nsc->mask2, *nodes, nsc->mask1); if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; } /* * This function just creates a new policy, does some check and simple * initialization. You must invoke mpol_set_nodemask() to set nodes. */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; } VM_BUG_ON(!nodes); /* * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). * All other modes require a valid pointer to a non-empty nodemask. */ if (mode == MPOL_PREFERRED) { if (nodes_empty(*nodes)) { if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); mode = MPOL_LOCAL; } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; policy->home_node = NUMA_NO_NODE; return policy; } /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *pol) { if (!atomic_dec_and_test(&pol->refcnt)) return; kmem_cache_free(policy_cache, pol); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; if (pol->flags & MPOL_F_STATIC_NODES) nodes_and(tmp, pol->w.user_nodemask, *nodes); else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } if (nodes_empty(tmp)) tmp = *nodes; pol->nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { pol->w.cpuset_mems_allowed = *nodes; } /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * * Per-vma policies are protected by mmap_lock. Allocations using per-task * policies are protected by task->mems_allowed_seq to prevent a premature * OOM/allocation failure due to parallel nodemask modification. */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol || pol->mode == MPOL_LOCAL) return; if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; mpol_ops[pol->mode].rebind(pol, newmask); } /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. * * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); } /* * Rebind each vma in mm to new nodemask. * * Call holding a reference to mm. Takes mm->mmap_lock during call. */ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); for_each_vma(vmi, vma) { vma_start_write(vma); mpol_rebind_policy(vma->vm_policy, new); } mmap_write_unlock(mm); } static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_DEFAULT] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, [MPOL_PREFERRED_MANY] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, [MPOL_WEIGHTED_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, }; static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid); static bool strictly_unmovable(unsigned long flags) { /* * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO * if any misplaced page is found. */ return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == MPOL_MF_STRICT; } struct migration_mpol { /* for alloc_migration_target_by_mpol() */ struct mempolicy *pol; pgoff_t ilx; }; struct queue_pages { struct list_head *pagelist; unsigned long flags; nodemask_t *nmask; unsigned long start; unsigned long end; struct vm_area_struct *first; struct folio *large; /* note last large folio encountered */ long nr_failed; /* could not be isolated at this time */ }; /* * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) { struct folio *folio; struct queue_pages *qp = walk->private; if (unlikely(is_pmd_migration_entry(*pmd))) { qp->nr_failed++; return; } folio = pmd_folio(*pmd); if (is_huge_zero_folio(folio)) { walk->action = ACTION_CONTINUE; return; } if (!queue_folio_required(folio, qp)) return; if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma) || !migrate_folio_add(folio, qp->pagelist, qp->flags)) qp->nr_failed++; } /* * Scan through folios, checking if they satisfy the required conditions, * moving them from LRU to local pagelist for migration if they do (or not). * * queue_folios_pte_range() has two possible return values: * 0 - continue walking to scan for more, even if an existing folio on the * wrong node could not be isolated and queued for migration. * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, * and an existing folio was on a node that does not follow the policy. */ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; pte_t *pte, *mapped_pte; pte_t ptent; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { queue_folios_pmd(pmd, walk); spin_unlock(ptl); goto out; } mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (pte_none(ptent)) continue; if (!pte_present(ptent)) { if (is_migration_entry(pte_to_swp_entry(ptent))) qp->nr_failed++; continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. */ if (folio_test_reserved(folio)) continue; if (!queue_folio_required(folio, qp)) continue; if (folio_test_large(folio)) { /* * A large folio can only be isolated from LRU once, * but may be mapped by many PTEs (and Copy-On-Write may * intersperse PTEs of other, order 0, folios). This is * a common case, so don't mistake it for failure (but * there can be other cases of multi-mapped pages which * this quick check does not help to filter out - and a * search of the pagelist might grow to be prohibitive). * * migrate_pages(&pagelist) returns nr_failed folios, so * check "large" now so that queue_pages_range() returns * a comparable nr_failed folios. This does imply that * if folio could not be isolated for some racy reason * at its first PTE, later PTEs will not give it another * chance of isolation; but keeps the accounting simple. */ if (folio == qp->large) continue; qp->large = folio; } if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(vma) || !migrate_folio_add(folio, qp->pagelist, flags)) { qp->nr_failed++; if (strictly_unmovable(flags)) break; } } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); out: if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; return 0; } static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); entry = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(entry)) { if (unlikely(is_hugetlb_entry_migration(entry))) qp->nr_failed++; goto unlock; } folio = pfn_folio(pte_pfn(entry)); if (!queue_folio_required(folio, qp)) goto unlock; if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma)) { qp->nr_failed++; goto unlock; } /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * See folio_likely_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) if (!folio_isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; #endif return 0; } #ifdef CONFIG_NUMA_BALANCING /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these * faults, pages may be migrated for better NUMA placement. * * This is assuming that NUMA faults are handled using PROT_NONE. If * an architecture makes a different choice, it will need further * changes to the core. */ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); if (nr_updated > 0) { count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated); } tlb_finish_mmu(&tlb); return nr_updated; } #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; /* range check first */ VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); if (!qp->first) { qp->first = vma; if (!(flags & MPOL_MF_DISCONTIG_OK) && (qp->start < vma->vm_start)) /* hole at head side of range */ return -EFAULT; } next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; /* * Need check MPOL_MF_STRICT to return -EIO if possible * regardless of vma_migratable */ if (!vma_migratable(vma) && !(flags & MPOL_MF_STRICT)) return 1; /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. */ if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_WRLOCK, }; /* * Walk through page tables and collect pages to be migrated. * * If pages found in a given range are not on the required set of @nodes, * and migration is allowed, they are isolated and queued to @pagelist. * * queue_pages_range() may return: * 0 - all pages already on the right node, or successfully queued for moving * (or neither strict checking nor moving requested: only range checking). * >0 - this number of misplaced folios could not be queued for moving * (a hugetlbfs page or a transparent huge page being counted as 1). * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. */ static long queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, struct list_head *pagelist) { int err; struct queue_pages qp = { .pagelist = pagelist, .flags = flags, .nmask = nodes, .start = start, .end = end, .first = NULL, }; const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; err = walk_page_range(mm, start, end, ops, &qp); if (!qp.first) /* whole range in hole */ err = -EFAULT; return err ? : qp.nr_failed; } /* * Apply policy to a single VMA * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct mempolicy *old; struct mempolicy *new; vma_assert_write_locked(vma); new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); if (err) goto err_out; } old = vma->vm_policy; vma->vm_policy = new; /* protected by mmap_lock */ mpol_put(old); return 0; err_out: mpol_put(new); return err; } /* Split or merge the VMA (if required) and apply the new policy */ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct mempolicy *new_pol) { unsigned long vmstart, vmend; vmend = min(end, vma->vm_end); if (start > vma->vm_start) { *prev = vma; vmstart = start; } else { vmstart = vma->vm_start; } if (mpol_equal(vma->vm_policy, new_pol)) { *prev = vma; return 0; } vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; NODEMASK_SCRATCH(scratch); int ret; if (!scratch) return -ENOMEM; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) { ret = PTR_ERR(new); goto out; } task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); mpol_put(new); goto out; } old = current->mempolicy; current->mempolicy = new; if (new && (new->mode == MPOL_INTERLEAVE || new->mode == MPOL_WEIGHTED_INTERLEAVE)) { current->il_prev = MAX_NUMNODES-1; current->il_weight = 0; } task_unlock(current); mpol_put(old); ret = 0; out: NODEMASK_SCRATCH_FREE(scratch); return ret; } /* * Return nodemask for policy for get_mempolicy() query * * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); if (pol == &default_policy) return; switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_WEIGHTED_INTERLEAVE: *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ break; default: BUG(); } } static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; int ret; ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); if (ret > 0) { ret = page_to_nid(p); put_page(p); } return ret; } /* Retrieve NUMA policy */ static long do_get_mempolicy(int *policy, nodemask_t *nmask, unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; if (flags & MPOL_F_MEMS_ALLOWED) { if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ task_lock(current); *nmask = cpuset_current_mems_allowed; task_unlock(current); return 0; } if (flags & MPOL_F_ADDR) { pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. */ mmap_read_lock(mm); vma = vma_lookup(mm, addr); if (!vma) { mmap_read_unlock(mm); return -EFAULT; } pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; if (!pol) pol = &default_policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, because we are about to * drop the mmap_lock, after which only "pol" remains * valid, "vma" is stale. */ pol_refcount = pol; vma = NULL; mpol_get(pol); mmap_read_unlock(mm); err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); } else if (pol == current->mempolicy && pol->mode == MPOL_WEIGHTED_INTERLEAVE) { if (current->il_weight) *policy = current->il_prev; else *policy = next_node_in(current->il_prev, pol->nodes); } else { err = -EINVAL; goto out; } } else { *policy = pol == &default_policy ? MPOL_DEFAULT : pol->mode; /* * Internal mempolicy flags must be masked off before exposing * the policy to userspace. */ *policy |= (pol->flags & MPOL_MODE_FLAGS); } err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { *nmask = pol->w.user_nodemask; } else { task_lock(current); get_policy_nodemask(pol, nmask); task_unlock(current); } } out: mpol_cond_put(pol); if (vma) mmap_read_unlock(mm); if (pol_refcount) mpol_put(pol_refcount); return err; } #ifdef CONFIG_MIGRATION static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * See folio_likely_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } else { /* * Non-movable folio may reach here. And, there may be * temporary off LRU folios or non-LRU movable folios. * Treat them as unmovable folios since they can't be * isolated, so they can't be moved at the moment. */ return false; } } return true; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ static long migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); long nr_failed; long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, .reason = MR_SYSCALL, }; nodes_clear(nmask); node_set(source, nmask); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); mmap_read_lock(mm); vma = find_vma(mm, 0); if (unlikely(!vma)) { mmap_read_unlock(mm); return 0; } /* * This does not migrate the range, but isolates all pages that * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, * but passes back the count of pages which could not be isolated. */ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } if (err >= 0) err += nr_failed; return err; } /* * Move pages between the two nodesets so as to preserve the physical * layout as much as possible. * * Returns the number of page that could not be moved. */ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { long nr_failed = 0; long err = 0; nodemask_t tmp; lru_cache_disable(); /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' * bit in 'tmp', and return that <source, dest> pair for migration. * The pair of nodemasks 'to' and 'from' define the map. * * If no pair of bits is found that way, fallback to picking some * pair of 'source' and 'dest' bits that are not the same. If the * 'source' and 'dest' bits are the same, this represents a node * that will be migrating to itself, so no pages need move. * * If no bits are left in 'tmp', or if all remaining bits left * in 'tmp' correspond to the same bit in 'to', return false * (nothing left to migrate). * * This lets us pick a pair of nodes to migrate between, such that * if possible the dest node is not already occupied by some other * source node, minimizing the risk of overloading the memory on a * node that would happen if we migrated incoming memory to a node * before migrating outgoing memory source that same node. * * A single scan of tmp is sufficient. As we go, we remember the * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. * Otherwise when we finish scanning from_tmp, we at least have the * most recent <s, d> pair that moved. If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. */ tmp = *from; while (!nodes_empty(tmp)) { int s, d; int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { /* * do_migrate_pages() tries to maintain the relative * node relationship of the pages established between * threads and memory areas. * * However if the number of source nodes is not equal to * the number of destination nodes we can not preserve * this node relative relationship. In that case, skip * copying memory from a node that is in the destination * mask. * * Example: [2,3,4] -> [3,4,5] moves everything. * [0-7] - > [3,4,5] moves only 0,1,2,6,7. */ if ((nodes_weight(*from) != nodes_weight(*to)) && (node_isset(s, *to))) continue; d = node_remap(s, *from, *to); if (s == d) continue; source = s; /* Node moved. Memorize */ dest = d; /* dest not in remaining from nodes? */ if (!node_isset(dest, tmp)) break; } if (source == NUMA_NO_NODE) break; node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) nr_failed += err; if (err < 0) break; } lru_cache_enable(); if (err < 0) return err; return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* * Allocate a new folio for page migration, according to NUMA mempolicy. */ static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { struct migration_mpol *mmpol = (struct migration_mpol *)private; struct mempolicy *pol = mmpol->pol; pgoff_t ilx = mmpol->ilx; unsigned int order; int nid = numa_node_id(); gfp_t gfp; order = folio_order(src); ilx += src->index >> order; if (folio_test_hugetlb(src)) { nodemask_t *nodemask; struct hstate *h; h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; else gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; return folio_alloc_mpol(gfp, order, pol, ilx, nid); } #else static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return false; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return -ENOSYS; } static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { return NULL; } #endif static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; long err; long nr_failed; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) return -EINVAL; if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all */ if (!new) flags |= MPOL_MF_DISCONTIG_OK; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); { NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); err = mpol_set_nodemask(new, nmask, scratch); if (err) mmap_write_unlock(mm); } else err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); } if (err) goto mpol_out; /* * Lock the VMAs before scanning for pages to migrate, * to ensure we don't miss a concurrently inserted page. */ nr_failed = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); if (nr_failed < 0) { err = nr_failed; nr_failed = 0; } else { vma_iter_init(&vmi, mm, start); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { err = mbind_range(&vmi, vma, &prev, start, end, new); if (err) break; } } if (!err && !list_empty(&pagelist)) { /* Convert MPOL_DEFAULT's NULL to task or default policy */ if (!new) { new = get_task_policy(current); mpol_get(new); } mmpol.pol = new; mmpol.ilx = 0; /* * In the interleaved case, attempt to allocate on exactly the * targeted nodes, for the first VMA to be migrated; for later * VMAs, the nodes will still be interleaved from the targeted * nodemask, but one by one may be selected differently. */ if (new->mode == MPOL_INTERLEAVE || new->mode == MPOL_WEIGHTED_INTERLEAVE) { struct folio *folio; unsigned int order; unsigned long addr = -EFAULT; list_for_each_entry(folio, &pagelist, lru) { if (!folio_test_ksm(folio)) break; } if (!list_entry_is_head(folio, &pagelist, lru)) { vma_iter_init(&vmi, mm, start); for_each_vma_range(vmi, vma, end) { addr = page_address_in_vma(folio, folio_page(folio, 0), vma); if (addr != -EFAULT) break; } } if (addr != -EFAULT) { order = folio_order(folio); /* We already know the pol, but not the ilx */ mpol_cond_put(get_vma_policy(vma, addr, order, &mmpol.ilx)); /* Set base from which to increment by index */ mmpol.ilx -= folio->index >> order; } } } mmap_write_unlock(mm); if (!err && !list_empty(&pagelist)) { nr_failed |= migrate_pages(&pagelist, alloc_migration_target_by_mpol, NULL, (unsigned long)&mmpol, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_enable(); return err; } /* * User space interface with variable sized bitmaps for nodelists. */ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long nlongs = BITS_TO_LONGS(maxnode); int ret; if (in_compat_syscall()) ret = compat_get_bitmap(mask, (const compat_ulong_t __user *)nmask, maxnode); else ret = copy_from_user(mask, nmask, nlongs * sizeof(unsigned long)); if (ret) return -EFAULT; if (maxnode % BITS_PER_LONG) mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; return 0; } /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; /* * When the user specified more nodes than supported just check * if the non supported part is all zero, one word at a time, * starting at the end. */ while (maxnode > MAX_NUMNODES) { unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); unsigned long t; if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) return -EFAULT; if (maxnode - bits >= MAX_NUMNODES) { maxnode -= bits; } else { maxnode = MAX_NUMNODES; t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); } if (t) return -EINVAL; } return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); bool compat = in_compat_syscall(); if (compat) nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) return -EINVAL; if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; maxnode = nr_node_ids; } if (compat) return compat_put_bitmap((compat_ulong_t __user *)mask, nodes_addr(*nodes), maxnode); return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) *flags |= (MPOL_F_MOF | MPOL_F_MORON); else return -EINVAL; } return 0; } static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; start = untagged_addr(start); err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) return -EINVAL; /* * flags is used for future extension if any. */ if (flags != 0) return -EINVAL; /* * Check home_node is online to avoid accessing uninitialized * NODE_DATA. */ if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; mmap_write_lock(mm); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any vma in the range got policy other than MPOL_BIND * or MPOL_PREFERRED_MANY we return error. We don't reset * the home node for vmas we already updated before. */ old = vma_policy(vma); if (!old) { prev = vma; continue; } if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; } new = mpol_dup(old); if (IS_ERR(new)) { err = PTR_ERR(new); break; } vma_start_write(vma); new->home_node = home_node; err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; } mmap_write_unlock(mm); return err; } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) { return kernel_mbind(start, len, mode, nmask, maxnode, flags); } /* Set the process memory policy */ static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_set_mempolicy(lmode, mode_flags, &nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { return kernel_set_mempolicy(mode, nmask, maxnode); } static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; int err; nodemask_t *old; nodemask_t *new; NODEMASK_SCRATCH(scratch); if (!scratch) return -ENOMEM; old = &scratch->mask1; new = &scratch->mask2; err = get_nodes(old, old_nodes, maxnode); if (err) goto out; err = get_nodes(new, new_nodes, maxnode); if (err) goto out; /* Find the mm_struct */ rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { rcu_read_unlock(); err = -ESRCH; goto out; } get_task_struct(task); err = -EINVAL; /* * Check if this process has the right to modify the specified process. * Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; } rcu_read_unlock(); task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out_put; } task_nodes = cpuset_mems_allowed(current); nodes_and(*new, *new, task_nodes); if (nodes_empty(*new)) goto out_put; err = security_task_movememory(task); if (err) goto out_put; mm = get_task_mm(task); put_task_struct(task); if (!mm) { err = -EINVAL; goto out; } err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); mmput(mm); out: NODEMASK_SCRATCH_FREE(scratch); return err; out_put: put_task_struct(task); goto out; } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, unsigned long maxnode, unsigned long addr, unsigned long flags) { int err; int pval; nodemask_t nodes; if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; addr = untagged_addr(addr); err = do_get_mempolicy(&pval, &nodes, addr, flags); if (err) return err; if (policy && put_user(pval, policy)) return -EFAULT; if (nmask) err = copy_nodes_to_user(nmask, maxnode, &nodes); return err; } SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long __user *, nmask, unsigned long, maxnode, unsigned long, addr, unsigned long, flags) { return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; /* * DAX device mappings require predictable access latency, so avoid * incurring periodic faults. */ if (vma_is_dax(vma)) return false; if (is_vm_hugetlb_page(vma) && !hugepage_migration_supported(hstate_vma(vma))) return false; /* * Migration allocates pages in the highest zone. If we cannot * do so then migration (at least from node to node) is not * possible. */ if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) return false; return true; } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { *ilx = 0; return (vma->vm_ops && vma->vm_ops->get_policy) ? vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * @order: 0, or appropriate huge_page_order for interleaving * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or * MPOL_WEIGHTED_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { struct mempolicy *pol; pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE || pol->mode == MPOL_WEIGHTED_INTERLEAVE) { *ilx += vma->vm_pgoff >> order; *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); } return pol; } bool vma_policy_mof(struct vm_area_struct *vma) { struct mempolicy *pol; if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; pgoff_t ilx; /* ignored here */ pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); return ret; } pol = vma->vm_policy; if (!pol) pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); /* * if policy->nodes has movable memory only, * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * * policy->nodes is intersect with node_states[N_MEMORY]. * so if the following test fails, it implies * policy->nodes has movable memory only. */ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) dynamic_policy_zone = ZONE_MOVABLE; return zone >= dynamic_policy_zone; } static unsigned int weighted_interleave_nodes(struct mempolicy *policy) { unsigned int node; unsigned int cpuset_mems_cookie; retry: /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ cpuset_mems_cookie = read_mems_allowed_begin(); node = current->il_prev; if (!current->il_weight || !node_isset(node, policy->nodes)) { node = next_node_in(node, policy->nodes); if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry; if (node == MAX_NUMNODES) return node; current->il_prev = node; current->il_weight = get_il_weight(node); } current->il_weight--; return node; } /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { unsigned int nid; unsigned int cpuset_mems_cookie; /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ do { cpuset_mems_cookie = read_mems_allowed_begin(); nid = next_node_in(current->il_prev, policy->nodes); } while (read_mems_allowed_retry(cpuset_mems_cookie)); if (nid < MAX_NUMNODES) current->il_prev = nid; return nid; } /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. */ unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; int node = numa_mem_id(); if (!in_task()) return node; policy = current->mempolicy; if (!policy) return node; switch (policy->mode) { case MPOL_PREFERRED: return first_node(policy->nodes); case MPOL_INTERLEAVE: return interleave_nodes(policy); case MPOL_WEIGHTED_INTERLEAVE: return weighted_interleave_nodes(policy); case MPOL_BIND: case MPOL_PREFERRED_MANY: { struct zoneref *z; /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->nodes); return zonelist_zone(z) ? zonelist_node_idx(z) : node; } case MPOL_LOCAL: return node; default: BUG(); } } static unsigned int read_once_policy_nodemask(struct mempolicy *pol, nodemask_t *mask) { /* * barrier stabilizes the nodemask locally so that it can be iterated * over safely without concern for changes. Allocators validate node * selection does not violate mems_allowed, so this is safe. */ barrier(); memcpy(mask, &pol->nodes, sizeof(nodemask_t)); barrier(); return nodes_weight(*mask); } static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask; unsigned int target, nr_nodes; u8 *table; unsigned int weight_total = 0; u8 weight; int nid; nr_nodes = read_once_policy_nodemask(pol, &nodemask); if (!nr_nodes) return numa_node_id(); rcu_read_lock(); table = rcu_dereference(iw_table); /* calculate the total weight */ for_each_node_mask(nid, nodemask) { /* detect system default usage */ weight = table ? table[nid] : 1; weight = weight ? weight : 1; weight_total += weight; } /* Calculate the node offset based on totals */ target = ilx % weight_total; nid = first_node(nodemask); while (target) { /* detect system default usage */ weight = table ? table[nid] : 1; weight = weight ? weight : 1; if (target < weight) break; target -= weight; nid = next_node_in(nid, nodemask); } rcu_read_unlock(); return nid; } /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx * exceeds the number of present nodes. */ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; nnodes = read_once_policy_nodemask(pol, &nodemask); if (!nnodes) return numa_node_id(); target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } /* * Return a nodemask representing a mempolicy for filtering nodes for * page allocation, together with preferred node id (or the input node id). */ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid) { nodemask_t *nodemask = NULL; switch (pol->mode) { case MPOL_PREFERRED: /* Override input node id */ *nid = first_node(pol->nodes); break; case MPOL_PREFERRED_MANY: nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; break; case MPOL_BIND: /* Restrict to nodemask (but not on lower zones) */ if (apply_policy_zone(pol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&pol->nodes)) nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the * requested node and not break the policy. */ WARN_ON_ONCE(gfp & __GFP_THISNODE); break; case MPOL_INTERLEAVE: /* Override input node id */ *nid = (ilx == NO_INTERLEAVE_INDEX) ? interleave_nodes(pol) : interleave_nid(pol, ilx); break; case MPOL_WEIGHTED_INTERLEAVE: *nid = (ilx == NO_INTERLEAVE_INDEX) ? weighted_interleave_nodes(pol) : weighted_interleave_nid(pol, ilx); break; } return nodemask; } #ifdef CONFIG_HUGETLBFS /* * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { pgoff_t ilx; int nid; nid = numa_node_id(); *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } /* * init_nodemask_of_mempolicy * * If the current task's mempolicy is "default" [NULL], return 'false' * to indicate default policy. Otherwise, extract the policy nodemask * for 'bind' or 'interleave' policy into the argument nodemask, or * initialize the argument nodemask to contain the single node for * 'preferred' or 'local' policy and return 'true' to indicate presence * of non-default mempolicy. * * We don't bother with reference counting the mempolicy [mpol_get/put] * because the current task is examining it's own mempolicy and a task's * mempolicy is only ever changed by the task itself. * * N.B., it is the caller's responsibility to free a returned nodemask. */ bool init_nodemask_of_mempolicy(nodemask_t *mask) { struct mempolicy *mempolicy; if (!(mask && current->mempolicy)) return false; task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: *mask = mempolicy->nodes; break; case MPOL_LOCAL: init_nodemask_of_node(mask, numa_node_id()); break; default: BUG(); } task_unlock(current); return true; } #endif /* * mempolicy_in_oom_domain * * If tsk's mempolicy is "bind", check for intersection between mask and * the policy nodemask. Otherwise, return true for all other policies * including "interleave", as a tsk with "interleave" policy may have * memory allocated from all nodes in system. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. */ bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask) { struct mempolicy *mempolicy; bool ret = true; if (!mask) return ret; task_lock(tsk); mempolicy = tsk->mempolicy; if (mempolicy && mempolicy->mode == MPOL_BIND) ret = nodes_intersects(mempolicy->nodes, *mask); task_unlock(tsk); return ret; } static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; /* * This is a two pass approach. The first pass will only try the * preferred nodes but skip the direct reclaim and allow the * allocation to fail, while the second pass will try all the * nodes in system. */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); return page; } /** * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. * @order: Order of the page allocation. * @pol: Pointer to the NUMA mempolicy. * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * * Return: The page on success or NULL if allocation fails. */ static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; struct page *page; nodemask = policy_nodemask(gfp, pol, ilx, &nid); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_preferred_many(gfp, order, nid, nodemask); if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && /* filter "hugepage" allocation, unless from alloc_pages() */ order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred * node) we only try to allocate from the current/preferred * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode != MPOL_INTERLEAVE && pol->mode != MPOL_WEIGHTED_INTERLEAVE && (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ page = __alloc_frozen_pages_noprof( gfp | __GFP_THISNODE | __GFP_NORETRY, order, nid, NULL); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ } } page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE || pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ if (static_branch_likely(&vm_numa_stat_key) && page_to_nid(page) == nid) { preempt_disable(); __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); preempt_enable(); } } return page; } struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, nid); if (!page) return NULL; set_page_refcounted(page); return page_rmappable_folio(page); } /** * vma_alloc_folio - Allocate a folio for a VMA. * @gfp: GFP flags. * @order: Order of the folio. * @vma: Pointer to VMA. * @addr: Virtual address of the allocation. Must be inside @vma. * * Allocate a folio for a specific address in @vma, using the appropriate * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and * excepting where direct use of folio_alloc_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. */ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct folio *folio; if (vma->vm_flags & VM_DROPPABLE) gfp |= __GFP_NOWARN; pol = get_vma_policy(vma, addr, order, &ilx); folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); mpol_cond_put(pol); return folio; } EXPORT_SYMBOL(vma_alloc_folio_noprof); struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, numa_node_id()); } /** * alloc_pages - Allocate pages. * @gfp: GFP flags. * @order: Power of two of number of pages to allocate. * * Allocate 1 << @order contiguous pages. The physical address of the * first page is naturally aligned (eg an order-3 allocation will be aligned * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current * process is honoured when in process context. * * Context: Can be called from any context, providing the appropriate GFP * flags are used. * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { struct page *page = alloc_frozen_pages_noprof(gfp, order); if (page) set_page_refcounted(page); return page; } EXPORT_SYMBOL(alloc_pages_noprof); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); } EXPORT_SYMBOL(folio_alloc_noprof); static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { int nodes; unsigned long nr_pages_per_node; int delta; int i; unsigned long nr_allocated; unsigned long total_allocated = 0; nodes = nodes_weight(pol->nodes); nr_pages_per_node = nr_pages / nodes; delta = nr_pages - nodes * nr_pages_per_node; for (i = 0; i < nodes; i++) { if (delta) { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node + 1, page_array); delta--; } else { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node, page_array); } page_array += nr_allocated; total_allocated += nr_allocated; } return total_allocated; } static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { struct task_struct *me = current; unsigned int cpuset_mems_cookie; unsigned long total_allocated = 0; unsigned long nr_allocated = 0; unsigned long rounds; unsigned long node_pages, delta; u8 *table, *weights, weight; unsigned int weight_total = 0; unsigned long rem_pages = nr_pages; nodemask_t nodes; int nnodes, node; int resume_node = MAX_NUMNODES - 1; u8 resume_weight = 0; int prev_node; int i; if (!nr_pages) return 0; /* read the nodes onto the stack, retry if done during rebind */ do { cpuset_mems_cookie = read_mems_allowed_begin(); nnodes = read_once_policy_nodemask(pol, &nodes); } while (read_mems_allowed_retry(cpuset_mems_cookie)); /* if the nodemask has become invalid, we cannot do anything */ if (!nnodes) return 0; /* Continue allocating from most recent node and adjust the nr_pages */ node = me->il_prev; weight = me->il_weight; if (weight && node_isset(node, nodes)) { node_pages = min(rem_pages, weight); nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, page_array); page_array += nr_allocated; total_allocated += nr_allocated; /* if that's all the pages, no need to interleave */ if (rem_pages <= weight) { me->il_weight -= rem_pages; return total_allocated; } /* Otherwise we adjust remaining pages, continue from there */ rem_pages -= weight; } /* clear active weight in case of an allocation failure */ me->il_weight = 0; prev_node = node; /* create a local copy of node weights to operate on outside rcu */ weights = kzalloc(nr_node_ids, GFP_KERNEL); if (!weights) return total_allocated; rcu_read_lock(); table = rcu_dereference(iw_table); if (table) memcpy(weights, table, nr_node_ids); rcu_read_unlock(); /* calculate total, detect system default usage */ for_each_node_mask(node, nodes) { if (!weights[node]) weights[node] = 1; weight_total += weights[node]; } /* * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. * Track which node weighted interleave should resume from. * * if (rounds > 0) and (delta == 0), resume_node will always be * the node following prev_node and its weight. */ rounds = rem_pages / weight_total; delta = rem_pages % weight_total; resume_node = next_node_in(prev_node, nodes); resume_weight = weights[resume_node]; for (i = 0; i < nnodes; i++) { node = next_node_in(prev_node, nodes); weight = weights[node]; node_pages = weight * rounds; /* If a delta exists, add this node's portion of the delta */ if (delta > weight) { node_pages += weight; delta -= weight; } else if (delta) { /* when delta is depleted, resume from that node */ node_pages += delta; resume_node = node; resume_weight = weight - delta; delta = 0; } /* node_pages can be 0 if an allocation fails and rounds == 0 */ if (!node_pages) break; nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, page_array); page_array += nr_allocated; total_allocated += nr_allocated; if (total_allocated == nr_pages) break; prev_node = node; } me->il_prev = resume_node; me->il_weight = resume_weight; kfree(weights); return total_allocated; } static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { gfp_t preferred_gfp; unsigned long nr_allocated = 0; preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, nr_pages, page_array); if (nr_allocated < nr_pages) nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, nr_pages - nr_allocated, page_array + nr_allocated); return nr_allocated; } /* alloc pages bulk and mempolicy should be considered at the * same time in some situation such as vmalloc. * * It can accelerate memory allocation especially interleaving * allocate memory. */ unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; nodemask_t *nodemask; int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) return alloc_pages_bulk_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) return alloc_pages_bulk_weighted_interleave( gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); return alloc_pages_bulk_noprof(gfp, nid, nodemask, nr_pages, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); dst->vm_policy = pol; return 0; } /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). * * current's mempolicy may be rebinded by the other task(the task that changes * cpuset's mems), so we needn't do rebind work for current task. */ /* Slow path of a mempolicy duplicate */ struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* task's mempolicy is protected by alloc_lock */ if (old == current->mempolicy) { task_lock(current); *new = *old; task_unlock(current); } else *new = *old; if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; } /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) return false; if (a->mode != b->mode) return false; if (a->flags != b->flags) return false; if (a->home_node != b->home_node) return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_WEIGHTED_INTERLEAVE: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; default: BUG(); return false; } } /* * Shared memory backing store policy support. * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ /* * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ static struct sp_node *sp_lookup(struct shared_policy *sp, pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; while (n) { struct sp_node *p = rb_entry(n, struct sp_node, nd); if (start >= p->end) n = n->rb_right; else if (end <= p->start) n = n->rb_left; else break; } if (!n) return NULL; for (;;) { struct sp_node *w = NULL; struct rb_node *prev = rb_prev(n); if (!prev) break; w = rb_entry(prev, struct sp_node, nd); if (w->end <= start) break; n = prev; } return rb_entry(n, struct sp_node, nd); } /* * Insert a new shared policy into the list. Caller holds sp->lock for * writing. */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; struct rb_node *parent = NULL; struct sp_node *nd; while (*p) { parent = *p; nd = rb_entry(parent, struct sp_node, nd); if (new->start < nd->start) p = &(*p)->rb_left; else if (new->end > nd->end) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); } /* Find shared policy intersecting idx */ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; if (!sp->root.rb_node) return NULL; read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } read_unlock(&sp->lock); return pol; } static void sp_free(struct sp_node *n) { mpol_put(n->policy); kmem_cache_free(sn_cache, n); } /** * mpol_misplaced - check whether current folio node is valid in policy * * @folio: folio to be checked * @vmf: structure describing the fault * @addr: virtual address in @vma for shared policy lookup and interleave policy * * Lookup current policy node id for vma,addr and "compare to" folio's * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. * * Return: NUMA_NO_NODE if the page is in a node that is valid for this * policy, or a suitable node ID to allocate a replacement folio from. */ int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); struct vm_area_struct *vma = vmf->vma; int thiscpu = raw_smp_processor_id(); int thisnid = numa_node_id(); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; /* * Make sure ptl is held so that we don't preempt and we * have a stable smp processor id */ lockdep_assert_held(vmf->ptl); pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: polnid = interleave_nid(pol, ilx); break; case MPOL_WEIGHTED_INTERLEAVE: polnid = weighted_interleave_nid(pol, ilx); break; case MPOL_PREFERRED: if (node_isset(curnid, pol->nodes)) goto out; polnid = first_node(pol->nodes); break; case MPOL_LOCAL: polnid = numa_node_id(); break; case MPOL_BIND: case MPOL_PREFERRED_MANY: /* * Even though MPOL_PREFERRED_MANY can allocate pages outside * policy nodemask we don't allow numa migration to nodes * outside policy nodemask for now. This is done so that if we * want demotion to slow memory to happen, before allocating * from some DRAM node say 'x', we will end up using a * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario * we should not promote to node 'x' from slow memory node. */ if (pol->flags & MPOL_F_MORON) { /* * Optimize placement among multiple nodes * via NUMA balancing */ if (node_isset(thisnid, pol->nodes)) break; goto out; } /* * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. */ if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); polnid = zonelist_node_idx(z); break; default: BUG(); } /* Migrate the folio towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; if (!should_numa_migrate_memory(current, folio, curnid, thiscpu)) goto out; } if (curnid != polnid) ret = polnid; out: mpol_cond_put(pol); return ret; } /* * Drop the (possibly final) reference to task->mempolicy. It needs to be * dropped after task->mempolicy is set to NULL so that any allocation done as * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed * policy. */ void mpol_put_task_policy(struct task_struct *task) { struct mempolicy *pol; task_lock(task); pol = task->mempolicy; task->mempolicy = NULL; task_unlock(task); mpol_put(pol); } static void sp_delete(struct shared_policy *sp, struct sp_node *n) { rb_erase(&n->nd, &sp->root); sp_free(n); } static void sp_node_init(struct sp_node *node, unsigned long start, unsigned long end, struct mempolicy *pol) { node->start = start; node->end = end; node->policy = pol; } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { struct sp_node *n; struct mempolicy *newpol; n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; newpol = mpol_dup(pol); if (IS_ERR(newpol)) { kmem_cache_free(sn_cache, n); return NULL; } newpol->flags |= MPOL_F_SHARED; sp_node_init(n, start, end, newpol); return n; } /* Replace a policy range. */ static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; struct mempolicy *mpol_new = NULL; int ret = 0; restart: write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { struct rb_node *next = rb_next(&n->nd); if (n->start >= start) { if (n->end <= end) sp_delete(sp, n); else n->start = end; } else { /* Old policy spanning whole new range. */ if (n->end > end) { if (!n_new) goto alloc_new; *mpol_new = *n->policy; atomic_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); n_new = NULL; mpol_new = NULL; break; } else n->end = start; } if (!next) break; n = rb_entry(next, struct sp_node, nd); } if (new) sp_insert(sp, new); write_unlock(&sp->lock); ret = 0; err_out: if (mpol_new) mpol_put(mpol_new); if (n_new) kmem_cache_free(sn_cache, n_new); return ret; alloc_new: write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) goto err_out; mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!mpol_new) goto err_out; atomic_set(&mpol_new->refcnt, 1); goto restart; } /** * mpol_shared_policy_init - initialize shared policy for inode * @sp: pointer to inode shared policy * @mpol: struct mempolicy to install * * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ rwlock_init(&sp->lock); if (mpol) { struct sp_node *sn; struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; /* contextualize the tmpfs mount point mempolicy to this file */ npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) goto put_npol; /* alloc node covering entire file; adds ref to file's npol */ sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); if (sn) sp_insert(sp, sn); put_npol: mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); if (pol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. */ void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; if (!sp->root.rb_node) return; write_lock(&sp->lock); next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(sp, n); } write_unlock(&sp->lock); } #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { bool numabalancing_default = false; if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ if (numabalancing_override) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } static int __init setup_numabalancing(char *str) { int ret = 0; if (!str) goto out; if (!strcmp(str, "enable")) { numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { numabalancing_override = -1; ret = 1; } out: if (!ret) pr_warn("Unable to parse numa_balancing=\n"); return ret; } __setup("numa_balancing=", setup_numabalancing); #else static inline void __init check_numabalancing_enable(void) { } #endif /* CONFIG_NUMA_BALANCING */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; unsigned long largest = 0; int nid, prefer = 0; policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), 0, SLAB_PANIC, NULL); for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .nodes = nodemask_of_node(nid), }; } /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or * fall back to the largest node if they're all smaller. */ nodes_clear(interleave_nodes); for_each_node_state(nid, N_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ if (largest < total_pages) { largest = total_pages; prefer = nid; } /* Interleave this node? */ if ((total_pages << PAGE_SHIFT) >= (16 << 20)) node_set(nid, interleave_nodes); } /* All too small, use the largest */ if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } /* Reset policy of current process to default */ void numa_default_policy(void) { do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } /* * Parse and format mempolicy from/to strings */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. * * Format of input: * <mode>[=<flags>][:<nodelist>] * * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); int err = 1, mode; if (flags) *flags++ = '\0'; /* terminate mode string */ if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; if (nodelist_parse(nodelist, nodes)) goto out; if (!nodes_subset(nodes, node_states[N_MEMORY])) goto out; } else nodes_clear(nodes); mode = match_string(policy_modes, MPOL_MAX, str); if (mode < 0) goto out; switch (mode) { case MPOL_PREFERRED: /* * Insist on a nodelist of one node only, although later * we use first_node(nodes) to grab a single node, so here * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; while (isdigit(*rest)) rest++; if (*rest) goto out; if (nodes_empty(nodes)) goto out; } break; case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ if (!nodelist) nodes = node_states[N_MEMORY]; break; case MPOL_LOCAL: /* * Don't allow a nodelist; mpol_new() checks flags */ if (nodelist) goto out; break; case MPOL_DEFAULT: /* * Insist on a empty nodelist */ if (!nodelist) err = 0; goto out; case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist */ if (!nodelist) goto out; } mode_flags = 0; if (flags) { /* * Currently, we only support two mutually exclusive * mode flags. */ if (!strcmp(flags, "static")) mode_flags |= MPOL_F_STATIC_NODES; else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) goto out; /* * Save nodes for mpol_to_str() to show the tmpfs mount options * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. */ if (mode != MPOL_PREFERRED) { new->nodes = nodes; } else if (nodelist) { nodes_clear(new->nodes); node_set(first_node(nodes), new->nodes); } else { new->mode = MPOL_LOCAL; } /* * Save nodes for contextualization: this will be used to "clone" * the mempolicy in a specific context [cpuset] at a later time. */ new->w.user_nodemask = nodes; err = 0; out: /* Restore string for error message */ if (nodelist) *--nodelist = ':'; if (flags) *--flags = '='; if (!err) *mpol = new; return err; } #endif /* CONFIG_TMPFS */ /** * mpol_to_str - format a mempolicy structure for printing * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. * Recommend a @maxlen of at least 51 for the longest mode, "weighted * interleave", plus the longest flag flags, "relative|balancing", and to * display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; nodemask_t nodes = NODE_MASK_NONE; unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; if (pol && pol != &default_policy && !(pol >= &preferred_node_policy[0] && pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { mode = pol->mode; flags = pol->flags; } switch (mode) { case MPOL_DEFAULT: case MPOL_LOCAL: break; case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: nodes = pol->nodes; break; default: WARN_ON_ONCE(1); snprintf(p, maxlen, "unknown"); return; } p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { p += snprintf(p, buffer + maxlen - p, "="); /* * Static and relative are mutually exclusive. */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); if (flags & MPOL_F_NUMA_BALANCING) { if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) p += snprintf(p, buffer + maxlen - p, "|"); p += snprintf(p, buffer + maxlen - p, "balancing"); } } if (!nodes_empty(nodes)) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } #ifdef CONFIG_SYSFS struct iw_node_attr { struct kobj_attribute kobj_attr; int nid; }; static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct iw_node_attr *node_attr; u8 weight; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); weight = get_il_weight(node_attr->nid); return sysfs_emit(buf, "%d\n", weight); } static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct iw_node_attr *node_attr; u8 *new; u8 *old; u8 weight = 0; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); if (count == 0 || sysfs_streq(buf, "")) weight = 0; else if (kstrtou8(buf, 0, &weight)) return -EINVAL; new = kzalloc(nr_node_ids, GFP_KERNEL); if (!new) return -ENOMEM; mutex_lock(&iw_table_lock); old = rcu_dereference_protected(iw_table, lockdep_is_held(&iw_table_lock)); if (old) memcpy(new, old, nr_node_ids); new[node_attr->nid] = weight; rcu_assign_pointer(iw_table, new); mutex_unlock(&iw_table_lock); synchronize_rcu(); kfree(old); return count; } static struct iw_node_attr **node_attrs; static void sysfs_wi_node_release(struct iw_node_attr *node_attr, struct kobject *parent) { if (!node_attr) return; sysfs_remove_file(parent, &node_attr->kobj_attr.attr); kfree(node_attr->kobj_attr.attr.name); kfree(node_attr); } static void sysfs_wi_release(struct kobject *wi_kobj) { int i; for (i = 0; i < nr_node_ids; i++) sysfs_wi_node_release(node_attrs[i], wi_kobj); kobject_put(wi_kobj); } static const struct kobj_type wi_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = sysfs_wi_release, }; static int add_weight_node(int nid, struct kobject *wi_kobj) { struct iw_node_attr *node_attr; char *name; node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); if (!node_attr) return -ENOMEM; name = kasprintf(GFP_KERNEL, "node%d", nid); if (!name) { kfree(node_attr); return -ENOMEM; } sysfs_attr_init(&node_attr->kobj_attr.attr); node_attr->kobj_attr.attr.name = name; node_attr->kobj_attr.attr.mode = 0644; node_attr->kobj_attr.show = node_show; node_attr->kobj_attr.store = node_store; node_attr->nid = nid; if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { kfree(node_attr->kobj_attr.attr.name); kfree(node_attr); pr_err("failed to add attribute to weighted_interleave\n"); return -ENOMEM; } node_attrs[nid] = node_attr; return 0; } static int add_weighted_interleave_group(struct kobject *root_kobj) { struct kobject *wi_kobj; int nid, err; wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); if (!wi_kobj) return -ENOMEM; err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, "weighted_interleave"); if (err) { kfree(wi_kobj); return err; } for_each_node_state(nid, N_POSSIBLE) { err = add_weight_node(nid, wi_kobj); if (err) { pr_err("failed to add sysfs [node%d]\n", nid); break; } } if (err) kobject_put(wi_kobj); return 0; } static void mempolicy_kobj_release(struct kobject *kobj) { u8 *old; mutex_lock(&iw_table_lock); old = rcu_dereference_protected(iw_table, lockdep_is_held(&iw_table_lock)); rcu_assign_pointer(iw_table, NULL); mutex_unlock(&iw_table_lock); synchronize_rcu(); kfree(old); kfree(node_attrs); kfree(kobj); } static const struct kobj_type mempolicy_ktype = { .release = mempolicy_kobj_release }; static int __init mempolicy_sysfs_init(void) { int err; static struct kobject *mempolicy_kobj; mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); if (!mempolicy_kobj) { err = -ENOMEM; goto err_out; } node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *), GFP_KERNEL); if (!node_attrs) { err = -ENOMEM; goto mempol_out; } err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, "mempolicy"); if (err) goto node_out; err = add_weighted_interleave_group(mempolicy_kobj); if (err) { pr_err("mempolicy sysfs structure failed to initialize\n"); kobject_put(mempolicy_kobj); return err; } return err; node_out: kfree(node_attrs); mempol_out: kfree(mempolicy_kobj); err_out: pr_err("failed to add mempolicy kobject to the system\n"); return err; } late_initcall(mempolicy_sysfs_init); #endif /* CONFIG_SYSFS */ |
| 516 520 516 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Procedures for maintaining information about logical memory blocks. * * Peter Bergner, IBM Corp. June 2001. * Copyright (C) 2001 Peter Bergner. */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/bitops.h> #include <linux/poison.h> #include <linux/pfn.h> #include <linux/debugfs.h> #include <linux/kmemleak.h> #include <linux/seq_file.h> #include <linux/memblock.h> #include <asm/sections.h> #include <linux/io.h> #include "internal.h" #define INIT_MEMBLOCK_REGIONS 128 #define INIT_PHYSMEM_REGIONS 4 #ifndef INIT_MEMBLOCK_RESERVED_REGIONS # define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS #endif #ifndef INIT_MEMBLOCK_MEMORY_REGIONS #define INIT_MEMBLOCK_MEMORY_REGIONS INIT_MEMBLOCK_REGIONS #endif /** * DOC: memblock overview * * Memblock is a method of managing memory regions during the early * boot period when the usual kernel memory allocators are not up and * running. * * Memblock views the system memory as collections of contiguous * regions. There are several types of these collections: * * * ``memory`` - describes the physical memory available to the * kernel; this may differ from the actual physical memory installed * in the system, for instance when the memory is restricted with * ``mem=`` command line parameter * * ``reserved`` - describes the regions that were allocated * * ``physmem`` - describes the actual physical memory available during * boot regardless of the possible restrictions and memory hot(un)plug; * the ``physmem`` type is only available on some architectures. * * Each region is represented by struct memblock_region that * defines the region extents, its attributes and NUMA node id on NUMA * systems. Every memory type is described by the struct memblock_type * which contains an array of memory regions along with * the allocator metadata. The "memory" and "reserved" types are nicely * wrapped with struct memblock. This structure is statically * initialized at build time. The region arrays are initially sized to * %INIT_MEMBLOCK_MEMORY_REGIONS for "memory" and * %INIT_MEMBLOCK_RESERVED_REGIONS for "reserved". The region array * for "physmem" is initially sized to %INIT_PHYSMEM_REGIONS. * The memblock_allow_resize() enables automatic resizing of the region * arrays during addition of new regions. This feature should be used * with care so that memory allocated for the region array will not * overlap with areas that should be reserved, for example initrd. * * The early architecture setup should tell memblock what the physical * memory layout is by using memblock_add() or memblock_add_node() * functions. The first function does not assign the region to a NUMA * node and it is appropriate for UMA systems. Yet, it is possible to * use it on NUMA systems as well and assign the region to a NUMA node * later in the setup process using memblock_set_node(). The * memblock_add_node() performs such an assignment directly. * * Once memblock is setup the memory can be allocated using one of the * API variants: * * * memblock_phys_alloc*() - these functions return the **physical** * address of the allocated memory * * memblock_alloc*() - these functions return the **virtual** address * of the allocated memory. * * Note, that both API variants use implicit assumptions about allowed * memory ranges and the fallback methods. Consult the documentation * of memblock_alloc_internal() and memblock_alloc_range_nid() * functions for more elaborate description. * * As the system boot progresses, the architecture specific mem_init() * function frees all the memory to the buddy page allocator. * * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the * memblock data structures (except "physmem") will be discarded after the * system initialization completes. */ #ifndef CONFIG_NUMA struct pglist_data __refdata contig_page_data; EXPORT_SYMBOL(contig_page_data); #endif unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; unsigned long long max_possible_pfn; static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS]; #endif struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, .memory.max = INIT_MEMBLOCK_MEMORY_REGIONS, .memory.name = "memory", .reserved.regions = memblock_reserved_init_regions, .reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS, .reserved.name = "reserved", .bottom_up = false, .current_limit = MEMBLOCK_ALLOC_ANYWHERE, }; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP struct memblock_type physmem = { .regions = memblock_physmem_init_regions, .max = INIT_PHYSMEM_REGIONS, .name = "physmem", }; #endif /* * keep a pointer to &memblock.memory in the text section to use it in * __next_mem_range() and its helpers. * For architectures that do not keep memblock data after init, this * pointer will be reset to NULL at memblock_discard() */ static __refdata struct memblock_type *memblock_memory = &memblock.memory; #define for_each_memblock_type(i, memblock_type, rgn) \ for (i = 0, rgn = &memblock_type->regions[0]; \ i < memblock_type->cnt; \ i++, rgn = &memblock_type->regions[i]) #define memblock_dbg(fmt, ...) \ do { \ if (memblock_debug) \ pr_info(fmt, ##__VA_ARGS__); \ } while (0) static int memblock_debug __initdata_memblock; static bool system_has_some_mirror __initdata_memblock; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock; static int memblock_reserved_in_slab __initdata_memblock; bool __init_memblock memblock_has_mirror(void) { return system_has_some_mirror; } static enum memblock_flags __init_memblock choose_memblock_flags(void) { return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { return *size = min(*size, PHYS_ADDR_MAX - base); } /* * Address comparison utilities */ unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2, phys_addr_t size2) { return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); } bool __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { unsigned long i; memblock_cap_size(base, &size); for (i = 0; i < type->cnt; i++) if (memblock_addrs_overlap(base, size, type->regions[i].base, type->regions[i].size)) return true; return false; } /** * __memblock_find_range_bottom_up - find free area utility in bottom-up * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or * %MEMBLOCK_ALLOC_ACCESSIBLE * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @flags: pick from blocks based on memory attributes * * Utility called from memblock_find_in_range_node(), find free area bottom-up. * * Return: * Found address on success, 0 on failure. */ static phys_addr_t __init_memblock __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid, enum memblock_flags flags) { phys_addr_t this_start, this_end, cand; u64 i; for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); cand = round_up(this_start, align); if (cand < this_end && this_end - cand >= size) return cand; } return 0; } /** * __memblock_find_range_top_down - find free area utility, in top-down * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or * %MEMBLOCK_ALLOC_ACCESSIBLE * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @flags: pick from blocks based on memory attributes * * Utility called from memblock_find_in_range_node(), find free area top-down. * * Return: * Found address on success, 0 on failure. */ static phys_addr_t __init_memblock __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid, enum memblock_flags flags) { phys_addr_t this_start, this_end, cand; u64 i; for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end, NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); if (this_end < size) continue; cand = round_down(this_end - size, align); if (cand >= this_start) return cand; } return 0; } /** * memblock_find_in_range_node - find free area in given range and node * @size: size of free area to find * @align: alignment of free area to find * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or * %MEMBLOCK_ALLOC_ACCESSIBLE * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @flags: pick from blocks based on memory attributes * * Find @size free area aligned to @align in the specified range and node. * * Return: * Found address on success, 0 on failure. */ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, int nid, enum memblock_flags flags) { /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE || end == MEMBLOCK_ALLOC_NOLEAKTRACE) end = memblock.current_limit; /* avoid allocating the first page */ start = max_t(phys_addr_t, start, PAGE_SIZE); end = max(start, end); if (memblock_bottom_up()) return __memblock_find_range_bottom_up(start, end, size, align, nid, flags); else return __memblock_find_range_top_down(start, end, size, align, nid, flags); } /** * memblock_find_in_range - find free area in given range * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or * %MEMBLOCK_ALLOC_ACCESSIBLE * @size: size of free area to find * @align: alignment of free area to find * * Find @size free area aligned to @align in the specified range. * * Return: * Found address on success, 0 on failure. */ static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { phys_addr_t ret; enum memblock_flags flags = choose_memblock_flags(); again: ret = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); if (!ret && (flags & MEMBLOCK_MIRROR)) { pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n", &size); flags &= ~MEMBLOCK_MIRROR; goto again; } return ret; } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) { type->total_size -= type->regions[r].size; memmove(&type->regions[r], &type->regions[r + 1], (type->cnt - (r + 1)) * sizeof(type->regions[r])); type->cnt--; /* Special case for empty arrays */ if (type->cnt == 0) { WARN_ON(type->total_size != 0); type->regions[0].base = 0; type->regions[0].size = 0; type->regions[0].flags = 0; memblock_set_region_node(&type->regions[0], MAX_NUMNODES); } } #ifndef CONFIG_ARCH_KEEP_MEMBLOCK /** * memblock_discard - discard memory and reserved arrays if they were allocated */ void __init memblock_discard(void) { phys_addr_t addr, size; if (memblock.reserved.regions != memblock_reserved_init_regions) { addr = __pa(memblock.reserved.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.reserved.max); if (memblock_reserved_in_slab) kfree(memblock.reserved.regions); else memblock_free_late(addr, size); } if (memblock.memory.regions != memblock_memory_init_regions) { addr = __pa(memblock.memory.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.memory.max); if (memblock_memory_in_slab) kfree(memblock.memory.regions); else memblock_free_late(addr, size); } memblock_memory = NULL; } #endif /** * memblock_double_array - double the size of the memblock regions array * @type: memblock type of the regions array being doubled * @new_area_start: starting address of memory range to avoid overlap with * @new_area_size: size of memory range to avoid overlap with * * Double the size of the @type regions array. If memblock is being used to * allocate memory for a new reserved regions array and there is a previously * allocated memory range [@new_area_start, @new_area_start + @new_area_size] * waiting to be reserved, ensure the memory used by the new array does * not overlap. * * Return: * 0 on success, -1 on failure. */ static int __init_memblock memblock_double_array(struct memblock_type *type, phys_addr_t new_area_start, phys_addr_t new_area_size) { struct memblock_region *new_array, *old_array; phys_addr_t old_alloc_size, new_alloc_size; phys_addr_t old_size, new_size, addr, new_end; int use_slab = slab_is_available(); int *in_slab; /* We don't allow resizing until we know about the reserved regions * of memory that aren't suitable for allocation */ if (!memblock_can_resize) panic("memblock: cannot resize %s array\n", type->name); /* Calculate new doubled size */ old_size = type->max * sizeof(struct memblock_region); new_size = old_size << 1; /* * We need to allocated new one align to PAGE_SIZE, * so we can free them completely later. */ old_alloc_size = PAGE_ALIGN(old_size); new_alloc_size = PAGE_ALIGN(new_size); /* Retrieve the slab flag */ if (type == &memblock.memory) in_slab = &memblock_memory_in_slab; else in_slab = &memblock_reserved_in_slab; /* Try to find some space for it */ if (use_slab) { new_array = kmalloc(new_size, GFP_KERNEL); addr = new_array ? __pa(new_array) : 0; } else { /* only exclude range when trying to double reserved.regions */ if (type != &memblock.reserved) new_area_start = new_area_size = 0; addr = memblock_find_in_range(new_area_start + new_area_size, memblock.current_limit, new_alloc_size, PAGE_SIZE); if (!addr && new_area_size) addr = memblock_find_in_range(0, min(new_area_start, memblock.current_limit), new_alloc_size, PAGE_SIZE); new_array = addr ? __va(addr) : NULL; } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", type->name, type->max, type->max * 2); return -1; } new_end = addr + new_size - 1; memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]", type->name, type->max * 2, &addr, &new_end); /* * Found space, we now need to move the array over before we add the * reserved region since it may be our reserved array itself that is * full. */ memcpy(new_array, type->regions, old_size); memset(new_array + type->max, 0, old_size); old_array = type->regions; type->regions = new_array; type->max <<= 1; /* Free old array. We needn't free it if the array is the static one */ if (*in_slab) kfree(old_array); else if (old_array != memblock_memory_init_regions && old_array != memblock_reserved_init_regions) memblock_free(old_array, old_alloc_size); /* * Reserve the new array if that comes from the memblock. Otherwise, we * needn't do it */ if (!use_slab) BUG_ON(memblock_reserve(addr, new_alloc_size)); /* Update slab flag */ *in_slab = use_slab; return 0; } /** * memblock_merge_regions - merge neighboring compatible regions * @type: memblock type to scan * @start_rgn: start scanning from (@start_rgn - 1) * @end_rgn: end scanning at (@end_rgn - 1) * Scan @type and merge neighboring compatible regions in [@start_rgn - 1, @end_rgn) */ static void __init_memblock memblock_merge_regions(struct memblock_type *type, unsigned long start_rgn, unsigned long end_rgn) { int i = 0; if (start_rgn) i = start_rgn - 1; end_rgn = min(end_rgn, type->cnt - 1); while (i < end_rgn) { struct memblock_region *this = &type->regions[i]; struct memblock_region *next = &type->regions[i + 1]; if (this->base + this->size != next->base || memblock_get_region_node(this) != memblock_get_region_node(next) || this->flags != next->flags) { BUG_ON(this->base + this->size > next->base); i++; continue; } this->size += next->size; /* move forward from next + 1, index of which is i + 2 */ memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next)); type->cnt--; end_rgn--; } } /** * memblock_insert_region - insert new memblock region * @type: memblock type to insert into * @idx: index for the insertion point * @base: base address of the new region * @size: size of the new region * @nid: node id of the new region * @flags: flags of the new region * * Insert new memblock region [@base, @base + @size) into @type at @idx. * @type must already have extra room to accommodate the new region. */ static void __init_memblock memblock_insert_region(struct memblock_type *type, int idx, phys_addr_t base, phys_addr_t size, int nid, enum memblock_flags flags) { struct memblock_region *rgn = &type->regions[idx]; BUG_ON(type->cnt >= type->max); memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); rgn->base = base; rgn->size = size; rgn->flags = flags; memblock_set_region_node(rgn, nid); type->cnt++; type->total_size += size; } /** * memblock_add_range - add new memblock region * @type: memblock type to add new region into * @base: base address of the new region * @size: size of the new region * @nid: nid of the new region * @flags: flags of the new region * * Add new memblock region [@base, @base + @size) into @type. The new region * is allowed to overlap with existing ones - overlaps don't affect already * existing regions. @type is guaranteed to be minimal (all neighbouring * compatible regions are merged) after the addition. * * Return: * 0 on success, -errno on failure. */ static int __init_memblock memblock_add_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, int nid, enum memblock_flags flags) { bool insert = false; phys_addr_t obase = base; phys_addr_t end = base + memblock_cap_size(base, &size); int idx, nr_new, start_rgn = -1, end_rgn; struct memblock_region *rgn; if (!size) return 0; /* special case for empty array */ if (type->regions[0].size == 0) { WARN_ON(type->cnt != 0 || type->total_size); type->regions[0].base = base; type->regions[0].size = size; type->regions[0].flags = flags; memblock_set_region_node(&type->regions[0], nid); type->total_size = size; type->cnt = 1; return 0; } /* * The worst case is when new range overlaps all existing regions, * then we'll need type->cnt + 1 empty regions in @type. So if * type->cnt * 2 + 1 is less than or equal to type->max, we know * that there is enough empty regions in @type, and we can insert * regions directly. */ if (type->cnt * 2 + 1 <= type->max) insert = true; repeat: /* * The following is executed twice. Once with %false @insert and * then with %true. The first counts the number of regions needed * to accommodate the new area. The second actually inserts them. */ base = obase; nr_new = 0; for_each_memblock_type(idx, type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; if (rbase >= end) break; if (rend <= base) continue; /* * @rgn overlaps. If it separates the lower part of new * area, insert that portion. */ if (rbase > base) { #ifdef CONFIG_NUMA WARN_ON(nid != memblock_get_region_node(rgn)); #endif WARN_ON(flags != rgn->flags); nr_new++; if (insert) { if (start_rgn == -1) start_rgn = idx; end_rgn = idx + 1; memblock_insert_region(type, idx++, base, rbase - base, nid, flags); } } /* area below @rend is dealt with, forget about it */ base = min(rend, end); } /* insert the remaining portion */ if (base < end) { nr_new++; if (insert) { if (start_rgn == -1) start_rgn = idx; end_rgn = idx + 1; memblock_insert_region(type, idx, base, end - base, nid, flags); } } if (!nr_new) return 0; /* * If this was the first round, resize array and repeat for actual * insertions; otherwise, merge and return. */ if (!insert) { while (type->cnt + nr_new > type->max) if (memblock_double_array(type, obase, size) < 0) return -ENOMEM; insert = true; goto repeat; } else { memblock_merge_regions(type, start_rgn, end_rgn); return 0; } } /** * memblock_add_node - add new memblock region within a NUMA node * @base: base address of the new region * @size: size of the new region * @nid: nid of the new region * @flags: flags of the new region * * Add new memblock region [@base, @base + @size) to the "memory" * type. See memblock_add_range() description for mode details * * Return: * 0 on success, -errno on failure. */ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid, enum memblock_flags flags) { phys_addr_t end = base + size - 1; memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__, &base, &end, nid, flags, (void *)_RET_IP_); return memblock_add_range(&memblock.memory, base, size, nid, flags); } /** * memblock_add - add new memblock region * @base: base address of the new region * @size: size of the new region * * Add new memblock region [@base, @base + @size) to the "memory" * type. See memblock_add_range() description for mode details * * Return: * 0 on success, -errno on failure. */ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); } /** * memblock_validate_numa_coverage - check if amount of memory with * no node ID assigned is less than a threshold * @threshold_bytes: maximal memory size that can have unassigned node * ID (in bytes). * * A buggy firmware may report memory that does not belong to any node. * Check if amount of such memory is below @threshold_bytes. * * Return: true on success, false on failure. */ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_bytes) { unsigned long nr_pages = 0; unsigned long start_pfn, end_pfn, mem_size_mb; int nid, i; /* calculate lose page */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { if (!numa_valid_node(nid)) nr_pages += end_pfn - start_pfn; } if ((nr_pages << PAGE_SHIFT) > threshold_bytes) { mem_size_mb = memblock_phys_mem_size() >> 20; pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n", (nr_pages << PAGE_SHIFT) >> 20, mem_size_mb); return false; } return true; } /** * memblock_isolate_range - isolate given range into disjoint memblocks * @type: memblock type to isolate range for * @base: base of range to isolate * @size: size of range to isolate * @start_rgn: out parameter for the start of isolated region * @end_rgn: out parameter for the end of isolated region * * Walk @type and ensure that regions don't cross the boundaries defined by * [@base, @base + @size). Crossing regions are split at the boundaries, * which may create at most two more regions. The index of the first * region inside the range is returned in *@start_rgn and the index of the * first region after the range is returned in *@end_rgn. * * Return: * 0 on success, -errno on failure. */ static int __init_memblock memblock_isolate_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, int *start_rgn, int *end_rgn) { phys_addr_t end = base + memblock_cap_size(base, &size); int idx; struct memblock_region *rgn; *start_rgn = *end_rgn = 0; if (!size) return 0; /* we'll create at most two more regions */ while (type->cnt + 2 > type->max) if (memblock_double_array(type, base, size) < 0) return -ENOMEM; for_each_memblock_type(idx, type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; if (rbase >= end) break; if (rend <= base) continue; if (rbase < base) { /* * @rgn intersects from below. Split and continue * to process the next region - the new top half. */ rgn->base = base; rgn->size -= base - rbase; type->total_size -= base - rbase; memblock_insert_region(type, idx, rbase, base - rbase, memblock_get_region_node(rgn), rgn->flags); } else if (rend > end) { /* * @rgn intersects from above. Split and redo the * current region - the new bottom half. */ rgn->base = end; rgn->size -= end - rbase; type->total_size -= end - rbase; memblock_insert_region(type, idx--, rbase, end - rbase, memblock_get_region_node(rgn), rgn->flags); } else { /* @rgn is fully contained, record it */ if (!*end_rgn) *start_rgn = idx; *end_rgn = idx + 1; } } return 0; } static int __init_memblock memblock_remove_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { int start_rgn, end_rgn; int i, ret; ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); if (ret) return ret; for (i = end_rgn - 1; i >= start_rgn; i--) memblock_remove_region(type, i); return 0; } int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); return memblock_remove_range(&memblock.memory, base, size); } /** * memblock_free - free boot memory allocation * @ptr: starting address of the boot memory allocation * @size: size of the boot memory block in bytes * * Free boot memory block previously allocated by memblock_alloc_xx() API. * The freeing memory will not be released to the buddy allocator. */ void __init_memblock memblock_free(void *ptr, size_t size) { if (ptr) memblock_phys_free(__pa(ptr), size); } /** * memblock_phys_free - free boot memory block * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * * Free boot memory block previously allocated by memblock_phys_alloc_xx() API. * The freeing memory will not be released to the buddy allocator. */ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); return memblock_remove_range(&memblock.reserved, base, size); } int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); } #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); return memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0); } #endif /** * memblock_setclr_flag - set or clear flag for a memory region * @type: memblock type to set/clear flag for * @base: base address of the region * @size: size of the region * @set: set or clear the flag * @flag: the flag to update * * This function isolates region [@base, @base + @size), and sets/clears flag * * Return: 0 on success, -errno on failure. */ static int __init_memblock memblock_setclr_flag(struct memblock_type *type, phys_addr_t base, phys_addr_t size, int set, int flag) { int i, ret, start_rgn, end_rgn; ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); if (ret) return ret; for (i = start_rgn; i < end_rgn; i++) { struct memblock_region *r = &type->regions[i]; if (set) r->flags |= flag; else r->flags &= ~flag; } memblock_merge_regions(type, start_rgn, end_rgn); return 0; } /** * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. * @base: the base phys addr of the region * @size: the size of the region * * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) { return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_HOTPLUG); } /** * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. * @base: the base phys addr of the region * @size: the size of the region * * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) { return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_HOTPLUG); } /** * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR. * @base: the base phys addr of the region * @size: the size of the region * * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) { if (!mirrored_kernelcore) return 0; system_has_some_mirror = true; return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_MIRROR); } /** * memblock_mark_nomap - Mark a memory region with flag MEMBLOCK_NOMAP. * @base: the base phys addr of the region * @size: the size of the region * * The memory regions marked with %MEMBLOCK_NOMAP will not be added to the * direct mapping of the physical memory. These regions will still be * covered by the memory map. The struct page representing NOMAP memory * frames in the memory map will be PageReserved() * * Note: if the memory being marked %MEMBLOCK_NOMAP was allocated from * memblock, the caller must inform kmemleak to ignore that memory * * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) { return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_NOMAP); } /** * memblock_clear_nomap - Clear flag MEMBLOCK_NOMAP for a specified region. * @base: the base phys addr of the region * @size: the size of the region * * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) { return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_NOMAP); } /** * memblock_reserved_mark_noinit - Mark a reserved memory region with flag * MEMBLOCK_RSRV_NOINIT which results in the struct pages not being initialized * for this region. * @base: the base phys addr of the region * @size: the size of the region * * struct pages will not be initialized for reserved memory regions marked with * %MEMBLOCK_RSRV_NOINIT. * * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size) { return memblock_setclr_flag(&memblock.reserved, base, size, 1, MEMBLOCK_RSRV_NOINIT); } static bool should_skip_region(struct memblock_type *type, struct memblock_region *m, int nid, int flags) { int m_nid = memblock_get_region_node(m); /* we never skip regions when iterating memblock.reserved or physmem */ if (type != memblock_memory) return false; /* only memory regions are associated with nodes, check it */ if (numa_valid_node(nid) && nid != m_nid) return true; /* skip hotpluggable memory regions if needed */ if (movable_node_is_enabled() && memblock_is_hotpluggable(m) && !(flags & MEMBLOCK_HOTPLUG)) return true; /* if we want mirror memory skip non-mirror memory regions */ if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) return true; /* skip nomap memory unless we were asked for it explicitly */ if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) return true; /* skip driver-managed memory unless we were asked for it explicitly */ if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m)) return true; return false; } /** * __next_mem_range - next function for for_each_free_mem_range() etc. * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes * @flags: pick from blocks based on memory attributes * @type_a: pointer to memblock_type from where the range is taken * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL * * Find the first area from *@idx which matches @nid, fill the out * parameters, and update *@idx for the next iteration. The lower 32bit of * *@idx contains index into type_a and the upper 32bit indexes the * areas before each region in type_b. For example, if type_b regions * look like the following, * * 0:[0-16), 1:[32-48), 2:[128-130) * * The upper 32bit indexes the following regions. * * 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX) * * As both region arrays are sorted, the function advances the two indices * in lockstep and returns each intersection. */ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid) { int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; for (; idx_a < type_a->cnt; idx_a++) { struct memblock_region *m = &type_a->regions[idx_a]; phys_addr_t m_start = m->base; phys_addr_t m_end = m->base + m->size; int m_nid = memblock_get_region_node(m); if (should_skip_region(type_a, m, nid, flags)) continue; if (!type_b) { if (out_start) *out_start = m_start; if (out_end) *out_end = m_end; if (out_nid) *out_nid = m_nid; idx_a++; *idx = (u32)idx_a | (u64)idx_b << 32; return; } /* scan areas before each reservation */ for (; idx_b < type_b->cnt + 1; idx_b++) { struct memblock_region *r; phys_addr_t r_start; phys_addr_t r_end; r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, * break out to advance idx_a */ if (r_start >= m_end) break; /* if the two regions intersect, we're done */ if (m_start < r_end) { if (out_start) *out_start = max(m_start, r_start); if (out_end) *out_end = min(m_end, r_end); if (out_nid) *out_nid = m_nid; /* * The region which ends first is * advanced for the next iteration. */ if (m_end <= r_end) idx_a++; else idx_b++; *idx = (u32)idx_a | (u64)idx_b << 32; return; } } } /* signal end of iteration */ *idx = ULLONG_MAX; } /** * __next_mem_range_rev - generic next function for for_each_*_range_rev() * * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes * @flags: pick from blocks based on memory attributes * @type_a: pointer to memblock_type from where the range is taken * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL * * Finds the next range from type_a which is not marked as unsuitable * in type_b. * * Reverse of __next_mem_range(). */ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid) { int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; if (*idx == (u64)ULLONG_MAX) { idx_a = type_a->cnt - 1; if (type_b != NULL) idx_b = type_b->cnt; else idx_b = 0; } for (; idx_a >= 0; idx_a--) { struct memblock_region *m = &type_a->regions[idx_a]; phys_addr_t m_start = m->base; phys_addr_t m_end = m->base + m->size; int m_nid = memblock_get_region_node(m); if (should_skip_region(type_a, m, nid, flags)) continue; if (!type_b) { if (out_start) *out_start = m_start; if (out_end) *out_end = m_end; if (out_nid) *out_nid = m_nid; idx_a--; *idx = (u32)idx_a | (u64)idx_b << 32; return; } /* scan areas before each reservation */ for (; idx_b >= 0; idx_b--) { struct memblock_region *r; phys_addr_t r_start; phys_addr_t r_end; r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, * break out to advance idx_a */ if (r_end <= m_start) break; /* if the two regions intersect, we're done */ if (m_end > r_start) { if (out_start) *out_start = max(m_start, r_start); if (out_end) *out_end = min(m_end, r_end); if (out_nid) *out_nid = m_nid; if (m_start >= r_start) idx_a--; else idx_b--; *idx = (u32)idx_a | (u64)idx_b << 32; return; } } } /* signal end of iteration */ *idx = ULLONG_MAX; } /* * Common iterator interface used to define for_each_mem_pfn_range(). */ void __init_memblock __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid) { struct memblock_type *type = &memblock.memory; struct memblock_region *r; int r_nid; while (++*idx < type->cnt) { r = &type->regions[*idx]; r_nid = memblock_get_region_node(r); if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; if (!numa_valid_node(nid) || nid == r_nid) break; } if (*idx >= type->cnt) { *idx = -1; return; } if (out_start_pfn) *out_start_pfn = PFN_UP(r->base); if (out_end_pfn) *out_end_pfn = PFN_DOWN(r->base + r->size); if (out_nid) *out_nid = r_nid; } /** * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for * @size: size of area to set node ID for * @type: memblock type to set node ID for * @nid: node ID to set * * Set the nid of memblock @type regions in [@base, @base + @size) to @nid. * Regions which cross the area boundaries are split as necessary. * * Return: * 0 on success, -errno on failure. */ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid) { #ifdef CONFIG_NUMA int start_rgn, end_rgn; int i, ret; ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); if (ret) return ret; for (i = start_rgn; i < end_rgn; i++) memblock_set_region_node(&type->regions[i], nid); memblock_merge_regions(type, start_rgn, end_rgn); #endif return 0; } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /** * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() * * @idx: pointer to u64 loop variable * @zone: zone in which all of the memory blocks reside * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL * * This function is meant to be a zone/pfn specific wrapper for the * for_each_mem_range type iterators. Specifically they are used in the * deferred memory init routines and as such we were duplicating much of * this logic throughout the code. So instead of having it in multiple * locations it seemed like it would make more sense to centralize this to * one new iterator that does everything they need. */ void __init_memblock __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, unsigned long *out_spfn, unsigned long *out_epfn) { int zone_nid = zone_to_nid(zone); phys_addr_t spa, epa; __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, &memblock.memory, &memblock.reserved, &spa, &epa, NULL); while (*idx != U64_MAX) { unsigned long epfn = PFN_DOWN(epa); unsigned long spfn = PFN_UP(spa); /* * Verify the end is at least past the start of the zone and * that we have at least one PFN to initialize. */ if (zone->zone_start_pfn < epfn && spfn < epfn) { /* if we went too far just stop searching */ if (zone_end_pfn(zone) <= spfn) { *idx = U64_MAX; break; } if (out_spfn) *out_spfn = max(zone->zone_start_pfn, spfn); if (out_epfn) *out_epfn = min(zone_end_pfn(zone), epfn); return; } __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, &memblock.memory, &memblock.reserved, &spa, &epa, NULL); } /* signal end of iteration */ if (out_spfn) *out_spfn = ULONG_MAX; if (out_epfn) *out_epfn = 0; } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** * memblock_alloc_range_nid - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @start: the lower bound of the memory region to allocate (phys address) * @end: the upper bound of the memory region to allocate (phys address) * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @exact_nid: control the allocation fall back to other nodes * * The allocation is performed from memory region limited by * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE. * * If the specified node can not hold the requested memory and @exact_nid * is false, the allocation falls back to any node in the system. * * For systems with memory mirroring, the allocation is attempted first * from the regions with mirroring enabled and then retried from any * memory region. * * In addition, function using kmemleak_alloc_phys for allocated boot * memory block, it is never reported as leaks. * * Return: * Physical address of allocated memory block on success, %0 on failure. */ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, int nid, bool exact_nid) { enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; /* * Detect any accidental use of these APIs after slab is ready, as at * this moment memblock may be deinitialized already and its * internal data may be destroyed (after execution of memblock_free_all) */ if (WARN_ON_ONCE(slab_is_available())) { void *vaddr = kzalloc_node(size, GFP_NOWAIT, nid); return vaddr ? virt_to_phys(vaddr) : 0; } if (!align) { /* Can't use WARNs this early in boot on powerpc */ dump_stack(); align = SMP_CACHE_BYTES; } again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); if (found && !memblock_reserve(found, size)) goto done; if (numa_valid_node(nid) && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); if (found && !memblock_reserve(found, size)) goto done; } if (flags & MEMBLOCK_MIRROR) { flags &= ~MEMBLOCK_MIRROR; pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n", &size); goto again; } return 0; done: /* * Skip kmemleak for those places like kasan_init() and * early_pgtable_alloc() due to high volume. */ if (end != MEMBLOCK_ALLOC_NOLEAKTRACE) /* * Memblock allocated blocks are never reported as * leaks. This is because many of these blocks are * only referred via the physical address which is * not looked up by kmemleak. */ kmemleak_alloc_phys(found, size, 0); /* * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP, * require memory to be accepted before it can be used by the * guest. * * Accept the memory of the allocated buffer. */ accept_memory(found, size); return found; } /** * memblock_phys_alloc_range - allocate a memory block inside specified range * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @start: the lower bound of the memory region to allocate (physical address) * @end: the upper bound of the memory region to allocate (physical address) * * Allocate @size bytes in the between @start and @end. * * Return: physical address of the allocated memory block on success, * %0 on failure. */ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end) { memblock_dbg("%s: %llu bytes align=0x%llx from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, &start, &end, (void *)_RET_IP_); return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, false); } /** * memblock_phys_alloc_try_nid - allocate a memory block from specified NUMA node * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Allocates memory block from the specified NUMA node. If the node * has no available memory, attempts to allocated from any node in the * system. * * Return: physical address of the allocated memory block on success, * %0 on failure. */ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) { return memblock_alloc_range_nid(size, align, 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid, false); } /** * memblock_alloc_internal - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region to allocate (phys address) * @max_addr: the upper bound of the memory region to allocate (phys address) * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @exact_nid: control the allocation fall back to other nodes * * Allocates memory block using memblock_alloc_range_nid() and * converts the returned physical address to virtual. * * The @min_addr limit is dropped if it can not be satisfied and the allocation * will fall back to memory below @min_addr. Other constraints, such * as node and mirrored memory will be handled again in * memblock_alloc_range_nid(). * * Return: * Virtual address of allocated memory block on success, NULL on failure. */ static void * __init memblock_alloc_internal( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid, bool exact_nid) { phys_addr_t alloc; if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid, exact_nid); /* retry allocation without lower limit */ if (!alloc && min_addr) alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid, exact_nid); if (!alloc) return NULL; return phys_to_virt(alloc); } /** * memblock_alloc_exact_nid_raw - allocate boot memory block on the exact node * without zeroing memory * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation * is preferred (phys address) * @max_addr: the upper bound of the memory region from where the allocation * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Public function, provides additional debug information (including caller * info), if enabled. Does not zero allocated memory. * * Return: * Virtual address of allocated memory block on success, NULL on failure. */ void * __init memblock_alloc_exact_nid_raw( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) { memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); return memblock_alloc_internal(size, align, min_addr, max_addr, nid, true); } /** * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing * memory and without panicking * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation * is preferred (phys address) * @max_addr: the upper bound of the memory region from where the allocation * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Public function, provides additional debug information (including caller * info), if enabled. Does not zero allocated memory, does not panic if request * cannot be satisfied. * * Return: * Virtual address of allocated memory block on success, NULL on failure. */ void * __init memblock_alloc_try_nid_raw( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) { memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); return memblock_alloc_internal(size, align, min_addr, max_addr, nid, false); } /** * memblock_alloc_try_nid - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation * is preferred (phys address) * @max_addr: the upper bound of the memory region from where the allocation * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Public function, provides additional debug information (including caller * info), if enabled. This function zeroes the allocated memory. * * Return: * Virtual address of allocated memory block on success, NULL on failure. */ void * __init memblock_alloc_try_nid( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) { void *ptr; memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid, false); if (ptr) memset(ptr, 0, size); return ptr; } /** * __memblock_alloc_or_panic - Try to allocate memory and panic on failure * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @func: caller func name * * This function attempts to allocate memory using memblock_alloc, * and in case of failure, it calls panic with the formatted message. * This function should not be used directly, please use the macro memblock_alloc_or_panic. */ void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, const char *func) { void *addr = memblock_alloc(size, align); if (unlikely(!addr)) panic("%s: Failed to allocate %pap bytes\n", func, &size); return addr; } /** * memblock_free_late - free pages directly to buddy allocator * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * * This is only useful when the memblock allocator has already been torn * down, but we are still initializing the system. Pages are released directly * to the buddy allocator. */ void __init memblock_free_late(phys_addr_t base, phys_addr_t size) { phys_addr_t cursor, end; end = base + size - 1; memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); cursor = PFN_UP(base); end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { memblock_free_pages(pfn_to_page(cursor), cursor, 0); totalram_pages_inc(); } } /* * Remaining API functions */ phys_addr_t __init_memblock memblock_phys_mem_size(void) { return memblock.memory.total_size; } phys_addr_t __init_memblock memblock_reserved_size(void) { return memblock.reserved.total_size; } /** * memblock_estimated_nr_free_pages - return estimated number of free pages * from memblock point of view * * During bootup, subsystems might need a rough estimate of the number of free * pages in the whole system, before precise numbers are available from the * buddy. Especially with CONFIG_DEFERRED_STRUCT_PAGE_INIT, the numbers * obtained from the buddy might be very imprecise during bootup. * * Return: * An estimated number of free pages from memblock point of view. */ unsigned long __init memblock_estimated_nr_free_pages(void) { return PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size()); } /* lowest address */ phys_addr_t __init_memblock memblock_start_of_DRAM(void) { return memblock.memory.regions[0].base; } phys_addr_t __init_memblock memblock_end_of_DRAM(void) { int idx = memblock.memory.cnt - 1; return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); } static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) { phys_addr_t max_addr = PHYS_ADDR_MAX; struct memblock_region *r; /* * translate the memory @limit size into the max address within one of * the memory memblock regions, if the @limit exceeds the total size * of those regions, max_addr will keep original value PHYS_ADDR_MAX */ for_each_mem_region(r) { if (limit <= r->size) { max_addr = r->base + limit; break; } limit -= r->size; } return max_addr; } void __init memblock_enforce_memory_limit(phys_addr_t limit) { phys_addr_t max_addr; if (!limit) return; max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ if (max_addr == PHYS_ADDR_MAX) return; /* truncate both memory and reserved regions */ memblock_remove_range(&memblock.memory, max_addr, PHYS_ADDR_MAX); memblock_remove_range(&memblock.reserved, max_addr, PHYS_ADDR_MAX); } void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) { int start_rgn, end_rgn; int i, ret; if (!size) return; if (!memblock_memory->total_size) { pr_warn("%s: No memory registered yet\n", __func__); return; } ret = memblock_isolate_range(&memblock.memory, base, size, &start_rgn, &end_rgn); if (ret) return; /* remove all the MAP regions */ for (i = memblock.memory.cnt - 1; i >= end_rgn; i--) if (!memblock_is_nomap(&memblock.memory.regions[i])) memblock_remove_region(&memblock.memory, i); for (i = start_rgn - 1; i >= 0; i--) if (!memblock_is_nomap(&memblock.memory.regions[i])) memblock_remove_region(&memblock.memory, i); /* truncate the reserved regions */ memblock_remove_range(&memblock.reserved, 0, base); memblock_remove_range(&memblock.reserved, base + size, PHYS_ADDR_MAX); } void __init memblock_mem_limit_remove_map(phys_addr_t limit) { phys_addr_t max_addr; if (!limit) return; max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ if (max_addr == PHYS_ADDR_MAX) return; memblock_cap_memory_range(0, max_addr); } static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) { unsigned int left = 0, right = type->cnt; do { unsigned int mid = (right + left) / 2; if (addr < type->regions[mid].base) right = mid; else if (addr >= (type->regions[mid].base + type->regions[mid].size)) left = mid + 1; else return mid; } while (left < right); return -1; } bool __init_memblock memblock_is_reserved(phys_addr_t addr) { return memblock_search(&memblock.reserved, addr) != -1; } bool __init_memblock memblock_is_memory(phys_addr_t addr) { return memblock_search(&memblock.memory, addr) != -1; } bool __init_memblock memblock_is_map_memory(phys_addr_t addr) { int i = memblock_search(&memblock.memory, addr); if (i == -1) return false; return !memblock_is_nomap(&memblock.memory.regions[i]); } int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) { struct memblock_type *type = &memblock.memory; int mid = memblock_search(type, PFN_PHYS(pfn)); if (mid == -1) return NUMA_NO_NODE; *start_pfn = PFN_DOWN(type->regions[mid].base); *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); return memblock_get_region_node(&type->regions[mid]); } /** * memblock_is_region_memory - check if a region is a subset of memory * @base: base of region to check * @size: size of region to check * * Check if the region [@base, @base + @size) is a subset of a memory block. * * Return: * 0 if false, non-zero if true */ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) { int idx = memblock_search(&memblock.memory, base); phys_addr_t end = base + memblock_cap_size(base, &size); if (idx == -1) return false; return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size) >= end; } /** * memblock_is_region_reserved - check if a region intersects reserved memory * @base: base of region to check * @size: size of region to check * * Check if the region [@base, @base + @size) intersects a reserved * memory block. * * Return: * True if they intersect, false if not. */ bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) { return memblock_overlaps_region(&memblock.reserved, base, size); } void __init_memblock memblock_trim_memory(phys_addr_t align) { phys_addr_t start, end, orig_start, orig_end; struct memblock_region *r; for_each_mem_region(r) { orig_start = r->base; orig_end = r->base + r->size; start = round_up(orig_start, align); end = round_down(orig_end, align); if (start == orig_start && end == orig_end) continue; if (start < end) { r->base = start; r->size = end - start; } else { memblock_remove_region(&memblock.memory, r - memblock.memory.regions); r--; } } } void __init_memblock memblock_set_current_limit(phys_addr_t limit) { memblock.current_limit = limit; } phys_addr_t __init_memblock memblock_get_current_limit(void) { return memblock.current_limit; } static void __init_memblock memblock_dump(struct memblock_type *type) { phys_addr_t base, end, size; enum memblock_flags flags; int idx; struct memblock_region *rgn; pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); for_each_memblock_type(idx, type, rgn) { char nid_buf[32] = ""; base = rgn->base; size = rgn->size; end = base + size - 1; flags = rgn->flags; #ifdef CONFIG_NUMA if (numa_valid_node(memblock_get_region_node(rgn))) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#x\n", type->name, idx, &base, &end, &size, nid_buf, flags); } } static void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); pr_info(" memory size = %pa reserved size = %pa\n", &memblock.memory.total_size, &memblock.reserved.total_size); memblock_dump(&memblock.memory); memblock_dump(&memblock.reserved); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP memblock_dump(&physmem); #endif } void __init_memblock memblock_dump_all(void) { if (memblock_debug) __memblock_dump_all(); } void __init memblock_allow_resize(void) { memblock_can_resize = 1; } static int __init early_memblock(char *p) { if (p && strstr(p, "debug")) memblock_debug = 1; return 0; } early_param("memblock", early_memblock); static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn) { struct page *start_pg, *end_pg; phys_addr_t pg, pgend; /* * Convert start_pfn/end_pfn to a struct page pointer. */ start_pg = pfn_to_page(start_pfn - 1) + 1; end_pg = pfn_to_page(end_pfn - 1) + 1; /* * Convert to physical addresses, and round start upwards and end * downwards. */ pg = PAGE_ALIGN(__pa(start_pg)); pgend = PAGE_ALIGN_DOWN(__pa(end_pg)); /* * If there are free pages between these, free the section of the * memmap array. */ if (pg < pgend) memblock_phys_free(pg, pgend - pg); } /* * The mem_map array can get very big. Free the unused area of the memory map. */ static void __init free_unused_memmap(void) { unsigned long start, end, prev_end = 0; int i; if (!IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) || IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) return; /* * This relies on each bank being in address order. * The banks are sorted previously in bootmem_init(). */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { #ifdef CONFIG_SPARSEMEM /* * Take care not to free memmap entries that don't exist * due to SPARSEMEM sections which aren't present. */ start = min(start, ALIGN(prev_end, PAGES_PER_SECTION)); #endif /* * Align down here since many operations in VM subsystem * presume that there are no holes in the memory map inside * a pageblock */ start = pageblock_start_pfn(start); /* * If we had a previous bank, and there is a space * between the current bank and the previous, free it. */ if (prev_end && prev_end < start) free_memmap(prev_end, start); /* * Align up here since many operations in VM subsystem * presume that there are no holes in the memory map inside * a pageblock */ prev_end = pageblock_align(end); } #ifdef CONFIG_SPARSEMEM if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) { prev_end = pageblock_align(end); free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION)); } #endif } static void __init __free_pages_memory(unsigned long start, unsigned long end) { int order; while (start < end) { /* * Free the pages in the largest chunks alignment allows. * * __ffs() behaviour is undefined for 0. start == 0 is * MAX_PAGE_ORDER-aligned, set order to MAX_PAGE_ORDER for * the case. */ if (start) order = min_t(int, MAX_PAGE_ORDER, __ffs(start)); else order = MAX_PAGE_ORDER; while (start + (1UL << order) > end) order--; memblock_free_pages(pfn_to_page(start), start, order); start += (1UL << order); } } static unsigned long __init __free_memory_core(phys_addr_t start, phys_addr_t end) { unsigned long start_pfn = PFN_UP(start); unsigned long end_pfn = min_t(unsigned long, PFN_DOWN(end), max_low_pfn); if (start_pfn >= end_pfn) return 0; __free_pages_memory(start_pfn, end_pfn); return end_pfn - start_pfn; } static void __init memmap_init_reserved_pages(void) { struct memblock_region *region; phys_addr_t start, end; int nid; /* * set nid on all reserved pages and also treat struct * pages for the NOMAP regions as PageReserved */ for_each_mem_region(region) { nid = memblock_get_region_node(region); start = region->base; end = start + region->size; if (memblock_is_nomap(region)) reserve_bootmem_region(start, end, nid); memblock_set_node(start, end, &memblock.reserved, nid); } /* * initialize struct pages for reserved regions that don't have * the MEMBLOCK_RSRV_NOINIT flag set */ for_each_reserved_mem_region(region) { if (!memblock_is_reserved_noinit(region)) { nid = memblock_get_region_node(region); start = region->base; end = start + region->size; if (!numa_valid_node(nid)) nid = early_pfn_to_nid(PFN_DOWN(start)); reserve_bootmem_region(start, end, nid); } } } static unsigned long __init free_low_memory_core_early(void) { unsigned long count = 0; phys_addr_t start, end; u64 i; memblock_clear_hotplug(0, -1); memmap_init_reserved_pages(); /* * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 */ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) count += __free_memory_core(start, end); return count; } static int reset_managed_pages_done __initdata; static void __init reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) atomic_long_set(&z->managed_pages, 0); } void __init reset_all_zones_managed_pages(void) { struct pglist_data *pgdat; if (reset_managed_pages_done) return; for_each_online_pgdat(pgdat) reset_node_managed_pages(pgdat); reset_managed_pages_done = 1; } /** * memblock_free_all - release free pages to the buddy allocator */ void __init memblock_free_all(void) { unsigned long pages; free_unused_memmap(); reset_all_zones_managed_pages(); pages = free_low_memory_core_early(); totalram_pages_add(pages); } /* Keep a table to reserve named memory */ #define RESERVE_MEM_MAX_ENTRIES 8 #define RESERVE_MEM_NAME_SIZE 16 struct reserve_mem_table { char name[RESERVE_MEM_NAME_SIZE]; phys_addr_t start; phys_addr_t size; }; static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES]; static int reserved_mem_count; /* Add wildcard region with a lookup name */ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, const char *name) { struct reserve_mem_table *map; map = &reserved_mem_table[reserved_mem_count++]; map->start = start; map->size = size; strscpy(map->name, name); } /** * reserve_mem_find_by_name - Find reserved memory region with a given name * @name: The name that is attached to a reserved memory region * @start: If found, holds the start address * @size: If found, holds the size of the address. * * @start and @size are only updated if @name is found. * * Returns: 1 if found or 0 if not found. */ int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size) { struct reserve_mem_table *map; int i; for (i = 0; i < reserved_mem_count; i++) { map = &reserved_mem_table[i]; if (!map->size) continue; if (strcmp(name, map->name) == 0) { *start = map->start; *size = map->size; return 1; } } return 0; } EXPORT_SYMBOL_GPL(reserve_mem_find_by_name); /* * Parse reserve_mem=nn:align:name */ static int __init reserve_mem(char *p) { phys_addr_t start, size, align, tmp; char *name; char *oldp; int len; if (!p) return -EINVAL; /* Check if there's room for more reserved memory */ if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES) return -EBUSY; oldp = p; size = memparse(p, &p); if (!size || p == oldp) return -EINVAL; if (*p != ':') return -EINVAL; align = memparse(p+1, &p); if (*p != ':') return -EINVAL; /* * memblock_phys_alloc() doesn't like a zero size align, * but it is OK for this command to have it. */ if (align < SMP_CACHE_BYTES) align = SMP_CACHE_BYTES; name = p + 1; len = strlen(name); /* name needs to have length but not too big */ if (!len || len >= RESERVE_MEM_NAME_SIZE) return -EINVAL; /* Make sure that name has text */ for (p = name; *p; p++) { if (!isspace(*p)) break; } if (!*p) return -EINVAL; /* Make sure the name is not already used */ if (reserve_mem_find_by_name(name, &start, &tmp)) return -EBUSY; start = memblock_phys_alloc(size, align); if (!start) return -ENOMEM; reserved_mem_add(start, size, name); return 1; } __setup("reserve_mem=", reserve_mem); #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) static const char * const flagname[] = { [ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG", [ilog2(MEMBLOCK_MIRROR)] = "MIRROR", [ilog2(MEMBLOCK_NOMAP)] = "NOMAP", [ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG", [ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT", }; static int memblock_debug_show(struct seq_file *m, void *private) { struct memblock_type *type = m->private; struct memblock_region *reg; int i, j, nid; unsigned int count = ARRAY_SIZE(flagname); phys_addr_t end; for (i = 0; i < type->cnt; i++) { reg = &type->regions[i]; end = reg->base + reg->size - 1; nid = memblock_get_region_node(reg); seq_printf(m, "%4d: ", i); seq_printf(m, "%pa..%pa ", ®->base, &end); if (numa_valid_node(nid)) seq_printf(m, "%4d ", nid); else seq_printf(m, "%4c ", 'x'); if (reg->flags) { for (j = 0; j < count; j++) { if (reg->flags & (1U << j)) { seq_printf(m, "%s\n", flagname[j]); break; } } if (j == count) seq_printf(m, "%s\n", "UNKNOWN"); } else { seq_printf(m, "%s\n", "NONE"); } } return 0; } DEFINE_SHOW_ATTRIBUTE(memblock_debug); static int __init memblock_init_debugfs(void) { struct dentry *root = debugfs_create_dir("memblock", NULL); debugfs_create_file("memory", 0444, root, &memblock.memory, &memblock_debug_fops); debugfs_create_file("reserved", 0444, root, &memblock.reserved, &memblock_debug_fops); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP debugfs_create_file("physmem", 0444, root, &physmem, &memblock_debug_fops); #endif return 0; } __initcall(memblock_init_debugfs); #endif /* CONFIG_DEBUG_FS */ |
| 521 521 244 245 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 | // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; static LIST_HEAD(kthreads_hotplug); static DEFINE_MUTEX(kthreads_hotplug_lock); struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; unsigned int node; int started; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; struct task_struct *task; struct list_head hotplug_node; struct cpumask *preferred_affinity; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } /* * Variant of to_kthread() that doesn't assume @p is a kthread. * * Per construction; when: * * (p->flags & PF_KTHREAD) && p->worker_private * * the task is both a kthread and struct kthread is persistent. However * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and * begin_new_exec()). */ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { strscpy(buf, tsk->comm, buf_size); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); INIT_LIST_HEAD(&kthread->hotplug_node); p->vfork_done = &kthread->exited; kthread->task = p; kthread->node = tsk_fork_get_node(current); p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. */ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changin from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; if (!list_empty(&kthread->hotplug_node)) { mutex_lock(&kthreads_hotplug_lock); list_del(&kthread->hotplug_node); mutex_unlock(&kthreads_hotplug_lock); if (kthread->preferred_affinity) { kfree(kthread->preferred_affinity); kthread->preferred_affinity = NULL; } } do_exit(0); } EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. */ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) { const struct cpumask *pref; if (kthread->preferred_affinity) { pref = kthread->preferred_affinity; } else { if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE)) return; pref = cpumask_of_node(kthread->node); } cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD)); if (cpumask_empty(cpumask)) cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); } static void kthread_affine_node(void) { struct kthread *kthread = to_kthread(current); cpumask_var_t affinity; WARN_ON_ONCE(kthread_is_per_cpu(current)); if (kthread->node == NUMA_NO_NODE) { housekeeping_affine(current, HK_TYPE_KTHREAD); } else { if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { WARN_ON_ONCE(1); return; } mutex_lock(&kthreads_hotplug_lock); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); /* * The node cpumask is racy when read from kthread() but: * - a racing CPU going down will either fail on the subsequent * call to set_cpus_allowed_ptr() or be migrated to housekeepers * afterwards by the scheduler. * - a racing CPU going up will be handled by kthreads_online_cpu() */ kthread_fetch_affinity(kthread, affinity); set_cpus_allowed_ptr(current, affinity); mutex_unlock(&kthreads_hotplug_lock); free_cpumask_var(affinity); } } static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); self->started = 1; if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) kthread_affine_node(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { unsigned long flags; if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { struct kthread *kthread = to_kthread(p); __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). * * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. */ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and put its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { static const char comm[TASK_COMM_LEN] = "kthreadd"; struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, comm); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; unsigned long flags; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { WARN_ON(1); return -EINVAL; } WARN_ON_ONCE(kthread->preferred_affinity); if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!kthread->preferred_affinity) { ret = -ENOMEM; goto out; } mutex_lock(&kthreads_hotplug_lock); cpumask_copy(kthread->preferred_affinity, mask); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, affinity); raw_spin_unlock_irqrestore(&p->pi_lock, flags); mutex_unlock(&kthreads_hotplug_lock); out: free_cpumask_var(affinity); return ret; } /* * Re-affine kthreads according to their preferences * and the newly online CPU. The CPU down part is handled * by select_fallback_rq() which default re-affines to * housekeepers from other nodes in case the preferred * affinity doesn't apply anymore. */ static int kthreads_online_cpu(unsigned int cpu) { cpumask_var_t affinity; struct kthread *k; int ret; guard(mutex)(&kthreads_hotplug_lock); if (list_empty(&kthreads_hotplug)) return 0; if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; ret = 0; list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || kthread_is_per_cpu(k->task))) { ret = -EINVAL; continue; } kthread_fetch_affinity(k, affinity); set_cpus_allowed_ptr(k->task, affinity); } free_cpumask_var(affinity); return ret; } static int kthreads_init(void) { return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", kthreads_online_cpu, NULL); } early_initcall(kthreads_init); void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. * * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) { schedule(); } else { /* * Handle the case where the current remains * TASK_INTERRUPTIBLE. try_to_freeze() expects * the current to be TASK_RUNNING. */ __set_current_state(TASK_RUNNING); } try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; worker->flags = flags; worker->task = task; return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_node); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { struct kthread_worker *worker; worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); if (!IS_ERR(worker)) kthread_bind(worker->task, cpu); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_create_worker(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * del_timer_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); del_timer_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. This function stores * original thread's cgroup info in current kthread context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif |
| 152 6 152 152 152 151 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/char_dev.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/init.h> #include <linux/fs.h> #include <linux/kdev_t.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/kobject.h> #include <linux/kobj_map.h> #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/tty.h> #include "internal.h" static struct kobj_map *cdev_map __ro_after_init; static DEFINE_MUTEX(chrdevs_lock); #define CHRDEV_MAJOR_HASH_SIZE 255 static struct char_device_struct { struct char_device_struct *next; unsigned int major; unsigned int baseminor; int minorct; char name[64]; struct cdev *cdev; /* will die */ } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ static inline int major_to_index(unsigned major) { return major % CHRDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; mutex_lock(&chrdevs_lock); for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); } mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ static int find_dynamic_major(void) { int i; struct char_device_struct *cd; for (i = ARRAY_SIZE(chrdevs)-1; i >= CHRDEV_MAJOR_DYN_END; i--) { if (chrdevs[i] == NULL) return i; } for (i = CHRDEV_MAJOR_DYN_EXT_START; i >= CHRDEV_MAJOR_DYN_EXT_END; i--) { for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) if (cd->major == i) break; if (cd == NULL) return i; } return -EBUSY; } /* * Register a single major with a specified minor range. * * If major == 0 this function will dynamically allocate an unused major. * If major > 0 this function will attempt to reserve the range of minors * with given major. * */ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { struct char_device_struct *cd, *curr, *prev = NULL; int ret; int i; if (major >= CHRDEV_MAJOR_MAX) { pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", name, major, CHRDEV_MAJOR_MAX-1); return ERR_PTR(-EINVAL); } if (minorct > MINORMASK + 1 - baseminor) { pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n", name, baseminor, baseminor + minorct - 1, 0, MINORMASK); return ERR_PTR(-EINVAL); } cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); mutex_lock(&chrdevs_lock); if (major == 0) { ret = find_dynamic_major(); if (ret < 0) { pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", name); goto out; } major = ret; } ret = -EBUSY; i = major_to_index(major); for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) { if (curr->major < major) continue; if (curr->major > major) break; if (curr->baseminor + curr->minorct <= baseminor) continue; if (curr->baseminor >= baseminor + minorct) break; goto out; } cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; strscpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; chrdevs[i] = cd; } else { cd->next = prev->next; prev->next = cd; } mutex_unlock(&chrdevs_lock); return cd; out: mutex_unlock(&chrdevs_lock); kfree(cd); return ERR_PTR(ret); } static struct char_device_struct * __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) { struct char_device_struct *cd = NULL, **cp; int i = major_to_index(major); mutex_lock(&chrdevs_lock); for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) if ((*cp)->major == major && (*cp)->baseminor == baseminor && (*cp)->minorct == minorct) break; if (*cp) { cd = *cp; *cp = cd->next; } mutex_unlock(&chrdevs_lock); return cd; } /** * register_chrdev_region() - register a range of device numbers * @from: the first in the desired range of device numbers; must include * the major number. * @count: the number of consecutive device numbers required * @name: the name of the device or driver. * * Return value is zero on success, a negative error code on failure. */ int register_chrdev_region(dev_t from, unsigned count, const char *name) { struct char_device_struct *cd; dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; cd = __register_chrdev_region(MAJOR(n), MINOR(n), next - n, name); if (IS_ERR(cd)) goto fail; } return 0; fail: to = n; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } return PTR_ERR(cd); } /** * alloc_chrdev_region() - register a range of char device numbers * @dev: output parameter for first assigned number * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: the name of the associated device or driver * * Allocates a range of char device numbers. The major number will be * chosen dynamically, and returned (along with the first minor number) * in @dev. Returns zero or a negative error code. */ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, const char *name) { struct char_device_struct *cd; cd = __register_chrdev_region(0, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); *dev = MKDEV(cd->major, cd->baseminor); return 0; } /** * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * * If @major == 0 this functions will dynamically allocate a major and return * its number. * * If @major > 0 this function will attempt to reserve a device with the given * major number and will return zero on success. * * Returns a -ve errno on failure. * * The name of this device has nothing to do with the name of the device in * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. */ int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; int err = -ENOMEM; cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); cdev = cdev_alloc(); if (!cdev) goto out2; cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; cd->cdev = cdev; return major ? 0 : cd->major; out: kobject_put(&cdev->kobj); out2: kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } /** * unregister_chrdev_region() - unregister a range of device numbers * @from: the first in the range of numbers to unregister * @count: the number of device numbers to unregister * * This function will unregister a range of @count device numbers, * starting with @from. The caller should normally be the one who * allocated those numbers in the first place... */ void unregister_chrdev_region(dev_t from, unsigned count) { dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } } /** * __unregister_chrdev - unregister and destroy a cdev * @major: major device number * @baseminor: first of the range of minor numbers * @count: the number of minor numbers this cdev is occupying * @name: name of this range of devices * * Unregister and destroy the cdev occupying the region described by * @major, @baseminor and @count. This function undoes what * __register_chrdev() did. */ void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct char_device_struct *cd; cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); } static DEFINE_SPINLOCK(cdev_lock); static struct kobject *cdev_get(struct cdev *p) { struct module *owner = p->owner; struct kobject *kobj; if (!try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) module_put(owner); return kobj; } void cdev_put(struct cdev *p) { if (p) { struct module *owner = p->owner; kobject_put(&p->kobj); module_put(owner); } } /* * Called every time a character special file is opened */ static int chrdev_open(struct inode *inode, struct file *filp) { const struct file_operations *fops; struct cdev *p; struct cdev *new = NULL; int ret = 0; spin_lock(&cdev_lock); p = inode->i_cdev; if (!p) { struct kobject *kobj; int idx; spin_unlock(&cdev_lock); kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); if (!kobj) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); /* Check i_cdev again in case somebody beat us to it while we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) ret = -ENXIO; } else if (!cdev_get(p)) ret = -ENXIO; spin_unlock(&cdev_lock); cdev_put(new); if (ret) return ret; ret = -ENXIO; fops = fops_get(p->ops); if (!fops) goto out_cdev_put; replace_fops(filp, fops); if (filp->f_op->open) { ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } return 0; out_cdev_put: cdev_put(p); return ret; } void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } static void cdev_purge(struct cdev *cdev) { spin_lock(&cdev_lock); while (!list_empty(&cdev->list)) { struct inode *inode; inode = container_of(cdev->list.next, struct inode, i_devices); list_del_init(&inode->i_devices); inode->i_cdev = NULL; } spin_unlock(&cdev_lock); } /* * Dummy default file-operations: the only thing this does * is contain the open that then fills in the correct operations * depending on the special file... */ const struct file_operations def_chr_fops = { .open = chrdev_open, .llseek = noop_llseek, }; static struct kobject *exact_match(dev_t dev, int *part, void *data) { struct cdev *p = data; return &p->kobj; } static int exact_lock(dev_t dev, void *data) { struct cdev *p = data; return cdev_get(p) ? 0 : -1; } /** * cdev_add() - add a char device to the system * @p: the cdev structure for the device * @dev: the first device number for which this device is responsible * @count: the number of consecutive minor numbers corresponding to this * device * * cdev_add() adds the device represented by @p to the system, making it * live immediately. A negative error code is returned on failure. */ int cdev_add(struct cdev *p, dev_t dev, unsigned count) { int error; p->dev = dev; p->count = count; if (WARN_ON(dev == WHITEOUT_DEV)) { error = -EBUSY; goto err; } error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) goto err; kobject_get(p->kobj.parent); return 0; err: kfree_const(p->kobj.name); p->kobj.name = NULL; return error; } /** * cdev_set_parent() - set the parent kobject for a char device * @p: the cdev structure * @kobj: the kobject to take a reference to * * cdev_set_parent() sets a parent kobject which will be referenced * appropriately so the parent is not freed before the cdev. This * should be called before cdev_add. */ void cdev_set_parent(struct cdev *p, struct kobject *kobj) { WARN_ON(!kobj->state_initialized); p->kobj.parent = kobj; } /** * cdev_device_add() - add a char device and it's corresponding * struct device, linkink * @dev: the device structure * @cdev: the cdev structure * * cdev_device_add() adds the char device represented by @cdev to the system, * just as cdev_add does. It then adds @dev to the system using device_add * The dev_t for the char device will be taken from the struct device which * needs to be initialized first. This helper function correctly takes a * reference to the parent device so the parent will not get released until * all references to the cdev are released. * * This helper uses dev->devt for the device number. If it is not set * it will not add the cdev and it will be equivalent to device_add. * * This function should be used whenever the struct cdev and the * struct device are members of the same structure whose lifetime is * managed by the struct device. * * NOTE: Callers must assume that userspace was able to open the cdev and * can call cdev fops callbacks at any time, even if this function fails. */ int cdev_device_add(struct cdev *cdev, struct device *dev) { int rc = 0; if (dev->devt) { cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) return rc; } rc = device_add(dev); if (rc && dev->devt) cdev_del(cdev); return rc; } /** * cdev_device_del() - inverse of cdev_device_add * @cdev: the cdev structure * @dev: the device structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. * * If dev->devt is not set it will not remove the cdev and will be equivalent * to device_del. * * NOTE: This guarantees that associated sysfs callbacks are not running * or runnable, however any cdevs already open will remain and their fops * will still be callable even after this function returns. */ void cdev_device_del(struct cdev *cdev, struct device *dev) { device_del(dev); if (dev->devt) cdev_del(cdev); } static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); } /** * cdev_del() - remove a cdev from the system * @p: the cdev structure to be removed * * cdev_del() removes @p from the system, possibly freeing the structure * itself. * * NOTE: This guarantees that cdev device will no longer be able to be * opened, however any cdevs already open will remain and their fops will * still be callable even after cdev_del returns. */ void cdev_del(struct cdev *p) { cdev_unmap(p->dev, p->count); kobject_put(&p->kobj); } static void cdev_default_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kobject_put(parent); } static void cdev_dynamic_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kfree(p); kobject_put(parent); } static struct kobj_type ktype_cdev_default = { .release = cdev_default_release, }; static struct kobj_type ktype_cdev_dynamic = { .release = cdev_dynamic_release, }; /** * cdev_alloc() - allocate a cdev structure * * Allocates and returns a cdev structure, or NULL on failure. */ struct cdev *cdev_alloc(void) { struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); if (p) { INIT_LIST_HEAD(&p->list); kobject_init(&p->kobj, &ktype_cdev_dynamic); } return p; } /** * cdev_init() - initialize a cdev structure * @cdev: the structure to initialize * @fops: the file_operations for this device * * Initializes @cdev, remembering @fops, making it ready to add to the * system with cdev_add(). */ void cdev_init(struct cdev *cdev, const struct file_operations *fops) { memset(cdev, 0, sizeof *cdev); INIT_LIST_HEAD(&cdev->list); kobject_init(&cdev->kobj, &ktype_cdev_default); cdev->ops = fops; } static struct kobject *base_probe(dev_t dev, int *part, void *data) { if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) /* Make old-style 2.4 aliases work */ request_module("char-major-%d", MAJOR(dev)); return NULL; } void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); } /* Let modules do char dev stuff */ EXPORT_SYMBOL(register_chrdev_region); EXPORT_SYMBOL(unregister_chrdev_region); EXPORT_SYMBOL(alloc_chrdev_region); EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_set_parent); EXPORT_SYMBOL(cdev_device_add); EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); |
| 27 27 27 61 22 57 61 45 19 23 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 | // SPDX-License-Identifier: GPL-2.0-only /* * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. * * 10Apr2002 Andrew Morton * Initial version */ #include <linux/kernel.h> #include <linux/math64.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/init.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/mpage.h> #include <linux/rmap.h> #include <linux/percpu.h> #include <linux/smp.h> #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/pagevec.h> #include <linux/timer.h> #include <linux/sched/rt.h> #include <linux/sched/signal.h> #include <linux/mm_inline.h> #include <trace/events/writeback.h> #include "internal.h" /* * Sleep at most 200ms at a time in balance_dirty_pages(). */ #define MAX_PAUSE max(HZ/5, 1) /* * Try to keep balance_dirty_pages() call intervals higher than this many pages * by raising pause time to max_pause when falls below it. */ #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) /* * Estimate write bandwidth or update dirty limit at 200ms intervals. */ #define BANDWIDTH_INTERVAL max(HZ/5, 1) #define RATELIMIT_CALC_SHIFT 10 /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. */ static long ratelimit_pages = 32; /* The following parameters are exported via /proc/sys/vm */ /* * Start background writeback (via writeback threads) at this percentage */ static int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of * dirty_background_ratio * the amount of dirtyable memory */ static unsigned long dirty_background_bytes; /* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ static int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ static int vm_dirty_ratio = 20; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of * vm_dirty_ratio * the amount of dirtyable memory */ static unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks */ unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ EXPORT_SYMBOL_GPL(dirty_writeback_interval); /* * The longest time for which data is allowed to remain dirty */ unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ /* * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: * a full sync is triggered after this time elapses without any disk activity. */ int laptop_mode; EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ struct wb_domain global_wb_domain; /* consolidated parameters for balance_dirty_pages() and its subroutines */ struct dirty_throttle_control { #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *dom; struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ #endif struct bdi_writeback *wb; struct fprop_local_percpu *wb_completions; unsigned long avail; /* dirtyable */ unsigned long dirty; /* file_dirty + write + nfs */ unsigned long thresh; /* dirty threshold */ unsigned long bg_thresh; /* dirty background threshold */ unsigned long wb_dirty; /* per-wb counterparts */ unsigned long wb_thresh; unsigned long wb_bg_thresh; unsigned long pos_ratio; bool freerun; bool dirty_exceeded; }; /* * Length of period for aging writeout fractions of bdis. This is an * arbitrarily chosen number. The longer the period, the slower fractions will * reflect changes in current writeout rate. */ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) #ifdef CONFIG_CGROUP_WRITEBACK #define GDTC_INIT(__wb) .wb = (__wb), \ .dom = &global_wb_domain, \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB .dom = &global_wb_domain #define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ .dom = mem_cgroup_wb_domain(__wb), \ .wb_completions = &(__wb)->memcg_completions, \ .gdtc = __gdtc static bool mdtc_valid(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return mdtc->gdtc; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return &wb->memcg_completions; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); unsigned long long min = wb->bdi->min_ratio; unsigned long long max = wb->bdi->max_ratio; /* * @wb may already be clean by the time control reaches here and * the total may not include its bw. */ if (this_bw < tot_bw) { if (min) { min *= this_bw; min = div64_ul(min, tot_bw); } if (max < 100 * BDI_RATIO_SCALE) { max *= this_bw; max = div64_ul(max, tot_bw); } } *minp = min; *maxp = max; } #else /* CONFIG_CGROUP_WRITEBACK */ #define GDTC_INIT(__wb) .wb = (__wb), \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB #define MDTC_INIT(__wb, __gdtc) static bool mdtc_valid(struct dirty_throttle_control *dtc) { return false; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return &global_wb_domain; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return NULL; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return NULL; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { *minp = wb->bdi->min_ratio; *maxp = wb->bdi->max_ratio; } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of * free and reclaimable pages, minus some zone reserves to protect * lowmem and the ability to uphold the zone's watermarks without * requiring writeback. * * This number of dirtyable pages is the base value of which the * user-configurable dirty ratio is the effective number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * * Because the user is allowed to specify the dirty limit globally as * absolute number of bytes, calculating the per-zone dirty limit can * require translating the configured limit into a percentage of * global dirtyable memory first. */ /** * node_dirtyable_memory - number of dirtyable pages in a node * @pgdat: the node * * Return: the node's number of pages potentially available for dirty * page cache. This is the base value for the per-node dirty limits. */ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) { unsigned long nr_pages = 0; int z; for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = pgdat->node_zones + z; if (!populated_zone(zone)) continue; nr_pages += zone_page_state(zone, NR_FREE_PAGES); } /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ nr_pages -= min(nr_pages, pgdat->totalreserve_pages); nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); return nr_pages; } static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM int node; unsigned long x = 0; int i; for_each_node_state(node, N_HIGH_MEMORY) { for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { struct zone *z; unsigned long nr_pages; if (!is_highmem_idx(i)) continue; z = &NODE_DATA(node)->node_zones[i]; if (!populated_zone(z)) continue; nr_pages = zone_page_state(z, NR_FREE_PAGES); /* watch for underflows */ nr_pages -= min(nr_pages, high_wmark_pages(z)); nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE); nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE); x += nr_pages; } } /* * Make sure that the number of highmem pages is never larger * than the number of the total dirtyable memory. This can only * occur in very strange VM situations but we want to make sure * that this does not occur. */ return min(x, total); #else return 0; #endif } /** * global_dirtyable_memory - number of globally dirtyable pages * * Return: the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. */ static unsigned long global_dirtyable_memory(void) { unsigned long x; x = global_zone_page_state(NR_FREE_PAGES); /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ x -= min(x, totalreserve_pages); x += global_node_page_state(NR_INACTIVE_FILE); x += global_node_page_state(NR_ACTIVE_FILE); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); return x + 1; /* Ensure that we never return 0 */ } /** * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain * @dtc: dirty_throttle_control of interest * * Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The * dirty limits will be lifted by 1/4 for real-time tasks. */ static void domain_dirty_limits(struct dirty_throttle_control *dtc) { const unsigned long available_memory = dtc->avail; struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); unsigned long bytes = vm_dirty_bytes; unsigned long bg_bytes = dirty_background_bytes; /* convert ratios to per-PAGE_SIZE for higher precision */ unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; unsigned long thresh; unsigned long bg_thresh; struct task_struct *tsk; /* gdtc is !NULL iff @dtc is for memcg domain */ if (gdtc) { unsigned long global_avail = gdtc->avail; /* * The byte settings can't be applied directly to memcg * domains. Convert them to ratios by scaling against * globally available memory. As the ratios are in * per-PAGE_SIZE, they can be obtained by dividing bytes by * number of pages. */ if (bytes) ratio = min(DIV_ROUND_UP(bytes, global_avail), PAGE_SIZE); if (bg_bytes) bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), PAGE_SIZE); bytes = bg_bytes = 0; } if (bytes) thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else thresh = (ratio * available_memory) / PAGE_SIZE; if (bg_bytes) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; tsk = current; if (rt_or_dl_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } /* * Dirty throttling logic assumes the limits in page units fit into * 32-bits. This gives 16TB dirty limits max which is hopefully enough. */ if (thresh > UINT_MAX) thresh = UINT_MAX; /* This makes sure bg_thresh is within 32-bits as well */ if (bg_thresh >= thresh) bg_thresh = thresh / 2; dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; /* we should eventually report the domain in the TP */ if (!gdtc) trace_global_dirty_state(bg_thresh, thresh); } /** * global_dirty_limits - background-writeback and dirty-throttling thresholds * @pbackground: out parameter for bg_thresh * @pdirty: out parameter for thresh * * Calculate bg_thresh and thresh for global_wb_domain. See * domain_dirty_limits() for details. */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; gdtc.avail = global_dirtyable_memory(); domain_dirty_limits(&gdtc); *pbackground = gdtc.bg_thresh; *pdirty = gdtc.thresh; } /** * node_dirty_limit - maximum number of dirty pages allowed in a node * @pgdat: the node * * Return: the maximum number of dirty pages allowed in a node, based * on the node's dirtyable memory. */ static unsigned long node_dirty_limit(struct pglist_data *pgdat) { unsigned long node_memory = node_dirtyable_memory(pgdat); struct task_struct *tsk = current; unsigned long dirty; if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * node_memory / global_dirtyable_memory(); else dirty = vm_dirty_ratio * node_memory / 100; if (rt_or_dl_task(tsk)) dirty += dirty / 4; /* * Dirty throttling logic assumes the limits in page units fit into * 32-bits. This gives 16TB dirty limits max which is hopefully enough. */ return min_t(unsigned long, dirty, UINT_MAX); } /** * node_dirty_ok - tells whether a node is within its dirty limits * @pgdat: the node to check * * Return: %true when the dirty pages in @pgdat are within the node's * dirty limit, %false if the limit is exceeded. */ bool node_dirty_ok(struct pglist_data *pgdat) { unsigned long limit = node_dirty_limit(pgdat); unsigned long nr_pages = 0; nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); nr_pages += node_page_state(pgdat, NR_WRITEBACK); return nr_pages <= limit; } #ifdef CONFIG_SYSCTL static int dirty_background_ratio_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } static int dirty_background_bytes_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; unsigned long old_bytes = dirty_background_bytes; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) { if (DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE) > UINT_MAX) { dirty_background_bytes = old_bytes; return -ERANGE; } dirty_background_ratio = 0; } return ret; } static int dirty_ratio_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { writeback_set_ratelimit(); vm_dirty_bytes = 0; } return ret; } static int dirty_bytes_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { if (DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) > UINT_MAX) { vm_dirty_bytes = old_bytes; return -ERANGE; } writeback_set_ratelimit(); vm_dirty_ratio = 0; } return ret; } #endif static unsigned long wp_next_time(unsigned long cur_time) { cur_time += VM_COMPLETIONS_PERIOD_LEN; /* 0 has a special meaning... */ if (!cur_time) return 1; return cur_time; } static void wb_domain_writeout_add(struct wb_domain *dom, struct fprop_local_percpu *completions, unsigned int max_prop_frac, long nr) { __fprop_add_percpu_max(&dom->completions, completions, max_prop_frac, nr); /* First event after period switching was turned off? */ if (unlikely(!dom->period_time)) { /* * We can race with other wb_domain_writeout_add calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. */ dom->period_time = wp_next_time(jiffies); mod_timer(&dom->period_timer, dom->period_time); } } /* * Increment @wb's writeout completion count and the global writeout * completion count. Called from __folio_end_writeback(). */ static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr) { struct wb_domain *cgdom; wb_stat_mod(wb, WB_WRITTEN, nr); wb_domain_writeout_add(&global_wb_domain, &wb->completions, wb->bdi->max_prop_frac, nr); cgdom = mem_cgroup_wb_domain(wb); if (cgdom) wb_domain_writeout_add(cgdom, wb_memcg_completions(wb), wb->bdi->max_prop_frac, nr); } void wb_writeout_inc(struct bdi_writeback *wb) { unsigned long flags; local_irq_save(flags); __wb_writeout_add(wb, 1); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(wb_writeout_inc); /* * On idle system, we can be called long after we scheduled because we use * deferred timers so count with missed periods. */ static void writeout_period(struct timer_list *t) { struct wb_domain *dom = from_timer(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; if (fprop_new_period(&dom->completions, miss_periods + 1)) { dom->period_time = wp_next_time(dom->period_time + miss_periods * VM_COMPLETIONS_PERIOD_LEN); mod_timer(&dom->period_timer, dom->period_time); } else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. */ dom->period_time = 0; } } int wb_domain_init(struct wb_domain *dom, gfp_t gfp) { memset(dom, 0, sizeof(*dom)); spin_lock_init(&dom->lock); timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); dom->dirty_limit_tstamp = jiffies; return fprop_global_init(&dom->completions, gfp); } #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom) { del_timer_sync(&dom->period_timer); fprop_global_destroy(&dom->completions); } #endif /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not * exceed 100%. */ static unsigned int bdi_min_ratio; static int bdi_check_pages_limit(unsigned long pages) { unsigned long max_dirty_pages = global_dirtyable_memory(); if (pages > max_dirty_pages) return -EINVAL; return 0; } static unsigned long bdi_ratio_from_pages(unsigned long pages) { unsigned long background_thresh; unsigned long dirty_thresh; unsigned long ratio; global_dirty_limits(&background_thresh, &dirty_thresh); if (!dirty_thresh) return -EINVAL; ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh); return ratio; } static u64 bdi_get_bytes(unsigned int ratio) { unsigned long background_thresh; unsigned long dirty_thresh; u64 bytes; global_dirty_limits(&background_thresh, &dirty_thresh); bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100; return bytes; } static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { unsigned int delta; int ret = 0; if (min_ratio > 100 * BDI_RATIO_SCALE) return -EINVAL; spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { if (min_ratio < bdi->min_ratio) { delta = bdi->min_ratio - min_ratio; bdi_min_ratio -= delta; bdi->min_ratio = min_ratio; } else { delta = min_ratio - bdi->min_ratio; if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) { bdi_min_ratio += delta; bdi->min_ratio = min_ratio; } else { ret = -EINVAL; } } } spin_unlock_bh(&bdi_lock); return ret; } static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { int ret = 0; if (max_ratio > 100 * BDI_RATIO_SCALE) return -EINVAL; spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / (100 * BDI_RATIO_SCALE); } spin_unlock_bh(&bdi_lock); return ret; } int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio) { return __bdi_set_min_ratio(bdi, min_ratio); } int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) { return __bdi_set_max_ratio(bdi, max_ratio); } int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE); } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE); } EXPORT_SYMBOL(bdi_set_max_ratio); u64 bdi_get_min_bytes(struct backing_dev_info *bdi) { return bdi_get_bytes(bdi->min_ratio); } int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) { int ret; unsigned long pages = min_bytes >> PAGE_SHIFT; long min_ratio; ret = bdi_check_pages_limit(pages); if (ret) return ret; min_ratio = bdi_ratio_from_pages(pages); if (min_ratio < 0) return min_ratio; return __bdi_set_min_ratio(bdi, min_ratio); } u64 bdi_get_max_bytes(struct backing_dev_info *bdi) { return bdi_get_bytes(bdi->max_ratio); } int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) { int ret; unsigned long pages = max_bytes >> PAGE_SHIFT; long max_ratio; ret = bdi_check_pages_limit(pages); if (ret) return ret; max_ratio = bdi_ratio_from_pages(pages); if (max_ratio < 0) return max_ratio; return __bdi_set_max_ratio(bdi, max_ratio); } int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) { if (strict_limit > 1) return -EINVAL; spin_lock_bh(&bdi_lock); if (strict_limit) bdi->capabilities |= BDI_CAP_STRICTLIMIT; else bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; spin_unlock_bh(&bdi_lock); return 0; } static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { return (thresh + bg_thresh) / 2; } static unsigned long hard_dirty_limit(struct wb_domain *dom, unsigned long thresh) { return max(thresh, dom->dirty_limit); } /* * Memory which can be further allocated to a memcg domain is capped by * system-wide clean memory excluding the amount being used in the domain. */ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, unsigned long filepages, unsigned long headroom) { struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); unsigned long clean = filepages - min(filepages, mdtc->dirty); unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); unsigned long other_clean = global_clean - min(global_clean, clean); mdtc->avail = filepages + min(headroom, other_clean); } static inline bool dtc_is_global(struct dirty_throttle_control *dtc) { return mdtc_gdtc(dtc) == NULL; } /* * Dirty background will ignore pages being written as we're trying to * decide whether to put more under writeback. */ static void domain_dirty_avail(struct dirty_throttle_control *dtc, bool include_writeback) { if (dtc_is_global(dtc)) { dtc->avail = global_dirtyable_memory(); dtc->dirty = global_node_page_state(NR_FILE_DIRTY); if (include_writeback) dtc->dirty += global_node_page_state(NR_WRITEBACK); } else { unsigned long filepages = 0, headroom = 0, writeback = 0; mem_cgroup_wb_stats(dtc->wb, &filepages, &headroom, &dtc->dirty, &writeback); if (include_writeback) dtc->dirty += writeback; mdtc_calc_avail(dtc, filepages, headroom); } } /** * __wb_calc_thresh - @wb's share of dirty threshold * @dtc: dirty_throttle_context of interest * @thresh: dirty throttling or dirty background threshold of wb_domain in @dtc * * Note that balance_dirty_pages() will only seriously take dirty throttling * threshold as a hard limit when sleeping max_pause per page is not enough * to keep the dirty pages under control. For example, when the device is * completely stalled due to some error conditions, or when there are 1000 * dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks * more (rather than completely block them) when the wb dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * * Return: @wb's dirty limit in pages. For dirty throttling limit, the term * "dirty" in the context of dirty balancing includes all PG_dirty and * PG_writeback pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, unsigned long thresh) { struct wb_domain *dom = dtc_dom(dtc); struct bdi_writeback *wb = dtc->wb; u64 wb_thresh; u64 wb_max_thresh; unsigned long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* * Calculate this wb's share of the thresh ratio. */ fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE); wb_thresh *= numerator; wb_thresh = div64_ul(wb_thresh, denominator); wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); /* * It's very possible that wb_thresh is close to 0 not because the * device is slow, but that it has remained inactive for long time. * Honour such devices a reasonable good (hopefully IO efficient) * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. */ if (thresh > dtc->dirty) { if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100); else wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8); } wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); if (wb_thresh > wb_max_thresh) wb_thresh = wb_max_thresh; return wb_thresh; } unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; domain_dirty_avail(&gdtc, true); return __wb_calc_thresh(&gdtc, thresh); } unsigned long cgwb_calc_thresh(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; domain_dirty_avail(&gdtc, true); domain_dirty_avail(&mdtc, true); domain_dirty_limits(&mdtc); return __wb_calc_thresh(&mdtc, mdtc.thresh); } /* * setpoint - dirty 3 * f(dirty) := 1.0 + (----------------) * limit - setpoint * * it's a 3rd order polynomial that subjects to * * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast * (2) f(setpoint) = 1.0 => the balance point * (3) f(limit) = 0 => the hard limit * (4) df/dx <= 0 => negative feedback control * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ static long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, (limit - setpoint) | 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio += 1 << RATELIMIT_CALC_SHIFT; return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); } /* * Dirty position control. * * (o) global/bdi setpoints * * We want the dirty pages be balanced around the global/wb setpoints. * When the number of dirty pages is higher/lower than the setpoint, the * dirty position control ratio (and hence task dirty ratelimit) will be * decreased/increased to bring the dirty pages back to the setpoint. * * pos_ratio = 1 << RATELIMIT_CALC_SHIFT * * if (dirty < setpoint) scale up pos_ratio * if (dirty > setpoint) scale down pos_ratio * * if (wb_dirty < wb_setpoint) scale up pos_ratio * if (wb_dirty > wb_setpoint) scale down pos_ratio * * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT * * (o) global control line * * ^ pos_ratio * | * | |<===== global dirty control scope ======>| * 2.0 * * * * * * * * | .* * | . * * | . * * | . * * | . * * | . * * 1.0 ................................* * | . . * * | . . * * | . . * * | . . * * | . . * * 0 +------------.------------------.----------------------*-------------> * freerun^ setpoint^ limit^ dirty pages * * (o) wb control line * * ^ pos_ratio * | * | * * | * * | * * | * * | * |<=========== span ============>| * 1.0 .......................* * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * 1/4 ...............................................* * * * * * * * * * * * * | . . * | . . * | . . * 0 +----------------------.-------------------------------.-------------> * wb_setpoint^ x_intercept^ * * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can * be smoothly throttled down to normal if it starts high in situations like * - start writing to a slow SD card and a fast disk at the same time. The SD * card's wb_dirty may rush to many times higher than wb_setpoint. * - the wb dirty thresh drops quickly due to change of JBOD workload */ static void wb_position_ratio(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ unsigned long wb_setpoint; unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; dtc->pos_ratio = 0; if (unlikely(dtc->dirty >= limit)) return; /* * global setpoint * * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); /* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For * such filesystems balance_dirty_pages always checks wb counters * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to * 1% by default. Without strictlimit feature, fuse writeback may * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * * Here, in wb_position_ratio(), we calculate pos_ratio based on * two values: wb_dirty and wb_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is * about ~6K pages (as the average of background and throttle wb * limits). The 3rd order polynomial will provide positive feedback if * wb_dirty is under wb_setpoint and vice versa. * * Note, that we cannot use global counters in these calculations * because we want to throttle process writing to a strictlimit wb * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; if (dtc->wb_dirty >= wb_thresh) return; wb_setpoint = dirty_freerun_ceiling(wb_thresh, dtc->wb_bg_thresh); if (wb_setpoint == 0 || wb_setpoint == wb_thresh) return; wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, wb_thresh); /* * Typically, for strictlimit case, wb_setpoint << setpoint * and pos_ratio >> wb_pos_ratio. In the other words global * state ("dirty") is not limiting factor and we have to * make decision based on wb counters. But there is an * important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other * wb's) while given strictlimit wb is below limit. * * "pos_ratio * wb_pos_ratio" would work for the case above, * but it would look too non-natural for the case of all * activity in the system coming from a single strictlimit wb * with bdi->max_ratio == 100%. * * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 * (when globally we are at freerun and wb is well below wb * setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); return; } /* * We have computed basic pos_ratio above based on global situation. If * the wb is over/under its share of dirty pages, we want to scale * pos_ratio further down/up. That is done by the following mechanism. */ /* * wb setpoint * * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) * * x_intercept - wb_dirty * := -------------------------- * x_intercept - wb_setpoint * * The main wb control line is a linear function that subjects to * * (1) f(wb_setpoint) = 1.0 * (2) k = - 1 / (8 * write_bw) (in single wb case) * or equally: x_intercept = wb_setpoint + 8 * write_bw * * For single wb case, the dirty pages are observed to fluctuate * regularly within range * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] * for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its * own size, so move the slope over accordingly and choose a slope that * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. */ if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh */ x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); wb_setpoint = setpoint * (u64)x >> 16; /* * Use span=(8*write_bw) in single wb case as indicated by * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. * * wb_thresh thresh - wb_thresh * span = --------- * (8 * write_bw) + ------------------ * wb_thresh * thresh thresh */ span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; x_intercept = wb_setpoint + span; if (dtc->wb_dirty < x_intercept - span / 4) { pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), (x_intercept - wb_setpoint) | 1); } else pos_ratio /= 4; /* * wb reserve area, safeguard against dirty pool underrun and disk idle * It may push the desired control point of global dirty pages higher * than setpoint. */ x_intercept = wb_thresh / 2; if (dtc->wb_dirty < x_intercept) { if (dtc->wb_dirty > x_intercept / 8) pos_ratio = div_u64(pos_ratio * x_intercept, dtc->wb_dirty); else pos_ratio *= 8; } dtc->pos_ratio = pos_ratio; } static void wb_update_write_bandwidth(struct bdi_writeback *wb, unsigned long elapsed, unsigned long written) { const unsigned long period = roundup_pow_of_two(3 * HZ); unsigned long avg = wb->avg_write_bandwidth; unsigned long old = wb->write_bandwidth; u64 bw; /* * bw = written * HZ / elapsed * * bw * elapsed + write_bandwidth * (period - elapsed) * write_bandwidth = --------------------------------------------------- * period * * @written may have decreased due to folio_redirty_for_writepage(). * Avoid underflowing @bw calculation. */ bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { bw = div64_ul(bw, elapsed); avg = bw; goto out; } bw += (u64)wb->write_bandwidth * (period - elapsed); bw >>= ilog2(period); /* * one more level of smoothing, for filtering out sudden spikes */ if (avg > old && old >= (unsigned long)bw) avg -= (avg - old) >> 3; if (avg < old && old <= (unsigned long)bw) avg += (old - avg) >> 3; out: /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ avg = max(avg, 1LU); if (wb_has_dirty_io(wb)) { long delta = avg - wb->avg_write_bandwidth; WARN_ON_ONCE(atomic_long_add_return(delta, &wb->bdi->tot_write_bandwidth) <= 0); } wb->write_bandwidth = bw; WRITE_ONCE(wb->avg_write_bandwidth, avg); } static void update_dirty_limit(struct dirty_throttle_control *dtc) { struct wb_domain *dom = dtc_dom(dtc); unsigned long thresh = dtc->thresh; unsigned long limit = dom->dirty_limit; /* * Follow up in one step. */ if (limit < thresh) { limit = thresh; goto update; } /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce * dom->dirty_limit which is guaranteed to lie above the dirty pages. */ thresh = max(thresh, dtc->dirty); if (limit > thresh) { limit -= (limit - thresh) >> 5; goto update; } return; update: dom->dirty_limit = limit; } static void domain_update_dirty_limit(struct dirty_throttle_control *dtc, unsigned long now) { struct wb_domain *dom = dtc_dom(dtc); /* * check locklessly first to optimize away locking for the most time */ if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) return; spin_lock(&dom->lock); if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { update_dirty_limit(dtc); dom->dirty_limit_tstamp = now; } spin_unlock(&dom->lock); } /* * Maintain wb->dirty_ratelimit, the base dirty throttle rate. * * Normal wb tasks will be curbed at or below it in long term. * Obviously it should be around (write_bw / N) when there are N dd tasks. */ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long dirtied, unsigned long elapsed) { struct bdi_writeback *wb = dtc->wb; unsigned long dirty = dtc->dirty; unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long setpoint = (freerun + limit) / 2; unsigned long write_bw = wb->avg_write_bandwidth; unsigned long dirty_ratelimit = wb->dirty_ratelimit; unsigned long dirty_rate; unsigned long task_ratelimit; unsigned long balanced_dirty_ratelimit; unsigned long step; unsigned long x; unsigned long shift; /* * The dirty rate will match the writeout rate in long term, except * when dirty pages are truncated by userspace or re-dirtied by FS. */ dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; /* * task_ratelimit reflects each dd's dirty rate for the past 200ms. */ task_ratelimit = (u64)dirty_ratelimit * dtc->pos_ratio >> RATELIMIT_CALC_SHIFT; task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ /* * A linear estimation of the "balanced" throttle rate. The theory is, * if there are N dd tasks, each throttled at task_ratelimit, the wb's * dirty_rate will be measured to be (N * task_ratelimit). So the below * formula will yield the balanced rate limit (write_bw / N). * * Note that the expanded form is not a pure rate feedback: * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1) * but also takes pos_ratio into account: * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2) * * (1) is not realistic because pos_ratio also takes part in balancing * the dirty rate. Consider the state * pos_ratio = 0.5 (3) * rate = 2 * (write_bw / N) (4) * If (1) is used, it will stuck in that state! Because each dd will * be throttled at * task_ratelimit = pos_ratio * rate = (write_bw / N) (5) * yielding * dirty_rate = N * task_ratelimit = write_bw (6) * put (6) into (1) we get * rate_(i+1) = rate_(i) (7) * * So we end up using (2) to always keep * rate_(i+1) ~= (write_bw / N) (8) * regardless of the value of pos_ratio. As long as (8) is satisfied, * pos_ratio is able to drive itself to 1.0, which is not only where * the dirty count meet the setpoint, but also where the slope of * pos_ratio is most flat and hence task_ratelimit is least fluctuated. */ balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, dirty_rate | 1); /* * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw */ if (unlikely(balanced_dirty_ratelimit > write_bw)) balanced_dirty_ratelimit = write_bw; /* * We could safely do this and return immediately: * * wb->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated * code makes use of task_ratelimit to filter out singular points and * limit the step size. * * The below code essentially only uses the relative value of * * task_ratelimit - dirty_ratelimit * = (pos_ratio - 1) * dirty_ratelimit * * which reflects the direction and size of dirty position error. */ /* * dirty_ratelimit will follow balanced_dirty_ratelimit iff * task_ratelimit is on the same side of dirty_ratelimit, too. * For example, when * - dirty_ratelimit > balanced_dirty_ratelimit * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint) * lowering dirty_ratelimit will help meet both the position and rate * control targets. Otherwise, don't update dirty_ratelimit if it will * only help meet the rate target. After all, what the users ultimately * feel and care are stable dirty rate and small position error. * * |task_ratelimit - dirty_ratelimit| is used to limit the step size * and filter out the singular points of balanced_dirty_ratelimit. Which * keeps jumping around randomly and can even leap far away at times * due to the small 200ms estimation period of dirty_rate (we want to * keep that period small to reduce time lags). */ step = 0; /* * For strictlimit case, calculations above were based on wb counters * and limits (starting from pos_ratio = wb_position_ratio() and up to * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use wb_dirty as * "dirty" and wb_setpoint as "setpoint". */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = dtc->wb_dirty; setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { x = min3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { x = max3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit > x) step = dirty_ratelimit - x; } /* * Don't pursue 100% rate matching. It's impossible since the balanced * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ shift = dirty_ratelimit / (2 * step + 1); if (shift < BITS_PER_LONG) step = DIV_ROUND_UP(step >> shift, 8); else step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; else dirty_ratelimit -= step; WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL)); wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); } static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, struct dirty_throttle_control *mdtc, bool update_ratelimit) { struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; unsigned long elapsed; unsigned long dirtied; unsigned long written; spin_lock(&wb->list_lock); /* * Lockless checks for elapsed time are racy and delayed update after * IO completion doesn't do it at all (to make sure written pages are * accounted reasonably quickly). Make sure elapsed >= 1 to avoid * division errors. */ elapsed = max(now - wb->bw_time_stamp, 1UL); dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); if (update_ratelimit) { domain_update_dirty_limit(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); /* * @mdtc is always NULL if !CGROUP_WRITEBACK but the * compiler has no way to figure that out. Help it. */ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { domain_update_dirty_limit(mdtc, now); wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); } } wb_update_write_bandwidth(wb, elapsed, written); wb->dirtied_stamp = dirtied; wb->written_stamp = written; WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; __wb_update_bandwidth(&gdtc, NULL, false); } /* Interval after which we consider wb idle and don't estimate bandwidth */ #define WB_BANDWIDTH_IDLE_JIF (HZ) static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) { unsigned long now = jiffies; unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp); if (elapsed > WB_BANDWIDTH_IDLE_JIF && !atomic_read(&wb->writeback_inodes)) { spin_lock(&wb->list_lock); wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); wb->written_stamp = wb_stat(wb, WB_WRITTEN); WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } } /* * After a task dirtied this many pages, balance_dirty_pages_ratelimited() * will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive * global_zone_page_state() too often. So scale it near-sqrt to the safety margin * (the number of pages we may dirty without exceeding the dirty limits). */ static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh) { if (thresh > dirty) return 1UL << (ilog2(thresh - dirty) >> 1); return 1; } static unsigned long wb_max_pause(struct bdi_writeback *wb, unsigned long wb_dirty) { unsigned long bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long t; /* * Limit pause time for small memory systems. If sleeping for too long * time, a small pool of dirty/writeback pages may go empty and disk go * idle. * * 8 serves as the safety ratio. */ t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; return min_t(unsigned long, t, MAX_PAUSE); } static long wb_min_pause(struct bdi_writeback *wb, long max_pause, unsigned long task_ratelimit, unsigned long dirty_ratelimit, int *nr_dirtied_pause) { long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth)); long lo = ilog2(READ_ONCE(wb->dirty_ratelimit)); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ /* target for 10ms pause on 1-dd case */ t = max(1, HZ / 100); /* * Scale up pause time for concurrent dirtiers in order to reduce CPU * overheads. * * (N * 10ms) on 2^N concurrent tasks. */ if (hi > lo) t += (hi - lo) * (10 * HZ) / 1024; /* * This is a bit convoluted. We try to base the next nr_dirtied_pause * on the much more stable dirty_ratelimit. However the next pause time * will be computed based on task_ratelimit and the two rate limits may * depart considerably at some time. Especially if task_ratelimit goes * below dirty_ratelimit/2 and the target pause is max_pause, the next * pause time will be max_pause*2 _trimmed down_ to max_pause. As a * result task_ratelimit won't be executed faithfully, which could * eventually bring down dirty_ratelimit. * * We apply two rules to fix it up: * 1) try to estimate the next pause time and if necessary, use a lower * nr_dirtied_pause so as not to exceed max_pause. When this happens, * nr_dirtied_pause will be "dancing" with task_ratelimit. * 2) limit the target pause time to max_pause/2, so that the normal * small fluctuations of task_ratelimit won't trigger rule (1) and * nr_dirtied_pause will remain as stable as dirty_ratelimit. */ t = min(t, 1 + max_pause / 2); pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); /* * Tiny nr_dirtied_pause is found to hurt I/O performance in the test * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. * When the 16 consecutive reads are often interrupted by some dirty * throttling pause during the async writes, cfq will go into idles * (deadline is fine). So push nr_dirtied_pause as high as possible * until reaches DIRTY_POLL_THRESH=32 pages. */ if (pages < DIRTY_POLL_THRESH) { t = max_pause; pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); if (pages > DIRTY_POLL_THRESH) { pages = DIRTY_POLL_THRESH; t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; } } pause = HZ * pages / (task_ratelimit + 1); if (pause > max_pause) { t = max_pause; pages = task_ratelimit * t / roundup_pow_of_two(HZ); } *nr_dirtied_pause = pages; /* * The minimal pause time will normally be half the target pause time. */ return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long wb_reclaimable; /* * wb_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons * - in JBOD setup, wb_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow * go into state (wb_dirty >> wb_thresh) either because * wb_dirty starts high, or because wb_thresh drops low. * In this case we don't want to hard throttle the USB key * dirtiers for 100 seconds until wb_dirty drops under * wb_thresh. Instead the auxiliary wb control line in * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh); dtc->wb_bg_thresh = dtc->thresh ? div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. * * Otherwise it would be possible to get thresh+n pages * reported dirty, even though there are thresh-m pages * actually dirty; with m+n sitting in the percpu * deltas. */ if (dtc->wb_thresh < 2 * wb_stat_error()) { wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); } } static unsigned long domain_poll_intv(struct dirty_throttle_control *dtc, bool strictlimit) { unsigned long dirty, thresh; if (strictlimit) { dirty = dtc->wb_dirty; thresh = dtc->wb_thresh; } else { dirty = dtc->dirty; thresh = dtc->thresh; } return dirty_poll_interval(dirty, thresh); } /* * Throttle it only when the background writeback cannot catch-up. This avoids * (excessively) small writeouts when the wb limits are ramping up in case of * !strictlimit. * * In strictlimit case make decision based on the wb counters and limits. Small * writeouts when the wb limits are ramping up are the price we consciously pay * for strictlimit-ing. */ static void domain_dirty_freerun(struct dirty_throttle_control *dtc, bool strictlimit) { unsigned long dirty, thresh, bg_thresh; if (unlikely(strictlimit)) { wb_dirty_limits(dtc); dirty = dtc->wb_dirty; thresh = dtc->wb_thresh; bg_thresh = dtc->wb_bg_thresh; } else { dirty = dtc->dirty; thresh = dtc->thresh; bg_thresh = dtc->bg_thresh; } dtc->freerun = dirty <= dirty_freerun_ceiling(thresh, bg_thresh); } static void balance_domain_limits(struct dirty_throttle_control *dtc, bool strictlimit) { domain_dirty_avail(dtc, true); domain_dirty_limits(dtc); domain_dirty_freerun(dtc, strictlimit); } static void wb_dirty_freerun(struct dirty_throttle_control *dtc, bool strictlimit) { dtc->freerun = false; /* was already handled in domain_dirty_freerun */ if (strictlimit) return; wb_dirty_limits(dtc); /* * LOCAL_THROTTLE tasks must not be throttled when below the per-wb * freerun ceiling. */ if (!(current->flags & PF_LOCAL_THROTTLE)) return; dtc->freerun = dtc->wb_dirty < dirty_freerun_ceiling(dtc->wb_thresh, dtc->wb_bg_thresh); } static inline void wb_dirty_exceeded(struct dirty_throttle_control *dtc, bool strictlimit) { dtc->dirty_exceeded = (dtc->wb_dirty > dtc->wb_thresh) && ((dtc->dirty > dtc->thresh) || strictlimit); } /* * The limits fields dirty_exceeded and pos_ratio won't be updated if wb is * in freerun state. Please don't use these invalid fields in freerun case. */ static void balance_wb_limits(struct dirty_throttle_control *dtc, bool strictlimit) { wb_dirty_freerun(dtc, strictlimit); if (dtc->freerun) return; wb_dirty_exceeded(dtc, strictlimit); wb_position_ratio(dtc); } /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ static int balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied, unsigned int flags) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; struct dirty_throttle_control * const gdtc = &gdtc_stor; struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; unsigned long nr_dirty; long period; long pause; long max_pause; long min_pause; int nr_dirtied_pause; unsigned long task_ratelimit; unsigned long dirty_ratelimit; struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; int ret = 0; for (;;) { unsigned long now = jiffies; nr_dirty = global_node_page_state(NR_FILE_DIRTY); balance_domain_limits(gdtc, strictlimit); if (mdtc) { /* * If @wb belongs to !root memcg, repeat the same * basic calculations for the memcg domain. */ balance_domain_limits(mdtc, strictlimit); } /* * In laptop mode, we wait until hitting the higher threshold * before starting background writeout, and then write out all * the way down to the lower threshold. So slow writers cause * minimal disk activity. * * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ if (!laptop_mode && nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb); /* * If memcg domain is in effect, @dirty should be under * both global and memcg freerun ceilings. */ if (gdtc->freerun && (!mdtc || mdtc->freerun)) { unsigned long intv; unsigned long m_intv; free_running: intv = domain_poll_intv(gdtc, strictlimit); m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; if (mdtc) m_intv = domain_poll_intv(mdtc, strictlimit); current->nr_dirtied_pause = min(intv, m_intv); break; } /* Start writeback even when in laptop mode */ if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); mem_cgroup_flush_foreign(wb); /* * Calculate global domain's pos_ratio and select the * global dtc by default. */ balance_wb_limits(gdtc, strictlimit); if (gdtc->freerun) goto free_running; sdtc = gdtc; if (mdtc) { /* * If memcg domain is in effect, calculate its * pos_ratio. @wb should satisfy constraints from * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ balance_wb_limits(mdtc, strictlimit); if (mdtc->freerun) goto free_running; if (mdtc->pos_ratio < gdtc->pos_ratio) sdtc = mdtc; } wb->dirty_exceeded = gdtc->dirty_exceeded || (mdtc && mdtc->dirty_exceeded); if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) __wb_update_bandwidth(gdtc, mdtc, true); /* throttle according to the chosen dtc */ dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; max_pause = wb_max_pause(wb, sdtc->wb_dirty); min_pause = wb_min_pause(wb, max_pause, task_ratelimit, dirty_ratelimit, &nr_dirtied_pause); if (unlikely(task_ratelimit == 0)) { period = max_pause; pause = max_pause; goto pause; } period = HZ * pages_dirtied / task_ratelimit; pause = period; if (current->dirty_paused_when) pause -= now - current->dirty_paused_when; /* * For less than 1s think time (ext3/4 may block the dirtier * for up to 800ms from time to time on 1-HDD; so does xfs, * however at much less frequency), try to compensate it in * future periods by updating the virtual time; otherwise just * do a reset, as it may be a light dirtier. */ if (pause < min_pause) { trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, sdtc->wb_thresh, sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, period, min(pause, 0L), start_time); if (pause < -HZ) { current->dirty_paused_when = now; current->nr_dirtied = 0; } else if (period) { current->dirty_paused_when += period; current->nr_dirtied = 0; } else if (current->nr_dirtied_pause <= pages_dirtied) current->nr_dirtied_pause += pages_dirtied; break; } if (unlikely(pause > max_pause)) { /* for occasional dropped task_ratelimit */ now += min(pause - max_pause, max_pause); pause = max_pause; } pause: trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, sdtc->wb_thresh, sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, period, pause, start_time); if (flags & BDP_ASYNC) { ret = -EAGAIN; break; } __set_current_state(TASK_KILLABLE); bdi->last_bdp_sleep = jiffies; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; current->nr_dirtied = 0; current->nr_dirtied_pause = nr_dirtied_pause; /* * This is typically equal to (dirty < thresh) and can also * keep "1000+ dd on a slow USB stick" under control. */ if (task_ratelimit) break; /* * In the case of an unresponsive NFS server and the NFS dirty * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. * * In theory 1 page is enough to keep the consumer-producer * pipe going: the flusher cleans 1 page => the task dirties 1 * more page. However wb_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. */ if (sdtc->wb_dirty <= wb_stat_error()) break; if (fatal_signal_pending(current)) break; } return ret; } static DEFINE_PER_CPU(int, bdp_ratelimits); /* * Normal tasks are throttled by * loop { * dirty tsk->nr_dirtied_pause pages; * take a snap in balance_dirty_pages(); * } * However there is a worst case. If every task exit immediately when dirtied * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be * called to throttle the page dirties. The solution is to save the not yet * throttled page dirties in dirty_throttle_leaks on task exit and charge them * randomly into the running tasks. This works well for the above worst case, * as the new task will pick up and accumulate the old task's leaked dirty * count and eventually get throttled. */ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; /** * balance_dirty_pages_ratelimited_flags - Balance dirty memory state. * @mapping: address_space which was dirtied. * @flags: BDP flags. * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * * See balance_dirty_pages_ratelimited() for details. * * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to * indicate that memory is out of balance and the caller must wait * for I/O to complete. Otherwise, it will return 0 to indicate * that either memory was already in balance, or it was able to sleep * until the amount of dirty memory returned to balance. */ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, unsigned int flags) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; int ratelimit; int ret = 0; int *p; if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) return ret; if (inode_cgwb_enabled(inode)) wb = wb_get_create_current(bdi, GFP_KERNEL); if (!wb) wb = &bdi->wb; ratelimit = current->nr_dirtied_pause; if (wb->dirty_exceeded) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); preempt_disable(); /* * This prevents one CPU to accumulate too many dirtied pages without * calling into balance_dirty_pages(), which can happen when there are * 1000+ tasks, all of them start dirtying pages at exactly the same * time, hence all honoured too large initial task->nr_dirtied_pause. */ p = this_cpu_ptr(&bdp_ratelimits); if (unlikely(current->nr_dirtied >= ratelimit)) *p = 0; else if (unlikely(*p >= ratelimit_pages)) { *p = 0; ratelimit = 0; } /* * Pick up the dirtied pages by the exited tasks. This avoids lots of * short-lived tasks (eg. gcc invocations in a kernel build) escaping * the dirty throttling and livelock other long-run dirtiers. */ p = this_cpu_ptr(&dirty_throttle_leaks); if (*p > 0 && current->nr_dirtied < ratelimit) { unsigned long nr_pages_dirtied; nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); *p -= nr_pages_dirtied; current->nr_dirtied += nr_pages_dirtied; } preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) ret = balance_dirty_pages(wb, current->nr_dirtied, flags); wb_put(wb); return ret; } EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags); /** * balance_dirty_pages_ratelimited - balance dirty memory state. * @mapping: address_space which was dirtied. * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * * Once we're over the dirty memory limit we decrease the ratelimiting * by a lot, to prevent individual processes from overshooting the limit * by (ratelimit_pages) each. */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { balance_dirty_pages_ratelimited_flags(mapping, 0); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); /* * Similar to wb_dirty_limits, wb_bg_dirty_limits also calculates dirty * and thresh, but it's for background writeback. */ static void wb_bg_dirty_limits(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; dtc->wb_bg_thresh = __wb_calc_thresh(dtc, dtc->bg_thresh); if (dtc->wb_bg_thresh < 2 * wb_stat_error()) dtc->wb_dirty = wb_stat_sum(wb, WB_RECLAIMABLE); else dtc->wb_dirty = wb_stat(wb, WB_RECLAIMABLE); } static bool domain_over_bg_thresh(struct dirty_throttle_control *dtc) { domain_dirty_avail(dtc, false); domain_dirty_limits(dtc); if (dtc->dirty > dtc->bg_thresh) return true; wb_bg_dirty_limits(dtc); if (dtc->wb_dirty > dtc->wb_bg_thresh) return true; return false; } /** * wb_over_bg_thresh - does @wb need to be written back? * @wb: bdi_writeback of interest * * Determines whether background writeback should keep writing @wb or it's * clean enough. * * Return: %true if writeback should continue. */ bool wb_over_bg_thresh(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; if (domain_over_bg_thresh(&gdtc)) return true; if (mdtc_valid(&mdtc)) return domain_over_bg_thresh(&mdtc); return false; } #ifdef CONFIG_SYSCTL /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; ret = proc_dointvec(table, write, buffer, length, ppos); /* * Writing 0 to dirty_writeback_interval will disable periodic writeback * and a different non-zero value will wakeup the writeback threads. * wb_wakeup_delayed() would be more appropriate, but it's a pain to * iterate over all bdis and wbs. * The reason we do this is to make the change take effect immediately. */ if (!ret && write && dirty_writeback_interval && dirty_writeback_interval != old_interval) wakeup_flusher_threads(WB_REASON_PERIODIC); return ret; } #endif void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = from_timer(backing_dev_info, t, laptop_mode_wb_timer); wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); } /* * We've spun up the disk and we're in laptop mode: schedule writeback * of all dirty data a few seconds from now. If the flush is already scheduled * then push it back - the user is still using the disk. */ void laptop_io_completion(struct backing_dev_info *info) { mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); } /* * We're in laptop mode and we've just synced. The sync's writes will have * caused another writeback to be scheduled by laptop_io_completion. * Nothing needs to be written back anymore, so we unschedule the writeback. */ void laptop_sync_completion(void) { struct backing_dev_info *bdi; rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) del_timer(&bdi->laptop_mode_wb_timer); rcu_read_unlock(); } /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. * * Here we set ratelimit_pages to a level which ensures that when all CPUs are * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory * thresholds. */ void writeback_set_ratelimit(void) { struct wb_domain *dom = &global_wb_domain; unsigned long background_thresh; unsigned long dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); dom->dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; } static int page_writeback_cpu_online(unsigned int cpu) { writeback_set_ratelimit(); return 0; } #ifdef CONFIG_SYSCTL /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static const struct ctl_table vm_page_writeback_sysctls[] = { { .procname = "dirty_background_ratio", .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, .proc_handler = dirty_background_ratio_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_background_bytes", .data = &dirty_background_bytes, .maxlen = sizeof(dirty_background_bytes), .mode = 0644, .proc_handler = dirty_background_bytes_handler, .extra1 = SYSCTL_LONG_ONE, }, { .procname = "dirty_ratio", .data = &vm_dirty_ratio, .maxlen = sizeof(vm_dirty_ratio), .mode = 0644, .proc_handler = dirty_ratio_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_bytes", .data = &vm_dirty_bytes, .maxlen = sizeof(vm_dirty_bytes), .mode = 0644, .proc_handler = dirty_bytes_handler, .extra1 = (void *)&dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", .data = &dirty_writeback_interval, .maxlen = sizeof(dirty_writeback_interval), .mode = 0644, .proc_handler = dirty_writeback_centisecs_handler, }, { .procname = "dirty_expire_centisecs", .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #ifdef CONFIG_HIGHMEM { .procname = "highmem_is_dirtyable", .data = &vm_highmem_is_dirtyable, .maxlen = sizeof(vm_highmem_is_dirtyable), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "laptop_mode", .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; #endif /* * Called early on to tune the page writeback dirty limits. * * We used to scale dirty pages according to how total memory * related to pages that could be allocated for buffers. * * However, that was when we used "dirty_ratio" to scale with * all memory, and we don't do that any more. "dirty_ratio" * is now applied to total non-HIGHPAGE memory, and as such we can't * get into the old insane situation any more where we had * large amounts of dirty pages compared to a small amount of * non-HIGHMEM memory. * * But we might still want to scale the dirty_ratio by how * much memory the box has.. */ void __init page_writeback_init(void) { BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online", page_writeback_cpu_online, NULL); cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, page_writeback_cpu_online); #ifdef CONFIG_SYSCTL register_sysctl_init("vm", vm_page_writeback_sysctls); #endif } /** * tag_pages_for_writeback - tag pages to be written by writeback * @mapping: address space structure to write * @start: starting page index * @end: ending page index (inclusive) * * This function scans the page range from @start to @end (inclusive) and tags * all pages that have DIRTY tag set with a special TOWRITE tag. The caller * can then use the TOWRITE tag to identify pages eligible for writeback. * This mechanism is used to avoid livelocking of writeback by a process * steadily creating new dirty pages in the file (thus it is important for this * function to be quick so that it can tag pages faster than a dirtying process * can create them). */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); unsigned int tagged = 0; void *page; xas_lock_irq(&xas); xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); if (++tagged % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); } EXPORT_SYMBOL(tag_pages_for_writeback); static bool folio_prepare_writeback(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio) { /* * Folio truncated or invalidated. We can freely skip it then, * even for data integrity operations: the folio has disappeared * concurrently, so there could be no real expectation of this * data integrity operation even if there is now a new, dirty * folio at the same pagecache index. */ if (unlikely(folio->mapping != mapping)) return false; /* * Did somebody else write it for us? */ if (!folio_test_dirty(folio)) return false; if (folio_test_writeback(folio)) { if (wbc->sync_mode == WB_SYNC_NONE) return false; folio_wait_writeback(folio); } BUG_ON(folio_test_writeback(folio)); if (!folio_clear_dirty_for_io(folio)) return false; return true; } static xa_mark_t wbc_to_tag(struct writeback_control *wbc) { if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) return PAGECACHE_TAG_TOWRITE; return PAGECACHE_TAG_DIRTY; } static pgoff_t wbc_end(struct writeback_control *wbc) { if (wbc->range_cyclic) return -1; return wbc->range_end >> PAGE_SHIFT; } static struct folio *writeback_get_folio(struct address_space *mapping, struct writeback_control *wbc) { struct folio *folio; retry: folio = folio_batch_next(&wbc->fbatch); if (!folio) { folio_batch_release(&wbc->fbatch); cond_resched(); filemap_get_folios_tag(mapping, &wbc->index, wbc_end(wbc), wbc_to_tag(wbc), &wbc->fbatch); folio = folio_batch_next(&wbc->fbatch); if (!folio) return NULL; } folio_lock(folio); if (unlikely(!folio_prepare_writeback(mapping, wbc, folio))) { folio_unlock(folio); goto retry; } trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); return folio; } /** * writeback_iter - iterate folio of a mapping for writeback * @mapping: address space structure to write * @wbc: writeback context * @folio: previously iterated folio (%NULL to start) * @error: in-out pointer for writeback errors (see below) * * This function returns the next folio for the writeback operation described by * @wbc on @mapping and should be called in a while loop in the ->writepages * implementation. * * To start the writeback operation, %NULL is passed in the @folio argument, and * for every subsequent iteration the folio returned previously should be passed * back in. * * If there was an error in the per-folio writeback inside the writeback_iter() * loop, @error should be set to the error value. * * Once the writeback described in @wbc has finished, this function will return * %NULL and if there was an error in any iteration restore it to @error. * * Note: callers should not manually break out of the loop using break or goto * but must keep calling writeback_iter() until it returns %NULL. * * Return: the folio to write or %NULL if the loop is done. */ struct folio *writeback_iter(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio, int *error) { if (!folio) { folio_batch_init(&wbc->fbatch); wbc->saved_err = *error = 0; /* * For range cyclic writeback we remember where we stopped so * that we can continue where we stopped. * * For non-cyclic writeback we always start at the beginning of * the passed in range. */ if (wbc->range_cyclic) wbc->index = mapping->writeback_index; else wbc->index = wbc->range_start >> PAGE_SHIFT; /* * To avoid livelocks when other processes dirty new pages, we * first tag pages which should be written back and only then * start writing them. * * For data-integrity writeback we have to be careful so that we * do not miss some pages (e.g., because some other process has * cleared the TOWRITE tag we set). The rule we follow is that * TOWRITE tag can be cleared only by the process clearing the * DIRTY tag (and submitting the page for I/O). */ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, wbc->index, wbc_end(wbc)); } else { wbc->nr_to_write -= folio_nr_pages(folio); WARN_ON_ONCE(*error > 0); /* * For integrity writeback we have to keep going until we have * written all the folios we tagged for writeback above, even if * we run past wbc->nr_to_write or encounter errors. * We stash away the first error we encounter in wbc->saved_err * so that it can be retrieved when we're done. This is because * the file system may still have state to clear for each folio. * * For background writeback we exit as soon as we run past * wbc->nr_to_write or encounter the first error. */ if (wbc->sync_mode == WB_SYNC_ALL) { if (*error && !wbc->saved_err) wbc->saved_err = *error; } else { if (*error || wbc->nr_to_write <= 0) goto done; } } folio = writeback_get_folio(mapping, wbc); if (!folio) { /* * To avoid deadlocks between range_cyclic writeback and callers * that hold pages in PageWriteback to aggregate I/O until * the writeback iteration finishes, we do not loop back to the * start of the file. Doing so causes a page lock/page * writeback access order inversion - we should only ever lock * multiple pages in ascending page->index order, and looping * back to the start of the file violates that rule and causes * deadlocks. */ if (wbc->range_cyclic) mapping->writeback_index = 0; /* * Return the first error we encountered (if there was any) to * the caller. */ *error = wbc->saved_err; } return folio; done: if (wbc->range_cyclic) mapping->writeback_index = folio_next_index(folio); folio_batch_release(&wbc->fbatch); return NULL; } EXPORT_SYMBOL_GPL(writeback_iter); /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @writepage: function called for each page * @data: data passed to writepage function * * Return: %0 on success, negative error code otherwise * * Note: please use writeback_iter() instead. */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) { struct folio *folio = NULL; int error; while ((folio = writeback_iter(mapping, wbc, folio, &error))) { error = writepage(folio, wbc, data); if (error == AOP_WRITEPAGE_ACTIVATE) { folio_unlock(folio); error = 0; } } return error; } EXPORT_SYMBOL(write_cache_pages); static int writeback_use_writepage(struct address_space *mapping, struct writeback_control *wbc) { struct folio *folio = NULL; struct blk_plug plug; int err; blk_start_plug(&plug); while ((folio = writeback_iter(mapping, wbc, folio, &err))) { err = mapping->a_ops->writepage(&folio->page, wbc); if (err == AOP_WRITEPAGE_ACTIVATE) { folio_unlock(folio); err = 0; } mapping_set_error(mapping, err); } blk_finish_plug(&plug); return err; } int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; struct bdi_writeback *wb; if (wbc->nr_to_write <= 0) return 0; wb = inode_to_wb_wbc(mapping->host, wbc); wb_bandwidth_estimate_start(wb); while (1) { if (mapping->a_ops->writepages) { ret = mapping->a_ops->writepages(mapping, wbc); } else if (mapping->a_ops->writepage) { ret = writeback_use_writepage(mapping, wbc); } else { /* deal with chardevs and other special files */ ret = 0; } if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL) break; /* * Lacking an allocation context or the locality or writeback * state of any of the inode's pages, throttle based on * writeback activity on the local node. It's as good a * guess as any. */ reclaim_throttle(NODE_DATA(numa_node_id()), VMSCAN_THROTTLE_WRITEBACK); } /* * Usually few pages are written by now from those we've just submitted * but if there's constant writeback being submitted, this makes sure * writeback bandwidth is updated once in a while. */ if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) wb_update_bandwidth(wb); return ret; } /* * For address_spaces which do not use buffers nor write back. */ bool noop_dirty_folio(struct address_space *mapping, struct folio *folio) { if (!folio_test_dirty(folio)) return !folio_test_set_dirty(folio); return false; } EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. * * NOTE: This relies on being atomic wrt interrupts. */ static void folio_account_dirtied(struct folio *folio, struct address_space *mapping) { struct inode *inode = mapping->host; trace_writeback_dirty_folio(folio, mapping); if (mapping_can_writeback(mapping)) { struct bdi_writeback *wb; long nr = folio_nr_pages(folio); inode_attach_wb(inode, folio); wb = inode_to_wb(inode); __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); __node_stat_mod_folio(folio, NR_DIRTIED, nr); wb_stat_mod(wb, WB_RECLAIMABLE, nr); wb_stat_mod(wb, WB_DIRTIED, nr); task_io_account_write(nr * PAGE_SIZE); current->nr_dirtied += nr; __this_cpu_add(bdp_ratelimits, nr); mem_cgroup_track_foreign_dirty(folio, wb); } } /* * Helper function for deaccounting dirty page without writeback. * */ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { long nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); wb_stat_mod(wb, WB_RECLAIMABLE, -nr); task_io_account_cancelled_write(nr * PAGE_SIZE); } /* * Mark the folio dirty, and set it dirty in the page cache. * * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * * It is the caller's responsibility to prevent the folio from being truncated * while this function is in progress, although it may have been truncated * before this function is called. Most callers have the folio locked. * A few have the folio blocked from truncation through other means (e.g. * zap_vma_pages() has it mapped and is holding the page table lock). * When called from mark_buffer_dirty(), the filesystem should hold a * reference to the buffer_head that is being marked dirty, which causes * try_to_free_buffers() to fail. */ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, int warn) { unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); if (folio->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); folio_account_dirtied(folio, mapping); __xa_set_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } /** * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads. * @mapping: Address space this folio belongs to. * @folio: Folio to be marked as dirty. * * Filesystems which do not use buffer heads should call this function * from their dirty_folio address space operation. It ignores the * contents of folio_get_private(), so if the filesystem marks individual * blocks as dirty, the filesystem should handle that itself. * * This is also sometimes used by filesystems which use buffer_heads when * a single buffer is being dirtied: we want to set the folio dirty in * that case, but not all the buffers. This is a "bottom-up" dirtying, * whereas block_dirty_folio() is a "top-down" dirtying. * * The caller must ensure this doesn't race with truncation. Most will * simply hold the folio lock, but e.g. zap_pte_range() calls with the * folio mapped and the pte lock held, which also locks out truncation. */ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) { if (folio_test_set_dirty(folio)) return false; __folio_mark_dirty(folio, mapping, !folio_test_private(folio)); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return true; } EXPORT_SYMBOL(filemap_dirty_folio); /** * folio_redirty_for_writepage - Decline to write a dirty folio. * @wbc: The writeback control. * @folio: The folio. * * When a writepage implementation decides that it doesn't want to write * @folio for some reason, it should call this function, unlock @folio and * return 0. * * Return: True if we redirtied the folio. False if someone else dirtied * it first. */ bool folio_redirty_for_writepage(struct writeback_control *wbc, struct folio *folio) { struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); bool ret; wbc->pages_skipped += nr; ret = filemap_dirty_folio(mapping, folio); if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied -= nr; node_stat_mod_folio(folio, NR_DIRTIED, -nr); wb_stat_mod(wb, WB_DIRTIED, -nr); unlocked_inode_to_wb_end(inode, &cookie); } return ret; } EXPORT_SYMBOL(folio_redirty_for_writepage); /** * folio_mark_dirty - Mark a folio as being modified. * @folio: The folio. * * The folio may not be truncated while this function is running. * Holding the folio lock is sufficient to prevent truncation, but some * callers cannot acquire a sleeping lock. These callers instead hold * the page table lock for a page table which contains at least one page * in this folio. Truncation will block on the page table lock as it * unmaps pages before removing the folio from its mapping. * * Return: True if the folio was newly dirtied, false if it was already dirty. */ bool folio_mark_dirty(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); if (likely(mapping)) { /* * readahead/folio_deactivate could remain * PG_readahead/PG_reclaim due to race with folio_end_writeback * About readahead, if the folio is written, the flags would be * reset. So no problem. * About folio_deactivate, if the folio is redirtied, * the flag will be reset. So no problem. but if the * folio is used by readahead it will confuse readahead * and make it restart the size rampup process. But it's * a trivial problem. */ if (folio_test_reclaim(folio)) folio_clear_reclaim(folio); return mapping->a_ops->dirty_folio(mapping, folio); } return noop_dirty_folio(mapping, folio); } EXPORT_SYMBOL(folio_mark_dirty); /* * folio_mark_dirty() is racy if the caller has no reference against * folio->mapping->host, and if the folio is unlocked. This is because another * CPU could truncate the folio off the mapping and then free the mapping. * * Usually, the folio _is_ locked, or the caller is a user-space process which * holds a reference on the inode by having an open file. * * In other cases, the folio should be locked before running folio_mark_dirty(). */ bool folio_mark_dirty_lock(struct folio *folio) { bool ret; folio_lock(folio); ret = folio_mark_dirty(folio); folio_unlock(folio); return ret; } EXPORT_SYMBOL(folio_mark_dirty_lock); /* * This cancels just the dirty bit on the kernel page itself, it does NOT * actually remove dirty bits on any mmap's that may be around. It also * leaves the page tagged dirty, so any sync activity will still find it on * the dirty lists, and in particular, clear_page_dirty_for_io() will still * look at the dirty bits in the VM. * * Doing this should *normally* only ever be done when a page is truncated, * and is not actually mapped anywhere at all. However, fs/buffer.c does * this when it notices that somebody has cleaned out all the buffers on a * page without actually doing it through the VM. Can you say "ext3 is * horribly ugly"? Thought you could. */ void __folio_cancel_dirty(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); if (mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) folio_account_cleaned(folio, wb); unlocked_inode_to_wb_end(inode, &cookie); } else { folio_clear_dirty(folio); } } EXPORT_SYMBOL(__folio_cancel_dirty); /* * Clear a folio's dirty flag, while caring for dirty memory accounting. * Returns true if the folio was previously dirty. * * This is for preparing to put the folio under writeout. We leave * the folio tagged as dirty in the xarray so that a concurrent * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk. * The ->writepage implementation will run either folio_start_writeback() * or folio_mark_dirty(), at which stage we bring the folio's dirty flag * and xarray dirty tag back into sync. * * This incoherency between the folio's dirty flag and xarray tag is * unfortunate, but it only exists while the folio is locked. */ bool folio_clear_dirty_for_io(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); bool ret = false; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. * * We use this sequence to make sure that * (a) we account for dirty stats properly * (b) we tell the low-level filesystem to * mark the whole folio dirty if it was * dirty in a pagetable. Only to then * (c) clean the folio again and return 1 to * cause the writeback. * * This way we avoid all nasty races with the * dirty bit in multiple places and clearing * them concurrently from different threads. * * Note! Normally the "folio_mark_dirty(folio)" * has no effect on the actual dirty bit - since * that will already usually be set. But we * need the side effects, and it can help us * avoid races. * * We basically use the folio "master dirty bit" * as a serialization point for all the different * threads doing their things. */ if (folio_mkclean(folio)) folio_mark_dirty(folio); /* * We carefully synchronise fault handlers against * installing a dirty pte and marking the folio dirty * at this point. We do this by having them hold the * page lock while dirtying the folio, and folios are * always locked coming in here, so we get the desired * exclusion. */ wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) { long nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); wb_stat_mod(wb, WB_RECLAIMABLE, -nr); ret = true; } unlocked_inode_to_wb_end(inode, &cookie); return ret; } return folio_test_clear_dirty(folio); } EXPORT_SYMBOL(folio_clear_dirty_for_io); static void wb_inode_writeback_start(struct bdi_writeback *wb) { atomic_inc(&wb->writeback_inodes); } static void wb_inode_writeback_end(struct bdi_writeback *wb) { unsigned long flags; atomic_dec(&wb->writeback_inodes); /* * Make sure estimate of writeback throughput gets updated after * writeback completed. We delay the update by BANDWIDTH_INTERVAL * (which is the interval other bandwidth updates use for batching) so * that if multiple inodes end writeback at a similar time, they get * batched into one bandwidth update. */ spin_lock_irqsave(&wb->work_lock, flags); if (test_bit(WB_registered, &wb->state)) queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); spin_unlock_irqrestore(&wb->work_lock, flags); } bool __folio_end_writeback(struct folio *folio) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); bool ret; if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); __xa_clear_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_WRITEBACK); if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); wb_stat_mod(wb, WB_WRITEBACK, -nr); __wb_writeout_add(wb, nr); if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) wb_inode_writeback_end(wb); } if (mapping->host && !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) sb_clear_inode_writeback(mapping->host); xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); } lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); node_stat_mod_folio(folio, NR_WRITTEN, nr); return ret; } void __folio_start_writeback(struct folio *folio, bool keep_write) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); int access_ret; VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; bool on_wblist; xas_lock_irqsave(&xas, flags); xas_load(&xas); folio_test_set_writeback(folio); on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); wb_stat_mod(wb, WB_WRITEBACK, nr); if (!on_wblist) wb_inode_writeback_start(wb); } /* * We can come through here when swapping anonymous * folios, so we don't necessarily have an inode to * track for sync. */ if (mapping->host && !on_wblist) sb_mark_inode_writeback(mapping->host); if (!folio_test_dirty(folio)) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irqrestore(&xas, flags); } else { folio_test_set_writeback(folio); } lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); access_ret = arch_make_folio_accessible(folio); /* * If writeback has been triggered on a page that cannot be made * accessible, it is too late to recover here. */ VM_BUG_ON_FOLIO(access_ret != 0, folio); } EXPORT_SYMBOL(__folio_start_writeback); /** * folio_wait_writeback - Wait for a folio to finish writeback. * @folio: The folio to wait for. * * If the folio is currently being written back to storage, wait for the * I/O to complete. * * Context: Sleeps. Must be called in process context and with * no spinlocks held. Caller should hold a reference on the folio. * If the folio is not locked, writeback may start again after writeback * has finished. */ void folio_wait_writeback(struct folio *folio) { while (folio_test_writeback(folio)) { trace_folio_wait_writeback(folio, folio_mapping(folio)); folio_wait_bit(folio, PG_writeback); } } EXPORT_SYMBOL_GPL(folio_wait_writeback); /** * folio_wait_writeback_killable - Wait for a folio to finish writeback. * @folio: The folio to wait for. * * If the folio is currently being written back to storage, wait for the * I/O to complete or a fatal signal to arrive. * * Context: Sleeps. Must be called in process context and with * no spinlocks held. Caller should hold a reference on the folio. * If the folio is not locked, writeback may start again after writeback * has finished. * Return: 0 on success, -EINTR if we get a fatal signal while waiting. */ int folio_wait_writeback_killable(struct folio *folio) { while (folio_test_writeback(folio)) { trace_folio_wait_writeback(folio, folio_mapping(folio)); if (folio_wait_bit_killable(folio, PG_writeback)) return -EINTR; } return 0; } EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); /** * folio_wait_stable() - wait for writeback to finish, if necessary. * @folio: The folio to wait on. * * This function determines if the given folio is related to a backing * device that requires folio contents to be held stable during writeback. * If so, then it will wait for any pending writeback to complete. * * Context: Sleeps. Must be called in process context and with * no spinlocks held. Caller should hold a reference on the folio. * If the folio is not locked, writeback may start again after writeback * has finished. */ void folio_wait_stable(struct folio *folio) { if (mapping_stable_writes(folio_mapping(folio))) folio_wait_writeback(folio); } EXPORT_SYMBOL_GPL(folio_wait_stable); |
| 251 251 251 251 251 251 251 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * * Based on the design of the Berkeley Packet Filter. The new * internal format has been designed by PLUMgrid: * * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com * * Authors: * * Jay Schulist <jschlst@samba.org> * Alexei Starovoitov <ast@plumgrid.com> * Daniel Borkmann <dborkman@redhat.com> * * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ #include <linux/atomic.h> #include <linux/bpf_verifier.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/sock_diag.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/gfp.h> #include <net/inet_common.h> #include <net/ip.h> #include <net/protocol.h> #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/skmsg.h> #include <net/sock.h> #include <net/flow_dissector.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <linux/unaligned.h> #include <linux/filter.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> #include <linux/if_vlan.h> #include <linux/bpf.h> #include <linux/btf.h> #include <net/sch_generic.h> #include <net/cls_cgroup.h> #include <net/dst_metadata.h> #include <net/dst.h> #include <net/sock_reuseport.h> #include <net/busy_poll.h> #include <net/tcp.h> #include <net/xfrm.h> #include <net/udp.h> #include <linux/bpf_trace.h> #include <net/xdp_sock.h> #include <linux/inetdevice.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/flow.h> #include <net/arp.h> #include <net/ipv6.h> #include <net/net_namespace.h> #include <linux/seg6_local.h> #include <net/seg6.h> #include <net/seg6_local.h> #include <net/lwtunnel.h> #include <net/ipv6_stubs.h> #include <net/bpf_sk_storage.h> #include <net/transp_v6.h> #include <linux/btf_ids.h> #include <net/tls.h> #include <net/xdp.h> #include <net/mptcp.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netkit.h> #include <linux/un.h> #include <net/xdp_sock_drv.h> #include <net/inet_dscp.h> #include "dev.h" /* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { if (in_compat_syscall()) { struct compat_sock_fprog f32; if (len != sizeof(f32)) return -EINVAL; if (copy_from_sockptr(&f32, src, sizeof(f32))) return -EFAULT; memset(dst, 0, sizeof(*dst)); dst->len = f32.len; dst->filter = compat_ptr(f32.filter); } else { if (len != sizeof(*dst)) return -EINVAL; if (copy_from_sockptr(dst, src, sizeof(*dst))) return -EFAULT; } return 0; } EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); /** * sk_filter_trim_cap - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter * @cap: limit on how short the eBPF program may trim the packet * * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level * wrapper to bpf_prog_run. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) { int err; struct sk_filter *filter; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); return -ENOMEM; } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); if (err) return err; err = security_sock_rcv_skb(sk, skb); if (err) return err; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { struct sock *save_sk = skb->sk; unsigned int pkt_len; skb->sk = sk; pkt_len = bpf_prog_run_save_cb(filter->prog, skb); skb->sk = save_sk; err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; } rcu_read_unlock(); return err; } EXPORT_SYMBOL(sk_filter_trim_cap); BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) { return skb_get_poff(skb); } BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; if (skb_is_nonlinear(skb)) return 0; if (skb->len < sizeof(struct nlattr)) return 0; if (a > skb->len - sizeof(struct nlattr)) return 0; nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); if (nla) return (void *) nla - (void *) skb->data; return 0; } BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; if (skb_is_nonlinear(skb)) return 0; if (skb->len < sizeof(struct nlattr)) return 0; if (a > skb->len - sizeof(struct nlattr)) return 0; nla = (struct nlattr *) &skb->data[a]; if (!nla_ok(nla, skb->len - a)) return 0; nla = nla_find_nested(nla, x); if (nla) return (void *) nla - (void *) skb->data; return 0; } BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { u8 tmp, *ptr; const int len = sizeof(tmp); if (offset >= 0) { if (headlen - offset >= len) return *(u8 *)(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return tmp; } else { ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); if (likely(ptr)) return *(u8 *)ptr; } return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, offset); } BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { __be16 tmp, *ptr; const int len = sizeof(tmp); if (offset >= 0) { if (headlen - offset >= len) return get_unaligned_be16(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return be16_to_cpu(tmp); } else { ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); if (likely(ptr)) return get_unaligned_be16(ptr); } return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, offset); } BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { __be32 tmp, *ptr; const int len = sizeof(tmp); if (likely(offset >= 0)) { if (headlen - offset >= len) return get_unaligned_be32(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return be32_to_cpu(tmp); } else { ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); if (likely(ptr)) return get_unaligned_be32(ptr); } return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, offset); } static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; switch (skb_field) { case SKF_AD_MARK: BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sk_buff, mark)); break; case SKF_AD_PKTTYPE: *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); #endif break; case SKF_AD_QUEUE: BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2); *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, queue_mapping)); break; case SKF_AD_VLAN_TAG: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); break; case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sk_buff, vlan_all)); *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1); break; } return insn - insn_buf; } static bool convert_bpf_extensions(struct sock_filter *fp, struct bpf_insn **insnp) { struct bpf_insn *insn = *insnp; u32 cnt; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2); /* A = *(u16 *) (CTX + offsetof(protocol)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, protocol)); /* A = ntohs(A) [emitting a nop or swap16] */ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PKTTYPE: cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_IFINDEX: case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); *insn++ = BPF_EXIT_INSN(); if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, offsetof(struct net_device, ifindex)); else *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, offsetof(struct net_device, type)); break; case SKF_AD_OFF + SKF_AD_MARK: cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_RXHASH: BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4); *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, hash)); break; case SKF_AD_OFF + SKF_AD_QUEUE: cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: cnt = convert_skb_access(SKF_AD_VLAN_TAG, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TPID: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2); /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, vlan_proto)); /* A = ntohs(A) [emitting a nop or swap16] */ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PAY_OFFSET: case SKF_AD_OFF + SKF_AD_NLATTR: case SKF_AD_OFF + SKF_AD_NLATTR_NEST: case SKF_AD_OFF + SKF_AD_CPU: case SKF_AD_OFF + SKF_AD_RANDOM: /* arg1 = CTX */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); /* arg2 = A */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); /* arg3 = X */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); /* Emit call(arg1=CTX, arg2=A, arg3=X) */ switch (fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); break; case SKF_AD_OFF + SKF_AD_NLATTR: *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); break; case SKF_AD_OFF + SKF_AD_NLATTR_NEST: *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); break; case SKF_AD_OFF + SKF_AD_CPU: *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); bpf_user_rnd_init_once(); break; } break; case SKF_AD_OFF + SKF_AD_ALU_XOR_X: /* A ^= X */ *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); break; default: /* This is just a dummy call to avoid letting the compiler * evict __bpf_call_base() as an optimization. Placed here * where no-one bothers. */ BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); return false; } *insnp = insn; return true; } static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) { const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); bool endian = BPF_SIZE(fp->code) == BPF_H || BPF_SIZE(fp->code) == BPF_W; bool indirect = BPF_MODE(fp->code) == BPF_IND; const int ip_align = NET_IP_ALIGN; struct bpf_insn *insn = *insnp; int offset = fp->k; if (!indirect && ((unaligned_ok && offset >= 0) || (!unaligned_ok && offset >= 0 && offset + ip_align >= 0 && offset + ip_align % size == 0))) { bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); if (offset) *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, offset); } else { *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset); *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_TMP, 0); } if (endian) *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); *insn++ = BPF_JMP_A(8); } *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); } else { *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); if (fp->k) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); } switch (BPF_SIZE(fp->code)) { case BPF_B: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); break; case BPF_H: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); break; case BPF_W: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); break; default: return false; } *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *insn = BPF_EXIT_INSN(); *insnp = insn; return true; } /** * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program * @seen_ld_abs: bool whether we've seen ld_abs/ind * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' * style extended BPF (eBPF). * Conversion workflow: * * 1) First pass for calculating the new program length: * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ static int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_prog *new_prog, int *new_len, bool *seen_ld_abs) { int new_flen = 0, pass = 0, target, i, stack_off; struct bpf_insn *new_insn, *first_insn = NULL; struct sock_filter *fp; int *addrs = NULL; u8 bpf_src; BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); if (len <= 0 || len > BPF_MAXINSNS) return -EINVAL; if (new_prog) { first_insn = new_prog->insnsi; addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL | __GFP_NOWARN); if (!addrs) return -ENOMEM; } do_pass: new_insn = first_insn; fp = prog; /* Classic BPF related prologue emission. */ if (new_prog) { /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. * In eBPF case it's done by the compiler, here we need to * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); if (*seen_ld_abs) { /* For packet access in classic BPF, cache skb->data * in callee-saved BPF R8 and skb->len - skb->data_len * (headlen) in BPF R9. Since classic BPF is read-only * on CTX, we only need to cache it once. */ *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), BPF_REG_D, BPF_REG_CTX, offsetof(struct sk_buff, data)); *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, offsetof(struct sk_buff, len)); *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, data_len)); *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) addrs[i] = new_insn - first_insn; switch (fp->code) { /* All arithmetic insns and skb loads map as-is. */ case BPF_ALU | BPF_ADD | BPF_X: case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_SUB | BPF_X: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_AND | BPF_X: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_OR | BPF_X: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_X: case BPF_ALU | BPF_RSH | BPF_K: case BPF_ALU | BPF_XOR | BPF_X: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_MOD | BPF_K: case BPF_ALU | BPF_NEG: case BPF_LD | BPF_ABS | BPF_W: case BPF_LD | BPF_ABS | BPF_H: case BPF_LD | BPF_ABS | BPF_B: case BPF_LD | BPF_IND | BPF_W: case BPF_LD | BPF_IND | BPF_H: case BPF_LD | BPF_IND | BPF_B: /* Check for overloaded BPF extension and * directly convert it if found, otherwise * just move on with mapping. */ if (BPF_CLASS(fp->code) == BPF_LD && BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; if (BPF_CLASS(fp->code) == BPF_LD && convert_bpf_ld_abs(fp, &insn)) { *seen_ld_abs = true; break; } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); /* Error with exception code on div/mod by 0. * For cBPF programs, this was always return 0. */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *insn++ = BPF_EXIT_INSN(); } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; /* Jump transformation cannot use BPF block macros * everywhere as offset calculation and target updates * require a bit more work than the rest, i.e. jump * opcodes map as-is, but offsets need adjustment. */ #define BPF_EMIT_JMP \ do { \ const s32 off_min = S16_MIN, off_max = S16_MAX; \ s32 off; \ \ if (target >= len || target < 0) \ goto err; \ off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ /* Adjust pc relative offset for 2nd or 3rd insn. */ \ off -= insn - tmp_insns; \ /* Reject anything not fitting into insn->off. */ \ if (off < off_min || off > off_max) \ goto err; \ insn->off = off; \ } while (0) case BPF_JMP | BPF_JA: target = i + fp->k + 1; insn->code = fp->code; BPF_EMIT_JMP; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { /* BPF immediates are signed, zero extend * immediate into tmp register and use it * in compare insn. */ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); insn->dst_reg = BPF_REG_A; insn->src_reg = BPF_REG_TMP; bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ if (fp->jf == 0) { insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; target = i + fp->jt + 1; BPF_EMIT_JMP; break; } /* Convert some jumps when 'jump_true' is next insn. */ if (fp->jt == 0) { switch (BPF_OP(fp->code)) { case BPF_JEQ: insn->code = BPF_JMP | BPF_JNE | bpf_src; break; case BPF_JGT: insn->code = BPF_JMP | BPF_JLE | bpf_src; break; case BPF_JGE: insn->code = BPF_JMP | BPF_JLT | bpf_src; break; default: goto jmp_rest; } target = i + fp->jf + 1; BPF_EMIT_JMP; break; } jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; BPF_EMIT_JMP; insn++; insn->code = BPF_JMP | BPF_JA; target = i + fp->jf + 1; BPF_EMIT_JMP; break; /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: { struct sock_filter tmp = { .code = BPF_LD | BPF_ABS | BPF_B, .k = fp->k, }; *seen_ld_abs = true; /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ convert_bpf_ld_abs(&tmp, &insn); insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); /* tmp = X */ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; } /* RET_K is remapped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: if (BPF_RVAL(fp->code) == BPF_K) *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, 0, fp->k); *insn = BPF_EXIT_INSN(); break; /* Store to stack. */ case BPF_ST: case BPF_STX: stack_off = fp->k * 4 + 4; *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == BPF_ST ? BPF_REG_A : BPF_REG_X, -stack_off); /* check_load_and_stores() verifies that classic BPF can * load from stack only after write, so tracking * stack_depth for ST|STX insns is enough */ if (new_prog && new_prog->aux->stack_depth < stack_off) new_prog->aux->stack_depth = stack_off; break; /* Load from stack. */ case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: stack_off = fp->k * 4 + 4; *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_FP, -stack_off); break; /* A = K or X = K */ case BPF_LD | BPF_IMM: case BPF_LDX | BPF_IMM: *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, fp->k); break; /* X = A */ case BPF_MISC | BPF_TAX: *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); break; /* A = X */ case BPF_MISC | BPF_TXA: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); break; /* A = skb->len or X = skb->len */ case BPF_LD | BPF_W | BPF_LEN: case BPF_LDX | BPF_W | BPF_LEN: *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_CTX, offsetof(struct sk_buff, len)); break; /* Access seccomp_data fields. */ case BPF_LDX | BPF_ABS | BPF_W: /* A = *(u32 *) (ctx + K) */ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); break; /* Unknown instruction. */ default: goto err; } insn++; if (new_prog) memcpy(new_insn, tmp_insns, sizeof(*insn) * (insn - tmp_insns)); new_insn += insn - tmp_insns; } if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; if (*seen_ld_abs) *new_len += 4; /* Prologue bits. */ return 0; } pass++; if (new_flen != new_insn - first_insn) { new_flen = new_insn - first_insn; if (pass > 2) goto err; goto do_pass; } kfree(addrs); BUG_ON(*new_len != new_flen); return 0; err: kfree(addrs); return -EINVAL; } /* Security: * * As we dont want to clear mem[] array for each packet going through * __bpf_prog_run(), we check that filter loaded by user never try to read * a cell if not previously written, and we check all branches to be sure * a malicious user doesn't try to abuse us. */ static int check_load_and_stores(const struct sock_filter *filter, int flen) { u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; BUILD_BUG_ON(BPF_MEMWORDS > 16); masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; memset(masks, 0xff, flen * sizeof(*masks)); for (pc = 0; pc < flen; pc++) { memvalid &= masks[pc]; switch (filter[pc].code) { case BPF_ST: case BPF_STX: memvalid |= (1 << filter[pc].k); break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: if (!(memvalid & (1 << filter[pc].k))) { ret = -EINVAL; goto error; } break; case BPF_JMP | BPF_JA: /* A jump must set masks on target */ masks[pc + 1 + filter[pc].k] &= memvalid; memvalid = ~0; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* A jump must set masks on targets */ masks[pc + 1 + filter[pc].jt] &= memvalid; masks[pc + 1 + filter[pc].jf] &= memvalid; memvalid = ~0; break; } } error: kfree(masks); return ret; } static bool chk_code_allowed(u16 code_to_probe) { static const bool codes[] = { /* 32 bit ALU operations */ [BPF_ALU | BPF_ADD | BPF_K] = true, [BPF_ALU | BPF_ADD | BPF_X] = true, [BPF_ALU | BPF_SUB | BPF_K] = true, [BPF_ALU | BPF_SUB | BPF_X] = true, [BPF_ALU | BPF_MUL | BPF_K] = true, [BPF_ALU | BPF_MUL | BPF_X] = true, [BPF_ALU | BPF_DIV | BPF_K] = true, [BPF_ALU | BPF_DIV | BPF_X] = true, [BPF_ALU | BPF_MOD | BPF_K] = true, [BPF_ALU | BPF_MOD | BPF_X] = true, [BPF_ALU | BPF_AND | BPF_K] = true, [BPF_ALU | BPF_AND | BPF_X] = true, [BPF_ALU | BPF_OR | BPF_K] = true, [BPF_ALU | BPF_OR | BPF_X] = true, [BPF_ALU | BPF_XOR | BPF_K] = true, [BPF_ALU | BPF_XOR | BPF_X] = true, [BPF_ALU | BPF_LSH | BPF_K] = true, [BPF_ALU | BPF_LSH | BPF_X] = true, [BPF_ALU | BPF_RSH | BPF_K] = true, [BPF_ALU | BPF_RSH | BPF_X] = true, [BPF_ALU | BPF_NEG] = true, /* Load instructions */ [BPF_LD | BPF_W | BPF_ABS] = true, [BPF_LD | BPF_H | BPF_ABS] = true, [BPF_LD | BPF_B | BPF_ABS] = true, [BPF_LD | BPF_W | BPF_LEN] = true, [BPF_LD | BPF_W | BPF_IND] = true, [BPF_LD | BPF_H | BPF_IND] = true, [BPF_LD | BPF_B | BPF_IND] = true, [BPF_LD | BPF_IMM] = true, [BPF_LD | BPF_MEM] = true, [BPF_LDX | BPF_W | BPF_LEN] = true, [BPF_LDX | BPF_B | BPF_MSH] = true, [BPF_LDX | BPF_IMM] = true, [BPF_LDX | BPF_MEM] = true, /* Store instructions */ [BPF_ST] = true, [BPF_STX] = true, /* Misc instructions */ [BPF_MISC | BPF_TAX] = true, [BPF_MISC | BPF_TXA] = true, /* Return instructions */ [BPF_RET | BPF_K] = true, [BPF_RET | BPF_A] = true, /* Jump instructions */ [BPF_JMP | BPF_JA] = true, [BPF_JMP | BPF_JEQ | BPF_K] = true, [BPF_JMP | BPF_JEQ | BPF_X] = true, [BPF_JMP | BPF_JGE | BPF_K] = true, [BPF_JMP | BPF_JGE | BPF_X] = true, [BPF_JMP | BPF_JGT | BPF_K] = true, [BPF_JMP | BPF_JGT | BPF_X] = true, [BPF_JMP | BPF_JSET | BPF_K] = true, [BPF_JMP | BPF_JSET | BPF_X] = true, }; if (code_to_probe >= ARRAY_SIZE(codes)) return false; return codes[code_to_probe]; } static bool bpf_check_basics_ok(const struct sock_filter *filter, unsigned int flen) { if (filter == NULL) return false; if (flen == 0 || flen > BPF_MAXINSNS) return false; return true; } /** * bpf_check_classic - verify socket filter code * @filter: filter to verify * @flen: length of filter * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! The filter must contain * no references or jumps that are out of range, no illegal * instructions, and must end with a RET instruction. * * All jumps are forward as they are not signed. * * Returns 0 if the rule set is legal or -EINVAL if not. */ static int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; /* May we actually operate on this code? */ if (!chk_code_allowed(ftest->code)) return -EINVAL; /* Some instructions need special checks */ switch (ftest->code) { case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_K: /* Check for division by zero */ if (ftest->k == 0) return -EINVAL; break; case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: if (ftest->k >= 32) return -EINVAL; break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: case BPF_ST: case BPF_STX: /* Check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; case BPF_JMP | BPF_JA: /* Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, * where offsets are limited. --ANK (981016) */ if (ftest->k >= (unsigned int)(flen - pc - 1)) return -EINVAL; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* Both conditionals must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: anc_found = false; if (bpf_anc_helper(ftest) & BPF_ANC) anc_found = true; /* Ancillary operation unknown or unsupported */ if (anc_found == false && ftest->k >= SKF_AD_OFF) return -EINVAL; } } /* Last instruction must be a RET code */ switch (filter[flen - 1].code) { case BPF_RET | BPF_K: case BPF_RET | BPF_A: return check_load_and_stores(filter, flen); } return -EINVAL; } static int bpf_prog_store_orig_filter(struct bpf_prog *fp, const struct sock_fprog *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); if (!fp->orig_prog) return -ENOMEM; fkprog = fp->orig_prog; fkprog->len = fprog->len; fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL | __GFP_NOWARN); if (!fkprog->filter) { kfree(fp->orig_prog); return -ENOMEM; } return 0; } static void bpf_release_orig_filter(struct bpf_prog *fp) { struct sock_fprog_kern *fprog = fp->orig_prog; if (fprog) { kfree(fprog->filter); kfree(fprog); } } static void __bpf_prog_release(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); } else { bpf_release_orig_filter(prog); bpf_prog_free(prog); } } static void __sk_filter_release(struct sk_filter *fp) { __bpf_prog_release(fp->prog); kfree(fp); } /** * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); __sk_filter_release(fp); } /** * sk_filter_release - release a socket filter * @fp: filter to remove * * Remove a filter from a socket and release its resources. */ static void sk_filter_release(struct sk_filter *fp) { if (refcount_dec_and_test(&fp->refcnt)) call_rcu(&fp->rcu, sk_filter_release_rcu); } void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); atomic_sub(filter_size, &sk->sk_omem_alloc); sk_filter_release(fp); } /* try to charge the socket memory if there is space available * return true on success */ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) { atomic_add(filter_size, &sk->sk_omem_alloc); return true; } return false; } bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { if (!refcount_inc_not_zero(&fp->refcnt)) return false; if (!__sk_filter_charge(sk, fp)) { sk_filter_release(fp); return false; } return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it won't be used at * this point in time anymore internally after the migration to the eBPF * instruction representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct bpf_insn)); /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd * pass. At this time, the user BPF is stored in fp->insns. */ old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter), GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; } /* 1st pass: calculate the new program length. */ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs); if (err) goto out_err_free; /* Expand fp for appending the new filter representation. */ old_fp = fp; fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. */ fp = old_fp; err = -ENOMEM; goto out_err_free; } fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ err = bpf_convert_filter(old_prog, old_len, fp, &new_len, &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, * that at this time old_fp has already been released * by krealloc(). */ goto out_err_free; fp = bpf_prog_select_runtime(fp, &err); if (err) goto out_err_free; kfree(old_prog); return fp; out_err_free: kfree(old_prog); out_err: __bpf_prog_release(fp); return ERR_PTR(err); } static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, bpf_aux_classic_check_t trans) { int err; fp->bpf_func = NULL; fp->jited = 0; err = bpf_check_classic(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } /* There might be additional checks and transformations * needed on classic filters, f.e. in case of seccomp. */ if (trans) { err = trans(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } } /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. */ bpf_jit_compile(fp); /* JIT compiler couldn't process this filter, so do the eBPF translation * for the optimized interpreter. */ if (!fp->jited) fp = bpf_migrate_filter(fp); return fp; } /** * bpf_prog_create - create an unattached filter * @pfp: the unattached filter that is created * @fprog: the filter program * * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. */ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; memcpy(fp->insns, fprog->filter, fsize); fp->len = fprog->len; /* Since unattached filters are not copied back to user * space through sk_get_filter(), we do not need to hold * a copy here, and can spare us the work. */ fp->orig_prog = NULL; /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ fp = bpf_prepare_filter(fp, NULL); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create); /** * bpf_prog_create_from_user - create an unattached filter from user buffer * @pfp: the unattached filter that is created * @fprog: the filter program * @trans: post-classic verifier transformation handler * @save_orig: save classic BPF program * * This function effectively does the same as bpf_prog_create(), only * that it builds up its insns buffer from user space provided buffer. * It also allows for passing a bpf_aux_classic_check_t handler. */ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; int err; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { __bpf_prog_free(fp); return -EFAULT; } fp->len = fprog->len; fp->orig_prog = NULL; if (save_orig) { err = bpf_prog_store_orig_filter(fp, fprog); if (err) { __bpf_prog_free(fp); return -ENOMEM; } } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ fp = bpf_prepare_filter(fp, trans); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); void bpf_prog_destroy(struct bpf_prog *fp) { __bpf_prog_release(fp); } EXPORT_SYMBOL_GPL(bpf_prog_destroy); static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) { struct sk_filter *fp, *old_fp; fp = kmalloc(sizeof(*fp), GFP_KERNEL); if (!fp) return -ENOMEM; fp->prog = prog; if (!__sk_filter_charge(sk, fp)) { kfree(fp); return -ENOMEM; } refcount_set(&fp->refcnt, 1); old_fp = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) sk_filter_uncharge(sk, old_fp); return 0; } static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *prog; int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return ERR_PTR(-EINVAL); prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!prog) return ERR_PTR(-ENOMEM); if (copy_from_user(prog->insns, fprog->filter, fsize)) { __bpf_prog_free(prog); return ERR_PTR(-EFAULT); } prog->len = fprog->len; err = bpf_prog_store_orig_filter(prog, fprog); if (err) { __bpf_prog_free(prog); return ERR_PTR(-ENOMEM); } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ return bpf_prepare_filter(prog, NULL); } /** * sk_attach_filter - attach a socket filter * @fprog: the filter program * @sk: the socket to use * * Attach the user's filter code. We first run some sanity checks on * it to make sure it does not explode on us later. If an error * occurs or there is insufficient memory for the filter a negative * errno code is returned. On success the return is zero. */ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { __bpf_prog_release(prog); return err; } return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err, optmem_max; if (IS_ERR(prog)) return PTR_ERR(prog); optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if (bpf_prog_size(prog->len) > optmem_max) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); if (err) __bpf_prog_release(prog); return err; } static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) { if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); } int sk_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog = __get_bpf(ufd, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { bpf_prog_put(prog); return err; } return 0; } int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog; int err, optmem_max; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); if (PTR_ERR(prog) == -EINVAL) prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { /* Like other non BPF_PROG_TYPE_SOCKET_FILTER * bpf prog (e.g. sockmap). It depends on the * limitation imposed by bpf_prog_load(). * Hence, sysctl_optmem_max is not checked. */ if ((sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) || (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_TCP) || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) { err = -ENOTSUPP; goto err_prog_put; } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if (bpf_prog_size(prog->len) > optmem_max) { err = -ENOMEM; goto err_prog_put; } } err = reuseport_attach_prog(sk, prog); err_prog_put: if (err) bpf_prog_put(prog); return err; } void sk_reuseport_prog_free(struct bpf_prog *prog) { if (!prog) return; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) bpf_prog_put(prog); else bpf_prog_destroy(prog); } static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { #ifdef CONFIG_DEBUG_NET /* Avoid a splat in pskb_may_pull_reason() */ if (write_len > INT_MAX) return -EINVAL; #endif return skb_ensure_writable(skb, write_len); } static inline int bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { int err = __bpf_try_make_writable(skb, write_len); bpf_compute_data_pointers(skb); return err; } static int bpf_try_make_head_writable(struct sk_buff *skb) { return bpf_try_make_writable(skb, skb_headlen(skb)); } static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); } static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); } BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len, u64, flags) { void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; if (unlikely(offset > INT_MAX)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; ptr = skb->data + offset; if (flags & BPF_F_RECOMPUTE_CSUM) __skb_postpull_rcsum(skb, ptr, len, offset); memcpy(ptr, from, len); if (flags & BPF_F_RECOMPUTE_CSUM) __skb_postpush_rcsum(skb, ptr, len, offset); if (flags & BPF_F_INVALIDATE_HASH) skb_clear_hash(skb); return 0; } static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .func = bpf_skb_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { return ____bpf_skb_store_bytes(skb, offset, from, len, flags); } BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, void *, to, u32, len) { void *ptr; if (unlikely(offset > INT_MAX)) goto err_clear; ptr = skb_header_pointer(skb, offset, len, to); if (unlikely(!ptr)) goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .func = bpf_skb_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) { return ____bpf_skb_load_bytes(skb, offset, to, len); } BPF_CALL_4(bpf_flow_dissector_load_bytes, const struct bpf_flow_dissector *, ctx, u32, offset, void *, to, u32, len) { void *ptr; if (unlikely(offset > 0xffff)) goto err_clear; if (unlikely(!ctx->skb)) goto err_clear; ptr = skb_header_pointer(ctx->skb, offset, len, to); if (unlikely(!ptr)) goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = { .func = bpf_flow_dissector_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { u8 *end = skb_tail_pointer(skb); u8 *start, *ptr; if (unlikely(offset > 0xffff)) goto err_clear; switch (start_header) { case BPF_HDR_START_MAC: if (unlikely(!skb_mac_header_was_set(skb))) goto err_clear; start = skb_mac_header(skb); break; case BPF_HDR_START_NET: start = skb_network_header(skb); break; default: goto err_clear; } ptr = start + offset; if (likely(ptr + len <= end)) { memcpy(to, ptr, len); return 0; } err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { .func = bpf_skb_load_bytes_relative, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); } static const struct bpf_func_proto bpf_skb_pull_data_proto = { .func = bpf_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) { return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; } static const struct bpf_func_proto bpf_sk_fullsock_proto = { .func = bpf_sk_fullsock, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb)); } static const struct bpf_func_proto sk_skb_pull_data_proto = { .func = sk_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; csum_replace_by_diff(ptr, to); break; case 2: csum_replace2(ptr, from, to); break; case 4: csum_replace4(ptr, from, to); break; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); if (is_mmzero && !do_mforce && !*ptr) return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; case 4: inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); break; default: return -EINVAL; } if (is_mmzero && !*ptr) *ptr = CSUM_MANGLED_0; return 0; } static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, __be32 *, to, u32, to_size, __wsum, seed) { /* This is quite flexible, some examples: * * from_size == 0, to_size > 0, seed := csum --> pushing data * from_size > 0, to_size == 0, seed := csum --> pulling data * from_size > 0, to_size > 0, seed := 0 --> diffing data * * Even for diffing, from_size and to_size don't need to be equal. */ __wsum ret = seed; if (from_size && to_size) ret = csum_sub(csum_partial(to, to_size, ret), csum_partial(from, from_size, 0)); else if (to_size) ret = csum_partial(to, to_size, ret); else if (from_size) ret = ~csum_partial(from, from_size, ~ret); return csum_from32to16((__force unsigned int)ret); } static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) { /* The interface is to be used in combination with bpf_csum_diff() * for direct packet writes. csum rotation for alignment as well * as emulating csum_sub() can be done from the eBPF program. */ if (skb->ip_summed == CHECKSUM_COMPLETE) return (skb->csum = csum_add(skb->csum, csum)); return -ENOTSUPP; } static const struct bpf_func_proto bpf_csum_update_proto = { .func = bpf_csum_update, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) { /* The interface is to be used in combination with bpf_skb_adjust_room() * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET * is passed as flags, for example. */ switch (level) { case BPF_CSUM_LEVEL_INC: __skb_incr_checksum_unnecessary(skb); break; case BPF_CSUM_LEVEL_DEC: __skb_decr_checksum_unnecessary(skb); break; case BPF_CSUM_LEVEL_RESET: __skb_reset_checksum_unnecessary(skb); break; case BPF_CSUM_LEVEL_QUERY: return skb->ip_summed == CHECKSUM_UNNECESSARY ? skb->csum_level : -EACCES; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_csum_level_proto = { .func = bpf_csum_level, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb_nomtu(dev, skb); } static inline int __bpf_rx_skb_no_mac(struct net_device *dev, struct sk_buff *skb) { int ret = ____dev_forward_skb(dev, skb, false); if (likely(!ret)) { skb->dev = dev; ret = netif_rx(skb); } return ret; } static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) { int ret; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); kfree_skb(skb); return -ENETDOWN; } skb->dev = dev; skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb)); skb_clear_tstamp(skb); dev_xmit_recursion_inc(); ret = dev_queue_xmit(skb); dev_xmit_recursion_dec(); return ret; } static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, u32 flags) { unsigned int mlen = skb_network_offset(skb); if (unlikely(skb->len <= mlen)) { kfree_skb(skb); return -ERANGE; } if (mlen) { __skb_pull(skb, mlen); /* At ingress, the mac header has already been pulled once. * At egress, skb_pospull_rcsum has to be done in case that * the skb is originated from ingress (i.e. a forwarded skb) * to ensure that rcsum starts at net header. */ if (!skb_at_tc_ingress(skb)) skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); } skb_pop_mac_header(skb); skb_reset_mac_len(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); } static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { /* Verify that a link layer header is carried */ if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) { kfree_skb(skb); return -ERANGE; } bpf_push_mac_rcsum(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); } static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, u32 flags) { if (dev_is_mac_header_xmit(dev)) return __bpf_redirect_common(skb, dev, flags); else return __bpf_redirect_no_mac(skb, dev, flags); } #if IS_ENABLED(CONFIG_IPV6) static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { u32 hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *nexthop; struct dst_entry *dst = NULL; struct neighbour *neigh; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); goto out_drop; } skb->dev = dev; skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } rcu_read_lock(); if (!nh) { dst = skb_dst(skb); nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); } else { nexthop = &nh->ipv6_nh; } neigh = ip_neigh_gw6(dev, nexthop); if (likely(!IS_ERR(neigh))) { int ret; sock_confirm_neigh(skb, neigh); local_bh_disable(); dev_xmit_recursion_inc(); ret = neigh_output(neigh, skb, false); dev_xmit_recursion_dec(); local_bh_enable(); rcu_read_unlock(); return ret; } rcu_read_unlock(); if (dst) IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; } static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; if (!nh) { struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_flags = FLOWI_FLAG_ANYSRC, .flowi6_mark = skb->mark, .flowlabel = ip6_flowinfo(ip6h), .flowi6_oif = dev->ifindex, .flowi6_proto = ip6h->nexthdr, .daddr = ip6h->daddr, .saddr = ip6h->saddr, }; dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst)) goto out_drop; skb_dst_set(skb, dst); } else if (nh->nh_family != AF_INET6) { goto out_drop; } err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; } #else static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_IPV6 */ #if IS_ENABLED(CONFIG_INET) static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { u32 hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); goto out_drop; } skb->dev = dev; skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } rcu_read_lock(); if (!nh) { struct rtable *rt = skb_rtable(skb); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); is_v6gw = true; } else if (nh->nh_family == AF_INET) { neigh = ip_neigh_gw4(dev, nh->ipv4_nh); } else { rcu_read_unlock(); goto out_drop; } if (likely(!IS_ERR(neigh))) { int ret; sock_confirm_neigh(skb, neigh); local_bh_disable(); dev_xmit_recursion_inc(); ret = neigh_output(neigh, skb, is_v6gw); dev_xmit_recursion_dec(); local_bh_enable(); rcu_read_unlock(); return ret; } rcu_read_unlock(); out_drop: kfree_skb(skb); return -ENETDOWN; } static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { const struct iphdr *ip4h = ip_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; if (!nh) { struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, .saddr = ip4h->saddr, }; struct rtable *rt; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto out_drop; if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); goto out_drop; } skb_dst_set(skb, &rt->dst); } err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; } #else static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_INET */ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { struct ethhdr *ethh = eth_hdr(skb); if (unlikely(skb->mac_header >= skb->network_header)) goto out; bpf_push_mac_rcsum(skb); if (is_multicast_ether_addr(ethh->h_dest)) goto out; skb_pull(skb, sizeof(*ethh)); skb_unset_mac_header(skb); skb_reset_network_header(skb); if (skb->protocol == htons(ETH_P_IP)) return __bpf_redirect_neigh_v4(skb, dev, nh); else if (skb->protocol == htons(ETH_P_IPV6)) return __bpf_redirect_neigh_v6(skb, dev, nh); out: kfree_skb(skb); return -ENOTSUPP; } /* Internal, non-exposed redirect flags. */ enum { BPF_F_NEIGH = (1ULL << 16), BPF_F_PEER = (1ULL << 17), BPF_F_NEXTHOP = (1ULL << 18), #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; struct sk_buff *clone; int ret; BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; clone = skb_clone(skb, GFP_ATOMIC); if (unlikely(!clone)) return -ENOMEM; /* For direct write, we need to keep the invariant that the skbs * we're dealing with need to be uncloned. Should uncloning fail * here, we need to free the just generated clone to unclone once * again. */ ret = bpf_try_make_head_writable(skb); if (unlikely(ret)) { kfree_skb(clone); return -ENOMEM; } return __bpf_redirect(clone, dev, flags); } static const struct bpf_func_proto bpf_clone_redirect_proto = { .func = bpf_clone_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static struct net_device *skb_get_peer_dev(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; if (likely(ops->ndo_get_peer_dev)) return INDIRECT_CALL_1(ops->ndo_get_peer_dev, netkit_peer_dev, dev); return NULL; } int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; dev = dev_get_by_index_rcu(net, ri->tgt_index); ri->tgt_index = 0; ri->flags = 0; if (unlikely(!dev)) goto out_drop; if (flags & BPF_F_PEER) { if (unlikely(!skb_at_tc_ingress(skb))) goto out_drop; dev = skb_get_peer_dev(dev); if (unlikely(!dev || !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); return -EAGAIN; } return flags & BPF_F_NEIGH ? __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) : __bpf_redirect(skb, dev, flags); out_drop: kfree_skb(skb); return -EINVAL; } BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return TC_ACT_SHOT; ri->flags = flags; ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return TC_ACT_SHOT; ri->flags = BPF_F_PEER; ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_peer_proto = { .func = bpf_redirect_peer, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, int, plen, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); ri->tgt_index = ifindex; BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); if (plen) memcpy(&ri->nh, params, sizeof(ri->nh)); return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_neigh_proto = { .func = bpf_redirect_neigh, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) { msg->apply_bytes = bytes; return 0; } static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .func = bpf_msg_apply_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) { msg->cork_bytes = bytes; return 0; } static void sk_msg_reset_curr(struct sk_msg *msg) { if (!msg->sg.size) { msg->sg.curr = msg->sg.start; msg->sg.copybreak = 0; } else { u32 i = msg->sg.end; sk_msg_iter_var_prev(i); msg->sg.curr = i; msg->sg.copybreak = msg->sg.data[i].length; } } static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .func = bpf_msg_cork_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, u32, end, u64, flags) { u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start; u32 first_sge, last_sge, i, shift, bytes_sg_total; struct scatterlist *sge; u8 *raw, *to, *from; struct page *page; if (unlikely(flags || end <= start)) return -EINVAL; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += len; len = sk_msg_elem(msg, i)->length; if (start < offset + len) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); if (unlikely(start >= offset + len)) return -EINVAL; first_sge = i; /* The start may point into the sg element so we need to also * account for the headroom. */ bytes_sg_total = start - offset + bytes; if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist * elements or a single shared page. Either way we need to * copy into a linear buffer exclusively owned by BPF. Then * place the buffer in the scatterlist and fixup the original * entries by removing the entries now in the linear buffer * and shifting the remaining entries. For now we do not try * to copy partial entries to avoid complexity of running out * of sg_entry slots. The downside is reading a single byte * will copy the entire sg entry. */ do { copy += sk_msg_elem(msg, i)->length; sk_msg_iter_var_next(i); if (bytes_sg_total <= copy) break; } while (i != msg->sg.end); last_sge = i; if (unlikely(bytes_sg_total > copy)) return -EINVAL; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy)); if (unlikely(!page)) return -ENOMEM; raw = page_address(page); i = first_sge; do { sge = sk_msg_elem(msg, i); from = sg_virt(sge); len = sge->length; to = raw + poffset; memcpy(to, from, len); poffset += len; sge->length = 0; put_page(sg_page(sge)); sk_msg_iter_var_next(i); } while (i != last_sge); sg_set_page(&msg->sg.data[first_sge], page, copy, 0); /* To repair sg ring we need to shift entries. If we only * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ WARN_ON_ONCE(last_sge == first_sge); shift = last_sge > first_sge ? last_sge - first_sge - 1 : NR_MSG_FRAG_IDS - first_sge + last_sge - 1; if (!shift) goto out; i = first_sge; sk_msg_iter_var_next(i); do { u32 move_from; if (i + shift >= NR_MSG_FRAG_IDS) move_from = i + shift - NR_MSG_FRAG_IDS; else move_from = i + shift; if (move_from == msg->sg.end) break; msg->sg.data[i] = msg->sg.data[move_from]; msg->sg.data[move_from].length = 0; msg->sg.data[move_from].page_link = 0; msg->sg.data[move_from].offset = 0; sk_msg_iter_var_next(i); } while (1); msg->sg.end = msg->sg.end - shift > msg->sg.end ? msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; out: sk_msg_reset_curr(msg); msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; msg->data_end = msg->data + bytes; return 0; } static const struct bpf_func_proto bpf_msg_pull_data_proto = { .func = bpf_msg_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; u32 new, i = 0, l = 0, space, copy = 0, offset = 0; u8 *raw, *to, *from; struct page *page; if (unlikely(flags)) return -EINVAL; if (unlikely(len == 0)) return 0; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); if (start > offset + l) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); /* If no space available will fallback to copy, we need at * least one scatterlist elem available to push data into * when start aligns to the beginning of an element or two * when it falls inside an element. We handle the start equals * offset case because its the common case for inserting a * header. */ if (!space || (space == 1 && start != offset)) copy = msg->sg.data[i].length; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy + len)); if (unlikely(!page)) return -ENOMEM; if (copy) { int front, back; raw = page_address(page); if (i == msg->sg.end) sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); front = start - offset; back = psge->length - front; from = sg_virt(psge); if (front) memcpy(raw, from, front); if (back) { from += front; to = raw + front + len; memcpy(to, from, back); } put_page(sg_page(psge)); new = i; goto place_new; } if (start - offset) { if (i == msg->sg.end) sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); psge->length = start - offset; rsge.length -= psge->length; rsge.offset += start; sk_msg_iter_var_next(i); sg_unmark_end(psge); sg_unmark_end(&rsge); } /* Slot(s) to place newly allocated data */ sk_msg_iter_next(msg, end); new = i; sk_msg_iter_var_next(i); if (i == msg->sg.end) { if (!rsge.length) goto place_new; sk_msg_iter_next(msg, end); goto place_new; } /* Shift one or two slots as needed */ sge = sk_msg_elem_cpy(msg, new); sg_unmark_end(&sge); nsge = sk_msg_elem_cpy(msg, i); if (rsge.length) { sk_msg_iter_var_next(i); nnsge = sk_msg_elem_cpy(msg, i); sk_msg_iter_next(msg, end); } while (i != msg->sg.end) { msg->sg.data[i] = sge; sge = nsge; sk_msg_iter_var_next(i); if (rsge.length) { nsge = nnsge; nnsge = sk_msg_elem_cpy(msg, i); } else { nsge = sk_msg_elem_cpy(msg, i); } } place_new: /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; __clear_bit(new, msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); sk_msg_iter_var_next(new); msg->sg.data[new] = rsge; } sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } static const struct bpf_func_proto bpf_msg_push_data_proto = { .func = bpf_msg_push_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; static void sk_msg_shift_left(struct sk_msg *msg, int i) { struct scatterlist *sge = sk_msg_elem(msg, i); int prev; put_page(sg_page(sge)); do { prev = i; sk_msg_iter_var_next(i); msg->sg.data[prev] = msg->sg.data[i]; } while (i != msg->sg.end); sk_msg_iter_prev(msg, end); } static void sk_msg_shift_right(struct sk_msg *msg, int i) { struct scatterlist tmp, sge; sk_msg_iter_next(msg, end); sge = sk_msg_elem_cpy(msg, i); sk_msg_iter_var_next(i); tmp = sk_msg_elem_cpy(msg, i); while (i != msg->sg.end) { msg->sg.data[i] = sge; sk_msg_iter_var_next(i); sge = tmp; tmp = sk_msg_elem_cpy(msg, i); } } BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { u32 i = 0, l = 0, space, offset = 0; u64 last = start + len; int pop; if (unlikely(flags)) return -EINVAL; if (unlikely(len == 0)) return 0; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); /* Bounds checks: start and pop must be inside message */ if (start >= offset + l || last > msg->sg.size) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); pop = len; /* --------------| offset * -| start |-------- len -------| * * |----- a ----|-------- pop -------|----- b ----| * |______________________________________________| length * * * a: region at front of scatter element to save * b: region at back of scatter element to save when length > A + pop * pop: region to pop from element, same as input 'pop' here will be * decremented below per iteration. * * Two top-level cases to handle when start != offset, first B is non * zero and second B is zero corresponding to when a pop includes more * than one element. * * Then if B is non-zero AND there is no space allocate space and * compact A, B regions into page. If there is space shift ring to * the right free'ing the next element in ring to place B, leaving * A untouched except to reduce length. */ if (start != offset) { struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); int a = start - offset; int b = sge->length - pop - a; sk_msg_iter_var_next(i); if (b > 0) { if (space) { sge->length = a; sk_msg_shift_right(msg, i); nsge = sk_msg_elem(msg, i); get_page(sg_page(sge)); sg_set_page(nsge, sg_page(sge), b, sge->offset + pop + a); } else { struct page *page, *orig; u8 *to, *from; page = alloc_pages(__GFP_NOWARN | __GFP_COMP | GFP_ATOMIC, get_order(a + b)); if (unlikely(!page)) return -ENOMEM; orig = sg_page(sge); from = sg_virt(sge); to = page_address(page); memcpy(to, from, a); memcpy(to + a, from + a + pop, b); sg_set_page(sge, page, a + b, 0); put_page(orig); } pop = 0; } else { pop -= (sge->length - a); sge->length = a; } } /* From above the current layout _must_ be as follows, * * -| offset * -| start * * |---- pop ---|---------------- b ------------| * |____________________________________________| length * * Offset and start of the current msg elem are equal because in the * previous case we handled offset != start and either consumed the * entire element and advanced to the next element OR pop == 0. * * Two cases to handle here are first pop is less than the length * leaving some remainder b above. Simply adjust the element's layout * in this case. Or pop >= length of the element so that b = 0. In this * case advance to next element decrementing pop. */ while (pop) { struct scatterlist *sge = sk_msg_elem(msg, i); if (pop < sge->length) { sge->length -= pop; sge->offset += pop; pop = 0; } else { pop -= sge->length; sk_msg_shift_left(msg, i); } } sk_mem_uncharge(msg->sk, len - pop); msg->sg.size -= (len - pop); sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } static const struct bpf_func_proto bpf_msg_pop_data_proto = { .func = bpf_msg_pop_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; #ifdef CONFIG_CGROUP_NET_CLASSID BPF_CALL_0(bpf_get_cgroup_classid_curr) { return __task_get_classid(current); } const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .func = bpf_get_cgroup_classid_curr, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return 0; return sock_cgroup_classid(&sk->sk_cgrp_data); } static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = { .func = bpf_skb_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; #endif BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .func = bpf_get_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { .func = bpf_get_route_realm, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. Later access to hash * can then use the inline skb->hash via context directly * instead of calling this helper again. */ return skb_get_hash(skb); } static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .func = bpf_get_hash_recalc, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) { /* After all direct packet write, this can be used once for * triggering a lazy recalc on next skb_get_hash() invocation. */ skb_clear_hash(skb); return 0; } static const struct bpf_func_proto bpf_set_hash_invalid_proto = { .func = bpf_set_hash_invalid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) { /* Set user specified hash as L4(+), so that it gets returned * on skb_get_hash() call unless BPF prog later on triggers a * skb_clear_hash(). */ __skb_set_sw_hash(skb, hash, true); return 0; } static const struct bpf_func_proto bpf_set_hash_proto = { .func = bpf_set_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && vlan_proto != htons(ETH_P_8021AD))) vlan_proto = htons(ETH_P_8021Q); bpf_push_mac_rcsum(skb); ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); skb_reset_mac_len(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_vlan_push_proto = { .func = bpf_skb_vlan_push, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { int ret; bpf_push_mac_rcsum(skb); ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .func = bpf_skb_vlan_pop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { /* Caller already did skb_cow() with len as headroom, * so no need to do it here. */ skb_push(skb, len); memmove(skb->data, skb->data + len, off); memset(skb->data + off, 0, len); /* No skb_postpush_rcsum(skb, skb->data + off, len) * needed here as it does not change the skb->csum * result for checksum complete when summing over * zeroed blocks. */ return 0; } static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) { void *old_data; /* skb_ensure_writable() is not needed here, as we're * already working on an uncloned skb. */ if (unlikely(!pskb_may_pull(skb, off + len))) return -ENOMEM; old_data = skb->data; __skb_pull(skb, len); skb_postpull_rcsum(skb, old_data + off, len); memmove(skb->data, old_data, off); return 0; } static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) { bool trans_same = skb->transport_header == skb->network_header; int ret; /* There's no need for __skb_push()/__skb_pull() pair to * get to the start of the mac header as we're guaranteed * to always start from here under eBPF. */ ret = bpf_skb_generic_push(skb, off, len); if (likely(!ret)) { skb->mac_header -= len; skb->network_header -= len; if (trans_same) skb->transport_header = skb->network_header; } return ret; } static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) { bool trans_same = skb->transport_header == skb->network_header; int ret; /* Same here, __skb_push()/__skb_pull() pair not needed. */ ret = bpf_skb_generic_pop(skb, off, len); if (likely(!ret)) { skb->mac_header += len; skb->network_header += len; if (trans_same) skb->transport_header = skb->network_header; } return ret; } static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); int ret; ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */ if (shinfo->gso_type & SKB_GSO_TCPV4) { shinfo->gso_type &= ~SKB_GSO_TCPV4; shinfo->gso_type |= SKB_GSO_TCPV6; } } skb->protocol = htons(ETH_P_IPV6); skb_clear_hash(skb); return 0; } static int bpf_skb_proto_6_to_4(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); int ret; ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_pop(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */ if (shinfo->gso_type & SKB_GSO_TCPV6) { shinfo->gso_type &= ~SKB_GSO_TCPV6; shinfo->gso_type |= SKB_GSO_TCPV4; } } skb->protocol = htons(ETH_P_IP); skb_clear_hash(skb); return 0; } static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) { __be16 from_proto = skb->protocol; if (from_proto == htons(ETH_P_IP) && to_proto == htons(ETH_P_IPV6)) return bpf_skb_proto_4_to_6(skb); if (from_proto == htons(ETH_P_IPV6) && to_proto == htons(ETH_P_IP)) return bpf_skb_proto_6_to_4(skb); return -ENOTSUPP; } BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, u64, flags) { int ret; if (unlikely(flags)) return -EINVAL; /* General idea is that this helper does the basic groundwork * needed for changing the protocol, and eBPF program fills the * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() * and other helpers, rather than passing a raw buffer here. * * The rationale is to keep this minimal and without a need to * deal with raw packet data. F.e. even if we would pass buffers * here, the program still needs to call the bpf_lX_csum_replace() * helpers anyway. Plus, this way we keep also separation of * concerns, since f.e. bpf_skb_store_bytes() should only take * care of stores. * * Currently, additional options and extension header space are * not supported, but flags register is reserved so we can adapt * that. For offloads, we mark packet as dodgy, so that headers * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_proto_proto = { .func = bpf_skb_change_proto, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) { /* We only allow a restricted subset to be changed for now. */ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || !skb_pkt_type_ok(pkt_type))) return -EINVAL; skb->pkt_type = pkt_type; return 0; } static const struct bpf_func_proto bpf_skb_change_type_proto = { .func = bpf_skb_change_type, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static u32 bpf_skb_net_base_len(const struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): return sizeof(struct iphdr); case htons(ETH_P_IPV6): return sizeof(struct ipv6hdr); default: return ~0U; } } #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) #define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_DECAP_L3_IPV6) #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ BPF_F_ADJ_ROOM_DECAP_L3_MASK) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; unsigned int gso_type = SKB_GSO_DODGY; int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) return -ENOTSUPP; } ret = skb_cow_head(skb, len_diff); if (unlikely(ret < 0)) return ret; if (encap) { if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -ENOTSUPP; if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) return -EINVAL; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE && flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && inner_mac_len < ETH_HLEN) return -EINVAL; if (skb->encapsulation) return -EALREADY; mac_len = skb->network_header - skb->mac_header; inner_net = skb->network_header; if (inner_mac_len > len_diff) return -EINVAL; inner_trans = skb->transport_header; } ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (encap) { skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) skb_set_inner_protocol(skb, htons(ETH_P_TEB)); else skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) gso_type |= SKB_GSO_UDP_TUNNEL; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) gso_type |= SKB_GSO_GRE; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) gso_type |= SKB_GSO_IPXIP6; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) gso_type |= SKB_GSO_IPXIP4; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) { int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr); skb_set_transport_header(skb, mac_len + nh_len); } /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) skb->protocol = htons(ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) skb->protocol = htons(ETH_P_IP); } if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; /* Due to header growth, MSS needs to be downgraded. * There is a BUG_ON() when segmenting the frag_list with * head_frag true, so linearize the skb after downgrading * the MSS. */ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { skb_decrease_gso_size(shinfo, len_diff); if (shinfo->frag_list) return skb_linearize(skb); } } return 0; } static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_DECAP_L3_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) return -ENOTSUPP; } ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_pop(skb, off, len_diff); if (unlikely(ret < 0)) return ret; /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) skb->protocol = htons(ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) skb->protocol = htons(ETH_P_IP); if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header shrink, MSS can be upgraded. */ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } return 0; } #define BPF_SKB_MAX_LEN SKB_MAX_ALLOC BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { u32 len_diff_abs = abs(len_diff); bool shrink = len_diff < 0; int ret = 0; if (unlikely(flags || mode)) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (!shrink) { ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; __skb_push(skb, len_diff_abs); memset(skb->data, 0, len_diff_abs); } else { if (unlikely(!pskb_may_pull(skb, len_diff_abs))) return -ENOMEM; __skb_pull(skb, len_diff_abs); } if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); rxm->full_len += len_diff; } return ret; } static const struct bpf_func_proto sk_skb_adjust_room_proto = { .func = sk_skb_adjust_room, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); u32 len_max = BPF_SKB_MAX_LEN; __be16 proto = skb->protocol; bool shrink = len_diff < 0; u32 off; int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (unlikely(proto != htons(ETH_P_IP) && proto != htons(ETH_P_IPV6))) return -ENOTSUPP; off = skb_mac_header_len(skb); switch (mode) { case BPF_ADJ_ROOM_NET: off += bpf_skb_net_base_len(skb); break; case BPF_ADJ_ROOM_MAC: break; default: return -ENOTSUPP; } if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { if (!shrink) return -EINVAL; switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: len_min = sizeof(struct iphdr); break; case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: len_min = sizeof(struct ipv6hdr); break; default: return -EINVAL; } } len_cur = skb->len - skb_network_offset(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || (!shrink && (skb->len + len_diff_abs > len_max && !skb_is_gso(skb)))) return -ENOTSUPP; ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_adjust_room_proto = { .func = bpf_skb_adjust_room, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; static u32 __bpf_skb_min_len(const struct sk_buff *skb) { int offset = skb_network_offset(skb); u32 min_len = 0; if (offset > 0) min_len = offset; if (skb_transport_header_was_set(skb)) { offset = skb_transport_offset(skb); if (offset > 0) min_len = offset; } if (skb->ip_summed == CHECKSUM_PARTIAL) { offset = skb_checksum_start_offset(skb) + skb->csum_offset + sizeof(__sum16); if (offset > 0) min_len = offset; } return min_len; } static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) { unsigned int old_len = skb->len; int ret; ret = __skb_grow_rcsum(skb, new_len); if (!ret) memset(skb->data + old_len, 0, new_len - old_len); return ret; } static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) { return __skb_trim_rcsum(skb, new_len); } static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 min_len = __bpf_skb_min_len(skb); int ret; if (unlikely(flags || new_len > max_len || new_len < min_len)) return -EINVAL; if (skb->encapsulation) return -ENOTSUPP; /* The basic idea of this helper is that it's performing the * needed work to either grow or trim an skb, and eBPF program * rewrites the rest via helpers like bpf_skb_store_bytes(), * bpf_lX_csum_replace() and others rather than passing a raw * buffer here. This one is a slow path helper and intended * for replies with control messages. * * Like in bpf_skb_change_proto(), we want to keep this rather * minimal and without protocol specifics so that we are able * to separate concerns as in bpf_skb_store_bytes() should only * be the one responsible for writing buffers. * * It's really expected to be a slow path operation here for * control message replies, so we're implicitly linearizing, * uncloning and drop offloads from the skb by this. */ ret = __bpf_try_make_writable(skb, skb->len); if (!ret) { if (new_len > skb->len) ret = bpf_skb_grow_rcsum(skb, new_len); else if (new_len < skb->len) ret = bpf_skb_trim_rcsum(skb, new_len); if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } return ret; } BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_tail_proto = { .func = bpf_skb_change_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { .func = sk_skb_change_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || new_len < skb->len)) return -EINVAL; ret = skb_cow(skb, head_room); if (likely(!ret)) { /* Idea for this helper is that we currently only * allow to expand on mac header. This means that * skb->protocol network header, etc, stay as is. * Compared to bpf_skb_change_tail(), we're more * flexible due to not needing to linearize or * reset GSO. Intention for this helper is to be * used by an L3 skb that needs to push mac header * for redirection into L2 device. */ __skb_push(skb, head_room); memset(skb->data, 0, head_room); skb_reset_mac_header(skb); skb_reset_mac_len(skb); } return ret; } BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { int ret = __bpf_skb_change_head(skb, head_room, flags); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_head_proto = { .func = bpf_skb_change_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { .func = sk_skb_change_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) { return xdp_get_buff_len(xdp); } static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { .func = bpf_xdp_get_buff_len, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = { .func = bpf_xdp_get_buff_len, .gpl_only = false, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0], }; static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 0 : xdp->data - xdp->data_meta; } BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); unsigned long metalen = xdp_get_metalen(xdp); void *data_start = xdp_frame_end + metalen; void *data = xdp->data + offset; if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; if (metalen) memmove(xdp->data_meta + offset, xdp->data_meta, metalen); xdp->data_meta += offset; xdp->data = data; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .func = bpf_xdp_adjust_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush) { unsigned long ptr_len, ptr_off = 0; skb_frag_t *next_frag, *end_frag; struct skb_shared_info *sinfo; void *src, *dst; u8 *ptr_buf; if (likely(xdp->data_end - xdp->data >= off + len)) { src = flush ? buf : xdp->data + off; dst = flush ? xdp->data + off : buf; memcpy(dst, src, len); return; } sinfo = xdp_get_shared_info_from_buff(xdp); end_frag = &sinfo->frags[sinfo->nr_frags]; next_frag = &sinfo->frags[0]; ptr_len = xdp->data_end - xdp->data; ptr_buf = xdp->data; while (true) { if (off < ptr_off + ptr_len) { unsigned long copy_off = off - ptr_off; unsigned long copy_len = min(len, ptr_len - copy_off); src = flush ? buf : ptr_buf + copy_off; dst = flush ? ptr_buf + copy_off : buf; memcpy(dst, src, copy_len); off += copy_len; len -= copy_len; buf += copy_len; } if (!len || next_frag == end_frag) break; ptr_off += ptr_len; ptr_buf = skb_frag_address(next_frag); ptr_len = skb_frag_size(next_frag); next_frag++; } } void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { u32 size = xdp->data_end - xdp->data; struct skb_shared_info *sinfo; void *addr = xdp->data; int i; if (unlikely(offset > 0xffff || len > 0xffff)) return ERR_PTR(-EFAULT); if (unlikely(offset + len > xdp_get_buff_len(xdp))) return ERR_PTR(-EINVAL); if (likely(offset < size)) /* linear area */ goto out; sinfo = xdp_get_shared_info_from_buff(xdp); offset -= size; for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ u32 frag_size = skb_frag_size(&sinfo->frags[i]); if (offset < frag_size) { addr = skb_frag_address(&sinfo->frags[i]); size = frag_size; break; } offset -= frag_size; } out: return offset + len <= size ? addr + offset : NULL; } BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { void *ptr; ptr = bpf_xdp_pointer(xdp, offset, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); if (!ptr) bpf_xdp_copy_buf(xdp, offset, buf, len, false); else memcpy(buf, ptr, len); return 0; } static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { .func = bpf_xdp_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return ____bpf_xdp_load_bytes(xdp, offset, buf, len); } BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { void *ptr; ptr = bpf_xdp_pointer(xdp, offset, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); if (!ptr) bpf_xdp_copy_buf(xdp, offset, buf, len, true); else memcpy(ptr, buf, len); return 0; } static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { .func = bpf_xdp_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return ____bpf_xdp_store_bytes(xdp, offset, buf, len); } static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; struct xdp_rxq_info *rxq = xdp->rxq; unsigned int tailroom; if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); if (unlikely(offset > tailroom)) return -EINVAL; memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); skb_frag_size_add(frag, offset); sinfo->xdp_frags_size += offset; if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) xsk_buff_get_tail(xdp)->data_end += offset; return 0; } static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, enum xdp_mem_type mem_type, bool release) { struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); if (release) { xsk_buff_del_tail(zc_frag); __xdp_return(0, mem_type, false, zc_frag); } else { zc_frag->data_end -= shrink; } } static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) { enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); goto out; } if (release) __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); out: return release; } static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); int i, n_frags_free = 0, len_free = 0; if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN)) return -EINVAL; for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) { skb_frag_t *frag = &sinfo->frags[i]; int shrink = min_t(int, offset, skb_frag_size(frag)); len_free += shrink; offset -= shrink; if (bpf_xdp_shrink_data(xdp, frag, shrink)) { n_frags_free++; } else { skb_frag_size_sub(frag, shrink); break; } } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free; if (unlikely(!sinfo->nr_frags)) { xdp_buff_clear_frags_flag(xdp); xdp->data_end -= offset; } return 0; } BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */ if (offset < 0) return bpf_xdp_frags_shrink_tail(xdp, -offset); return bpf_xdp_frags_increase_tail(xdp, offset); } /* Notice that xdp_data_hard_end have reserved some tailroom */ if (unlikely(data_end > data_hard_end)) return -EINVAL; if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; /* Clear memory area on grow, can contain uninit kernel memory */ if (offset > 0) memset(xdp->data_end, 0, offset); xdp->data_end = data_end; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { .func = bpf_xdp_adjust_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) { void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); void *meta = xdp->data_meta + offset; unsigned long metalen = xdp->data - meta; if (xdp_data_meta_unsupported(xdp)) return -ENOTSUPP; if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; if (unlikely(xdp_metalen_invalid(metalen))) return -EACCES; xdp->data_meta = meta; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .func = bpf_xdp_adjust_meta, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; /** * DOC: xdp redirect * * XDP_REDIRECT works by a three-step process, implemented in the functions * below: * * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target * of the redirect and store it (along with some other metadata) in a per-CPU * struct bpf_redirect_info. * * 2. When the program returns the XDP_REDIRECT return code, the driver will * call xdp_do_redirect() which will use the information in struct * bpf_redirect_info to actually enqueue the frame into a map type-specific * bulk queue structure. * * 3. Before exiting its NAPI poll loop, the driver will call * xdp_do_flush(), which will flush all the different bulk queues, * thus completing the redirect. Note that xdp_do_flush() must be * called before napi_complete_done() in the driver, as the * XDP_REDIRECT logic relies on being inside a single NAPI instance * through to the xdp_do_flush() call for RCU protection of all * in-kernel data structures. */ /* * Pointers to the map entries will be kept around for this whole sequence of * steps, protected by RCU. However, there is no top-level rcu_read_lock() in * the core code; instead, the RCU protection relies on everything happening * inside a single NAPI poll sequence, which means it's between a pair of calls * to local_bh_disable()/local_bh_enable(). * * The map entries are marked as __rcu and the map code makes sure to * dereference those pointers with rcu_dereference_check() in a way that works * for both sections that to hold an rcu_read_lock() and sections that are * called from NAPI without a separate rcu_read_lock(). The code below does not * use RCU annotations, but relies on those in the map code. */ void xdp_do_flush(void) { struct list_head *lh_map, *lh_dev, *lh_xsk; bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); if (lh_dev) __dev_flush(lh_dev); if (lh_map) __cpu_map_flush(lh_map); if (lh_xsk) __xsk_map_flush(lh_xsk); } EXPORT_SYMBOL_GPL(xdp_do_flush); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) void xdp_do_check_flushed(struct napi_struct *napi) { struct list_head *lh_map, *lh_dev, *lh_xsk; bool missed = false; bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); if (lh_dev) { __dev_flush(lh_dev); missed = true; } if (lh_map) { __cpu_map_flush(lh_map); missed = true; } if (lh_xsk) { __xsk_map_flush(lh_xsk); missed = true; } WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n", napi->poll); } #endif DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net_device *master, *slave; master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); if (slave && slave != xdp->rxq->dev) { /* The target device is different from the receiving device, so * redirect it to the new device. * Using XDP_REDIRECT gets the correct behaviour from XDP enabled * drivers to unmap the packet from their rx ring. */ ri->tgt_index = slave->ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } return XDP_TX; } EXPORT_SYMBOL_GPL(xdp_master_redirect); static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, const struct net_device *dev, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; err = __xsk_map_redirect(fwd, xdp); if (unlikely(err)) goto err; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, struct xdp_frame *xdpf, const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; u32 flags = ri->flags; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (unlikely(!xdpf)) { err = -EOVERFLOW; goto err; } switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: if (unlikely(flags & BPF_F_BROADCAST)) { map = READ_ONCE(ri->map); /* The map pointer is cleared when the map is being torn * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; break; } WRITE_ONCE(ri->map, NULL); err = dev_map_enqueue_multi(xdpf, dev, map, flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_enqueue(fwd, xdpf, dev); } break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_enqueue(fwd, xdpf, dev); break; case BPF_MAP_TYPE_UNSPEC: if (map_id == INT_MAX) { fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); if (unlikely(!fwd)) { err = -EINVAL; break; } err = dev_xdp_enqueue(fwd, xdpf, dev); break; } fallthrough; default: err = -EBADRQC; } if (unlikely(err)) goto err; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), xdp_prog); } EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog); } EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog, void *fwd, enum bpf_map_type map_type, u32 map_id, u32 flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; int err; switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: if (unlikely(flags & BPF_F_BROADCAST)) { map = READ_ONCE(ri->map); /* The map pointer is cleared when the map is being torn * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; break; } WRITE_ONCE(ri->map, NULL); err = dev_map_redirect_multi(dev, skb, xdp_prog, map, flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_generic_redirect(fwd, skb, xdp_prog); } if (unlikely(err)) goto err; break; case BPF_MAP_TYPE_XSKMAP: err = xsk_generic_rcv(fwd, xdp); if (err) goto err; consume_skb(skb); break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_generic_redirect(fwd, skb); if (unlikely(err)) goto err; break; default: err = -EBADRQC; goto err; } _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; u32 flags = ri->flags; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); if (unlikely(!fwd)) { err = -EINVAL; goto err; } err = xdp_ok_fwd_dev(fwd, skb->len); if (unlikely(err)) goto err; skb->dev = fwd; _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); generic_xdp_tx(skb, xdp_prog); return 0; } return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags); err: _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; } BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return XDP_ABORTED; /* NB! Map type UNSPEC and map_id == INT_MAX (never generated * by map_idr) is used for ifindex based XDP redirect. */ ri->tgt_index = ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } static const struct bpf_func_proto bpf_xdp_redirect_proto = { .func = bpf_xdp_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key, u64, flags) { return map->ops->map_redirect(map, key, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, unsigned long off, unsigned long len) { void *ptr = skb_header_pointer(skb, off, len, dst_buff); if (unlikely(!ptr)) return len; if (ptr != dst_buff) memcpy(dst_buff, ptr, len); return 0; } BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, u64, flags, void *, meta, u64, meta_size) { u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; if (unlikely(!skb || skb_size > skb->len)) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, bpf_skb_copy); } static const struct bpf_func_proto bpf_skb_event_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff) const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; } BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, u32, size, u64, flags) { const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; void *to_orig = to; int err; if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_TUNINFO_FLAGS)))) { err = -EINVAL; goto err_clear; } if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { err = -EPROTO; goto err_clear; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) { err = -EINVAL; switch (size) { case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. */ if (ip_tunnel_info_af(info) != AF_INET) goto err_clear; set_compat: to = (struct bpf_tunnel_key *)compat; break; default: goto err_clear; } } to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; if (flags & BPF_F_TUNINFO_FLAGS) to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags); else to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); memcpy(to->local_ipv6, &info->key.u.ipv6.dst, sizeof(to->local_ipv6)); to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst); memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3); to->tunnel_label = 0; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) memcpy(to_orig, to, size); return 0; err_clear: memset(to_orig, 0, size); return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .func = bpf_skb_get_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) { const struct ip_tunnel_info *info = skb_tunnel_info(skb); int err; if (unlikely(!info || !ip_tunnel_is_options_present(info->key.tun_flags))) { err = -ENOENT; goto err_clear; } if (unlikely(size < info->options_len)) { err = -ENOMEM; goto err_clear; } ip_tunnel_info_opts_get(to, info); if (size > info->options_len) memset(to + info->options_len, 0, size - info->options_len); return info->options_len; err_clear: memset(to, 0, size); return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { .func = bpf_skb_get_tunnel_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, }; static struct metadata_dst __percpu *md_dst; BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, const struct bpf_tunnel_key *, from, u32, size, u64, flags) { struct metadata_dst *md = this_cpu_ptr(md_dst); u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER | BPF_F_NO_TUNNEL_KEY))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. */ memcpy(compat, from, size); memset(compat + size, 0, sizeof(compat) - size); from = (const struct bpf_tunnel_key *) compat; break; default: return -EINVAL; } } if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || from->tunnel_ext)) return -EINVAL; skb_dst_drop(skb); dst_hold((struct dst_entry *) md); skb_dst_set(skb, (struct dst_entry *) md); info = &md->u.tun_info; memset(info, 0, sizeof(*info)); info->mode = IP_TUNNEL_INFO_TX; __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, flags & BPF_F_DONT_FRAGMENT); __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, !(flags & BPF_F_ZERO_CSUM_TX)); __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, flags & BPF_F_SEQ_NUMBER); __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, !(flags & BPF_F_NO_TUNNEL_KEY)); info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; if (flags & BPF_F_TUNINFO_IPV6) { info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); memcpy(&info->key.u.ipv6.src, from->local_ipv6, sizeof(from->local_ipv6)); info->key.label = cpu_to_be32(from->tunnel_label) & IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4); info->key.flow_flags = FLOWI_FLAG_ANYSRC; } return 0; } static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .func = bpf_skb_set_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, const u8 *, from, u32, size) { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); IP_TUNNEL_DECLARE_FLAGS(present) = { }; if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) return -EINVAL; if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; ip_tunnel_set_options_present(present); ip_tunnel_info_opts_set(info, from, size, present); return 0; } static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .func = bpf_skb_set_tunnel_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { struct metadata_dst __percpu *tmp; tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, METADATA_IP_TUNNEL, GFP_KERNEL); if (!tmp) return NULL; if (cmpxchg(&md_dst, NULL, tmp)) metadata_dst_free_percpu(tmp); } switch (which) { case BPF_FUNC_skb_set_tunnel_key: return &bpf_skb_set_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_opt: return &bpf_skb_set_tunnel_opt_proto; default: return NULL; } } BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; struct sock *sk; sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return -ENOENT; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; return sk_under_cgroup_hierarchy(sk, cgrp); } static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .func = bpf_skb_under_cgroup, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_SOCK_CGROUP_DATA static inline u64 __bpf_sk_cgroup_id(struct sock *sk) { struct cgroup *cgrp; sk = sk_to_full_sk(sk); if (!sk || !sk_fullsock(sk)) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return cgroup_id(cgrp); } BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { return __bpf_sk_cgroup_id(skb->sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .func = bpf_skb_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, int ancestor_level) { struct cgroup *ancestor; struct cgroup *cgrp; sk = sk_to_full_sk(sk); if (!sk || !sk_fullsock(sk)) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) return 0; return cgroup_id(ancestor); } BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, ancestor_level) { return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) { return __bpf_sk_cgroup_id(sk); } static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { .func = bpf_sk_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) { return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); } static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { .func = bpf_sk_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, }; #endif static unsigned long bpf_xdp_copy(void *dst, const void *ctx, unsigned long off, unsigned long len) { struct xdp_buff *xdp = (struct xdp_buff *)ctx; bpf_xdp_copy_buf(xdp, off, dst, len, false); return 0; } BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, u64, flags, void *, meta, u64, meta_size) { u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp))) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, bpf_xdp_copy); } static const struct bpf_func_proto bpf_xdp_event_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) { return skb->sk ? __sock_gen_cookie(skb->sk) : 0; } static const struct bpf_func_proto bpf_get_socket_cookie_proto = { .func = bpf_get_socket_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { .func = bpf_get_socket_cookie_sock_addr, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx) { return __sock_gen_cookie(ctx); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .func = bpf_get_socket_cookie_sock, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) { return sk ? sock_gen_cookie(sk) : 0; } const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { .func = bpf_get_socket_ptr_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL, }; BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { .func = bpf_get_socket_cookie_sock_ops, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static u64 __bpf_get_netns_cookie(struct sock *sk) { const struct net *net = sk ? sock_net(sk) : &init_net; return net->net_cookie; } BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) { return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_proto = { .func = bpf_get_netns_cookie, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) { return __bpf_get_netns_cookie(ctx); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { .func = bpf_get_netns_cookie_sock, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { .func = bpf_get_netns_cookie_sock_addr, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { .func = bpf_get_netns_cookie_sock_ops, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = { .func = bpf_get_netns_cookie_sk_msg, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); kuid_t kuid; if (!sk || !sk_fullsock(sk)) return overflowuid; kuid = sock_net_uid(sock_net(sk), sk); return from_kuid_munged(sock_net(sk)->user_ns, kuid); } static const struct bpf_func_proto bpf_get_socket_uid_proto = { .func = bpf_get_socket_uid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static int sol_socket_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { switch (optname) { case SO_REUSEADDR: case SO_SNDBUF: case SO_RCVBUF: case SO_KEEPALIVE: case SO_PRIORITY: case SO_REUSEPORT: case SO_RCVLOWAT: case SO_MARK: case SO_MAX_PACING_RATE: case SO_BINDTOIFINDEX: case SO_TXREHASH: if (*optlen != sizeof(int)) return -EINVAL; break; case SO_BINDTODEVICE: break; default: return -EINVAL; } if (getopt) { if (optname == SO_BINDTODEVICE) return -EINVAL; return sk_getsockopt(sk, SOL_SOCKET, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } return sk_setsockopt(sk, SOL_SOCKET, optname, KERNEL_SOCKPTR(optval), *optlen); } static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, char *optval, int optlen) { struct tcp_sock *tp = tcp_sk(sk); unsigned long timeout; int val; if (optlen != sizeof(int)) return -EINVAL; val = *(int *)optval; /* Only some options are supported */ switch (optname) { case TCP_BPF_IW: if (val <= 0 || tp->data_segs_out > tp->syn_data) return -EINVAL; tcp_snd_cwnd_set(tp, val); break; case TCP_BPF_SNDCWND_CLAMP: if (val <= 0) return -EINVAL; tp->snd_cwnd_clamp = val; tp->snd_ssthresh = val; break; case TCP_BPF_DELACK_MAX: timeout = usecs_to_jiffies(val); if (timeout > TCP_DELACK_MAX || timeout < TCP_TIMEOUT_MIN) return -EINVAL; inet_csk(sk)->icsk_delack_max = timeout; break; case TCP_BPF_RTO_MIN: timeout = usecs_to_jiffies(val); if (timeout > TCP_RTO_MIN || timeout < TCP_TIMEOUT_MIN) return -EINVAL; inet_csk(sk)->icsk_rto_min = timeout; break; case TCP_BPF_SOCK_OPS_CB_FLAGS: if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS)) return -EINVAL; tp->bpf_sock_ops_cb_flags = val; break; default: return -EINVAL; } return 0; } static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, int *optlen, bool getopt) { struct tcp_sock *tp; int ret; if (*optlen < 2) return -EINVAL; if (getopt) { if (!inet_csk(sk)->icsk_ca_ops) return -EINVAL; /* BPF expects NULL-terminated tcp-cc string */ optval[--(*optlen)] = '\0'; return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } /* "cdg" is the only cc that alloc a ptr * in inet_csk_ca area. The bpf-tcp-cc may * overwrite this ptr after switching to cdg. */ if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) return -ENOTSUPP; /* It stops this looping * * .init => bpf_setsockopt(tcp_cc) => .init => * bpf_setsockopt(tcp_cc)" => .init => .... * * The second bpf_setsockopt(tcp_cc) is not allowed * in order to break the loop when both .init * are the same bpf prog. * * This applies even the second bpf_setsockopt(tcp_cc) * does not cause a loop. This limits only the first * '.init' can call bpf_setsockopt(TCP_CONGESTION) to * pick a fallback cc (eg. peer does not support ECN) * and the second '.init' cannot fallback to * another. */ tp = tcp_sk(sk); if (tp->bpf_chg_cc_inprogress) return -EBUSY; tp->bpf_chg_cc_inprogress = 1; ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), *optlen); tp->bpf_chg_cc_inprogress = 0; return ret; } static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_protocol != IPPROTO_TCP) return -EINVAL; switch (optname) { case TCP_NODELAY: case TCP_MAXSEG: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPCNT: case TCP_SYNCNT: case TCP_WINDOW_CLAMP: case TCP_THIN_LINEAR_TIMEOUTS: case TCP_USER_TIMEOUT: case TCP_NOTSENT_LOWAT: case TCP_SAVE_SYN: if (*optlen != sizeof(int)) return -EINVAL; break; case TCP_CONGESTION: return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); case TCP_SAVED_SYN: if (*optlen < 1) return -EINVAL; break; case TCP_BPF_SOCK_OPS_CB_FLAGS: if (*optlen != sizeof(int)) return -EINVAL; if (getopt) { struct tcp_sock *tp = tcp_sk(sk); int cb_flags = tp->bpf_sock_ops_cb_flags; memcpy(optval, &cb_flags, *optlen); return 0; } return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); default: if (getopt) return -EINVAL; return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); } if (getopt) { if (optname == TCP_SAVED_SYN) { struct tcp_sock *tp = tcp_sk(sk); if (!tp->saved_syn || *optlen > tcp_saved_syn_len(tp->saved_syn)) return -EINVAL; memcpy(optval, tp->saved_syn->data, *optlen); /* It cannot free tp->saved_syn here because it * does not know if the user space still needs it. */ return 0; } return do_tcp_getsockopt(sk, SOL_TCP, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } return do_tcp_setsockopt(sk, SOL_TCP, optname, KERNEL_SOCKPTR(optval), *optlen); } static int sol_ip_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_family != AF_INET) return -EINVAL; switch (optname) { case IP_TOS: if (*optlen != sizeof(int)) return -EINVAL; break; default: return -EINVAL; } if (getopt) return do_ip_getsockopt(sk, SOL_IP, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); return do_ip_setsockopt(sk, SOL_IP, optname, KERNEL_SOCKPTR(optval), *optlen); } static int sol_ipv6_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_family != AF_INET6) return -EINVAL; switch (optname) { case IPV6_TCLASS: case IPV6_AUTOFLOWLABEL: if (*optlen != sizeof(int)) return -EINVAL; break; default: return -EINVAL; } if (getopt) return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, KERNEL_SOCKPTR(optval), *optlen); } static int __bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (!sk_fullsock(sk)) return -EINVAL; if (level == SOL_SOCKET) return sol_socket_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) return sol_ip_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) return sol_ipv6_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) return sol_tcp_sockopt(sk, optname, optval, &optlen, false); return -EINVAL; } static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (sk_fullsock(sk)) sock_owned_by_me(sk); return __bpf_setsockopt(sk, level, optname, optval, optlen); } static int __bpf_getsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { int err, saved_optlen = optlen; if (!sk_fullsock(sk)) { err = -EINVAL; goto done; } if (level == SOL_SOCKET) err = sol_socket_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) err = sol_ip_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); else err = -EINVAL; done: if (err) optlen = 0; if (optlen < saved_optlen) memset(optval + optlen, 0, saved_optlen - optlen); return err; } static int _bpf_getsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (sk_fullsock(sk)) sock_owned_by_me(sk); return __bpf_getsockopt(sk, level, optname, optval, optlen); } BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_sk_setsockopt_proto = { .func = bpf_sk_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return _bpf_getsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_sk_getsockopt_proto = { .func = bpf_sk_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return __bpf_setsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = { .func = bpf_unlocked_sk_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return __bpf_getsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = { .func = bpf_unlocked_sk_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .func = bpf_sock_addr_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { .func = bpf_sock_addr_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .func = bpf_sock_ops_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, int optname, const u8 **start) { struct sk_buff *syn_skb = bpf_sock->syn_skb; const u8 *hdr_start; int ret; if (syn_skb) { /* sk is a request_sock here */ if (optname == TCP_BPF_SYN) { hdr_start = syn_skb->data; ret = tcp_hdrlen(syn_skb); } else if (optname == TCP_BPF_SYN_IP) { hdr_start = skb_network_header(syn_skb); ret = skb_network_header_len(syn_skb) + tcp_hdrlen(syn_skb); } else { /* optname == TCP_BPF_SYN_MAC */ hdr_start = skb_mac_header(syn_skb); ret = skb_mac_header_len(syn_skb) + skb_network_header_len(syn_skb) + tcp_hdrlen(syn_skb); } } else { struct sock *sk = bpf_sock->sk; struct saved_syn *saved_syn; if (sk->sk_state == TCP_NEW_SYN_RECV) /* synack retransmit. bpf_sock->syn_skb will * not be available. It has to resort to * saved_syn (if it is saved). */ saved_syn = inet_reqsk(sk)->saved_syn; else saved_syn = tcp_sk(sk)->saved_syn; if (!saved_syn) return -ENOENT; if (optname == TCP_BPF_SYN) { hdr_start = saved_syn->data + saved_syn->mac_hdrlen + saved_syn->network_hdrlen; ret = saved_syn->tcp_hdrlen; } else if (optname == TCP_BPF_SYN_IP) { hdr_start = saved_syn->data + saved_syn->mac_hdrlen; ret = saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } else { /* optname == TCP_BPF_SYN_MAC */ /* TCP_SAVE_SYN may not have saved the mac hdr */ if (!saved_syn->mac_hdrlen) return -ENOENT; hdr_start = saved_syn->data; ret = saved_syn->mac_hdrlen + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } } *start = hdr_start; return ret; } BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { int ret, copy_len = 0; const u8 *start; ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start); if (ret > 0) { copy_len = ret; if (optlen < copy_len) { copy_len = optlen; ret = -ENOSPC; } memcpy(optval, start, copy_len); } /* Zero out unused buffer at the end */ memset(optval + copy_len, 0, optlen - copy_len); return ret; } return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { .func = bpf_sock_ops_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, int, argval) { struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; tcp_sk(sk)->bpf_sock_ops_cb_flags = val; return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); } static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { .func = bpf_sock_ops_cb_flags_set, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; EXPORT_SYMBOL_GPL(ipv6_bpf_stub); BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, int, addr_len) { #ifdef CONFIG_INET struct sock *sk = ctx->sk; u32 flags = BIND_FROM_BPF; int err; err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; if (((struct sockaddr_in *)addr)->sin_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; return __inet_bind(sk, addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ return -EAFNOSUPPORT; } static const struct bpf_func_proto bpf_bind_proto = { .func = bpf_bind, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; #ifdef CONFIG_XFRM #if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) struct metadata_dst __percpu *xfrm_bpf_md_dst; EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst); #endif BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, struct bpf_xfrm_state *, to, u32, size, u64, flags) { const struct sec_path *sp = skb_sec_path(skb); const struct xfrm_state *x; if (!sp || unlikely(index >= sp->len || flags)) goto err_clear; x = sp->xvec[index]; if (unlikely(size != sizeof(struct bpf_xfrm_state))) goto err_clear; to->reqid = x->props.reqid; to->spi = x->id.spi; to->family = x->props.family; to->ext = 0; if (to->family == AF_INET6) { memcpy(to->remote_ipv6, x->props.saddr.a6, sizeof(to->remote_ipv6)); } else { to->remote_ipv4 = x->props.saddr.a4; memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); } return 0; err_clear: memset(to, 0, size); return -EINVAL; } static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { .func = bpf_skb_get_xfrm_state, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; #endif #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu) { params->h_vlan_TCI = 0; params->h_vlan_proto = 0; if (mtu) params->mtu_result = mtu; /* union with tot_len */ return 0; } #endif #if IS_ENABLED(CONFIG_INET) static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 flags, bool check_mtu) { struct fib_nh_common *nhc; struct in_device *in_dev; struct neighbour *neigh; struct net_device *dev; struct fib_result res; struct flowi4 fl4; u32 mtu = 0; int err; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) return -ENODEV; /* verify forwarding is enabled on this interface */ in_dev = __in_dev_get_rcu(dev); if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl4.flowi4_iif = 1; fl4.flowi4_oif = params->ifindex; } else { fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } fl4.flowi4_tos = params->tos & INET_DSCP_MASK; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.flowi4_proto = params->l4_protocol; fl4.daddr = params->ipv4_dst; fl4.saddr = params->ipv4_src; fl4.fl4_sport = params->sport; fl4.fl4_dport = params->dport; fl4.flowi4_multipath_hash = 0; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; struct fib_table *tb; if (flags & BPF_FIB_LOOKUP_TBID) { tbid = params->tbid; /* zero out for vlan output */ params->tbid = 0; } tb = fib_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { if (flags & BPF_FIB_LOOKUP_MARK) fl4.flowi4_mark = params->mark; else fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); } if (err) { /* map fib lookup errors to RTN_ type */ if (err == -EINVAL) return BPF_FIB_LKUP_RET_BLACKHOLE; if (err == -EHOSTUNREACH) return BPF_FIB_LKUP_RET_UNREACHABLE; if (err == -EACCES) return BPF_FIB_LKUP_RET_PROHIBIT; return BPF_FIB_LKUP_RET_NOT_FWDED; } if (res.type != RTN_UNICAST) return BPF_FIB_LKUP_RET_NOT_FWDED; if (fib_info_num_path(res.fi) > 1) fib_select_path(net, &res, &fl4, NULL); if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); if (params->tot_len > mtu) { params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; } } nhc = res.nhc; /* do not handle lwt encaps right now */ if (nhc->nhc_lwtstate) return BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nhc->nhc_dev; params->rt_metric = res.fi->fib_priority; params->ifindex = dev->ifindex; if (flags & BPF_FIB_LOOKUP_SRC) params->ipv4_src = fib_result_prefsrc(net, &res); /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here */ if (likely(nhc->nhc_gw_family != AF_INET6)) { if (nhc->nhc_gw_family) params->ipv4_dst = nhc->nhc_gw.ipv4; } else { struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst; params->family = AF_INET6; *dst = nhc->nhc_gw.ipv6; } if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; if (likely(nhc->nhc_gw_family != AF_INET6)) neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); else neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst); if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); set_fwd_params: return bpf_fib_set_fwd_params(params, mtu); } #endif #if IS_ENABLED(CONFIG_IPV6) static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 flags, bool check_mtu) { struct in6_addr *src = (struct in6_addr *) params->ipv6_src; struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; struct fib6_result res = {}; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; struct flowi6 fl6; int strict = 0; int oif, err; u32 mtu = 0; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) return BPF_FIB_LKUP_RET_NOT_FWDED; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) return -ENODEV; idev = __in6_dev_get_safely(dev); if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl6.flowi6_iif = 1; oif = fl6.flowi6_oif = params->ifindex; } else { oif = fl6.flowi6_iif = params->ifindex; fl6.flowi6_oif = 0; strict = RT6_LOOKUP_F_HAS_SADDR; } fl6.flowlabel = params->flowinfo; fl6.flowi6_scope = 0; fl6.flowi6_flags = 0; fl6.mp_hash = 0; fl6.flowi6_proto = params->l4_protocol; fl6.daddr = *dst; fl6.saddr = *src; fl6.fl6_sport = params->sport; fl6.fl6_dport = params->dport; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; struct fib6_table *tb; if (flags & BPF_FIB_LOOKUP_TBID) { tbid = params->tbid; /* zero out for vlan output */ params->tbid = 0; } tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { if (flags & BPF_FIB_LOOKUP_MARK) fl6.flowi6_mark = params->mark; else fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict); } if (unlikely(err || IS_ERR_OR_NULL(res.f6i) || res.f6i == net->ipv6.fib6_null_entry)) return BPF_FIB_LKUP_RET_NOT_FWDED; switch (res.fib6_type) { /* only unicast is forwarded */ case RTN_UNICAST: break; case RTN_BLACKHOLE: return BPF_FIB_LKUP_RET_BLACKHOLE; case RTN_UNREACHABLE: return BPF_FIB_LKUP_RET_UNREACHABLE; case RTN_PROHIBIT: return BPF_FIB_LKUP_RET_PROHIBIT; default: return BPF_FIB_LKUP_RET_NOT_FWDED; } ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif, fl6.flowi6_oif != 0, NULL, strict); if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src); if (params->tot_len > mtu) { params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; } } if (res.nh->fib_nh_lws) return BPF_FIB_LKUP_RET_UNSUPP_LWT; if (res.nh->fib_nh_gw_family) *dst = res.nh->fib_nh_gw6; dev = res.nh->fib_nh_dev; params->rt_metric = res.f6i->fib6_metric; params->ifindex = dev->ifindex; if (flags & BPF_FIB_LOOKUP_SRC) { if (res.f6i->fib6_prefsrc.plen) { *src = res.f6i->fib6_prefsrc.addr; } else { err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev, &fl6.daddr, 0, src); if (err) return BPF_FIB_LKUP_RET_NO_SRC_ADDR; } } if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. */ neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); set_fwd_params: return bpf_fib_set_fwd_params(params, mtu); } #endif #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) { if (plen < sizeof(*params)) return -EINVAL; if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, flags, true); #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, flags, true); #endif } return -EAFNOSUPPORT; } static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { .func = bpf_xdp_fib_lookup, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { struct net *net = dev_net(skb->dev); int rc = -EAFNOSUPPORT; bool check_mtu = false; if (plen < sizeof(*params)) return -EINVAL; if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; if (params->tot_len) check_mtu = true; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); break; #endif } if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { struct net_device *dev; /* When tot_len isn't provided by user, check skb * against MTU of FIB lookup resulting net_device */ dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; params->mtu_result = dev->mtu; /* union with tot_len */ } return rc; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .func = bpf_skb_fib_lookup, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; static struct net_device *__dev_via_ifindex(struct net_device *dev_curr, u32 ifindex) { struct net *netns = dev_net(dev_curr); /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */ if (ifindex == 0) return dev_curr; return dev_get_by_index_rcu(netns, ifindex); } BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) { int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; struct net_device *dev = skb->dev; int mtu, dev_len, skb_len; if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) return -EINVAL; if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len))) return -EINVAL; dev = __dev_via_ifindex(dev, ifindex); if (unlikely(!dev)) return -ENODEV; mtu = READ_ONCE(dev->mtu); dev_len = mtu + dev->hard_header_len; /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len; skb_len += len_diff; /* minus result pass check */ if (skb_len <= dev_len) { ret = BPF_MTU_CHK_RET_SUCCESS; goto out; } /* At this point, skb->len exceed MTU, but as it include length of all * segments, it can still be below MTU. The SKB can possibly get * re-segmented in transmit path (see validate_xmit_skb). Thus, user * must choose if segs are to be MTU checked. */ if (skb_is_gso(skb)) { ret = BPF_MTU_CHK_RET_SUCCESS; if (flags & BPF_MTU_CHK_SEGS && !skb_gso_validate_network_len(skb, mtu)) ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; } out: *mtu_len = mtu; return ret; } BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) { struct net_device *dev = xdp->rxq->dev; int xdp_len = xdp->data_end - xdp->data; int ret = BPF_MTU_CHK_RET_SUCCESS; int mtu, dev_len; /* XDP variant doesn't support multi-buffer segment check (yet) */ if (unlikely(flags)) return -EINVAL; dev = __dev_via_ifindex(dev, ifindex); if (unlikely(!dev)) return -ENODEV; mtu = READ_ONCE(dev->mtu); dev_len = mtu + dev->hard_header_len; /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ if (*mtu_len) xdp_len = *mtu_len + dev->hard_header_len; xdp_len += len_diff; /* minus result pass check */ if (xdp_len > dev_len) ret = BPF_MTU_CHK_RET_FRAG_NEEDED; *mtu_len = mtu; return ret; } static const struct bpf_func_proto bpf_skb_check_mtu_proto = { .func = bpf_skb_check_mtu, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { .func = bpf_xdp_check_mtu, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) { int err; struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; if (!seg6_validate_srh(srh, len, false)) return -EINVAL; switch (type) { case BPF_LWT_ENCAP_SEG6_INLINE: if (skb->protocol != htons(ETH_P_IPV6)) return -EBADMSG; err = seg6_do_srh_inline(skb, srh); break; case BPF_LWT_ENCAP_SEG6: skb_reset_inner_headers(skb); skb->encapsulation = 1; err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); break; default: return -EINVAL; } bpf_compute_data_pointers(skb); if (err) return err; skb_set_transport_header(skb, sizeof(struct ipv6hdr)); return seg6_lookup_nexthop(skb, NULL, 0); } #endif /* CONFIG_IPV6_SEG6_BPF */ #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) { return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); } #endif BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_LWT_ENCAP_SEG6: case BPF_LWT_ENCAP_SEG6_INLINE: return bpf_push_seg6_encap(skb, type, hdr, len); #endif #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) case BPF_LWT_ENCAP_IP: return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); #endif default: return -EINVAL; } } BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) case BPF_LWT_ENCAP_IP: return bpf_push_ip_encap(skb, hdr, len, false /* egress */); #endif default: return -EINVAL; } } static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { .func = bpf_lwt_in_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { .func = bpf_lwt_xmit_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_tlvs, *srh_end, *ptr; int srhoff = 0; lockdep_assert_held(&srh_state->bh_lock); if (srh == NULL) return -EINVAL; srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (ptr >= srh_tlvs && ptr + len <= srh_end) srh_state->valid = false; else if (ptr < (void *)&srh->flags || ptr + len > (void *)&srh->segments) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return -EINVAL; srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); memcpy(skb->data + offset, from, len); return 0; } static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .func = bpf_lwt_seg6_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; static void bpf_update_srh_state(struct sk_buff *skb) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); int srhoff = 0; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { srh_state->srh = NULL; } else { srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen = srh_state->srh->hdrlen << 3; srh_state->valid = true; } } BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, u32, action, void *, param, u32, param_len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); int hdroff = 0; int err; lockdep_assert_held(&srh_state->bh_lock); switch (action) { case SEG6_LOCAL_ACTION_END_X: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(struct in6_addr)) return -EINVAL; return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); case SEG6_LOCAL_ACTION_END_T: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_DT6: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) return -EBADMSG; if (!pskb_pull(skb, hdroff)) return -EBADMSG; skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; bpf_compute_data_pointers(skb); bpf_update_srh_state(skb); return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_B6: if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, param, param_len); if (!err) bpf_update_srh_state(skb); return err; case SEG6_LOCAL_ACTION_END_B6_ENCAP: if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, param, param_len); if (!err) bpf_update_srh_state(skb); return err; default: return -EINVAL; } } static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .func = bpf_lwt_seg6_action, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, s32, len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_end, *srh_tlvs, *ptr; struct ipv6hdr *hdr; int srhoff = 0; int ret; lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return -EINVAL; srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + ((srh->first_segment + 1) << 4)); srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (unlikely(ptr < srh_tlvs || ptr > srh_end)) return -EFAULT; if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) return -EFAULT; if (len > 0) { ret = skb_cow_head(skb, len); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, offset, len); } else { ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); } bpf_compute_data_pointers(skb); if (unlikely(ret < 0)) return ret; hdr = (struct ipv6hdr *)skb->data; hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return -EINVAL; srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen += len; srh_state->valid = false; return 0; } static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { .func = bpf_lwt_seg6_adjust_srh, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; #endif /* CONFIG_IPV6_SEG6_BPF */ #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; bool refcounted = false; struct sock *sk = NULL; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) sk = __inet_lookup(net, hinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, net->ipv4.udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) sk = __inet6_lookup(net, hinfo, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); else if (likely(ipv6_bpf_stub)) sk = ipv6_bpf_stub->udp6_lib_lookup(net, src6, tuple->ipv6.sport, dst6, tuple->ipv6.dport, dif, sdif, net->ipv4.udp_table, NULL); #endif } if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); sk = NULL; } return sk; } /* bpf_skc_lookup performs the core lookup for different types of sockets, * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. */ static struct sock * __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, u64 flags, int sdif) { struct sock *sk = NULL; struct net *net; u8 family; if (len == sizeof(tuple->ipv4)) family = AF_INET; else if (len == sizeof(tuple->ipv6)) family = AF_INET6; else return NULL; if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; if (sdif < 0) { if (family == AF_INET) sdif = inet_sdif(skb); else sdif = inet6_sdif(skb); } if ((s32)netns_id < 0) { net = caller_net; sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); } out: return sk; } static struct sock * __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, u64 flags, int sdif) { struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, netns_id, flags, sdif); if (sk) { struct sock *sk2 = sk_to_full_sk(sk); /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); return NULL; } sk = sk2; } } return sk; } static struct sock * bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, u8 proto, u64 netns_id, u64 flags) { struct net *caller_net; int ifindex; if (skb->dev) { caller_net = dev_net(skb->dev); ifindex = skb->dev->ifindex; } else { caller_net = sock_net(skb->sk); ifindex = 0; } return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, netns_id, flags, -1); } static struct sock * bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, u8 proto, u64 netns_id, u64 flags) { struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, flags); if (sk) { struct sock *sk2 = sk_to_full_sk(sk); /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); return NULL; } sk = sk2; } } return sk; } BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); } static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .func = bpf_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .func = bpf_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .func = bpf_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { .func = bpf_tc_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { .func = bpf_tc_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { .func = bpf_tc_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_release, struct sock *, sk) { if (sk && sk_is_refcounted(sk)) sock_gen_put(sk); return 0; } static const struct bpf_func_proto bpf_sk_release_proto = { .func = bpf_sk_release, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE, }; BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .func = bpf_xdp_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .func = bpf_xdp_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .func = bpf_xdp_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .func = bpf_sock_addr_skc_lookup_tcp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .func = bpf_sock_addr_sk_lookup_tcp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_UDP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .func = bpf_sock_addr_sk_lookup_udp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, icsk_retransmits)) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct bpf_tcp_sock, bytes_received): case offsetof(struct bpf_tcp_sock, bytes_acked): return size == sizeof(__u64); default: return size == sizeof(__u32); } } u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #define BPF_TCP_SOCK_GET_COMMON(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \ sizeof_field(struct bpf_tcp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ si->dst_reg, si->src_reg, \ offsetof(struct tcp_sock, FIELD)); \ } while (0) #define BPF_INET_SOCK_GET_COMMON(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \ FIELD) > \ sizeof_field(struct bpf_tcp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct inet_connection_sock, \ FIELD), \ si->dst_reg, si->src_reg, \ offsetof( \ struct inet_connection_sock, \ FIELD)); \ } while (0) BTF_TYPE_EMIT(struct bpf_tcp_sock); switch (si->off) { case offsetof(struct bpf_tcp_sock, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct tcp_sock, rtt_min) + offsetof(struct minmax_sample, v)); break; case offsetof(struct bpf_tcp_sock, snd_cwnd): BPF_TCP_SOCK_GET_COMMON(snd_cwnd); break; case offsetof(struct bpf_tcp_sock, srtt_us): BPF_TCP_SOCK_GET_COMMON(srtt_us); break; case offsetof(struct bpf_tcp_sock, snd_ssthresh): BPF_TCP_SOCK_GET_COMMON(snd_ssthresh); break; case offsetof(struct bpf_tcp_sock, rcv_nxt): BPF_TCP_SOCK_GET_COMMON(rcv_nxt); break; case offsetof(struct bpf_tcp_sock, snd_nxt): BPF_TCP_SOCK_GET_COMMON(snd_nxt); break; case offsetof(struct bpf_tcp_sock, snd_una): BPF_TCP_SOCK_GET_COMMON(snd_una); break; case offsetof(struct bpf_tcp_sock, mss_cache): BPF_TCP_SOCK_GET_COMMON(mss_cache); break; case offsetof(struct bpf_tcp_sock, ecn_flags): BPF_TCP_SOCK_GET_COMMON(ecn_flags); break; case offsetof(struct bpf_tcp_sock, rate_delivered): BPF_TCP_SOCK_GET_COMMON(rate_delivered); break; case offsetof(struct bpf_tcp_sock, rate_interval_us): BPF_TCP_SOCK_GET_COMMON(rate_interval_us); break; case offsetof(struct bpf_tcp_sock, packets_out): BPF_TCP_SOCK_GET_COMMON(packets_out); break; case offsetof(struct bpf_tcp_sock, retrans_out): BPF_TCP_SOCK_GET_COMMON(retrans_out); break; case offsetof(struct bpf_tcp_sock, total_retrans): BPF_TCP_SOCK_GET_COMMON(total_retrans); break; case offsetof(struct bpf_tcp_sock, segs_in): BPF_TCP_SOCK_GET_COMMON(segs_in); break; case offsetof(struct bpf_tcp_sock, data_segs_in): BPF_TCP_SOCK_GET_COMMON(data_segs_in); break; case offsetof(struct bpf_tcp_sock, segs_out): BPF_TCP_SOCK_GET_COMMON(segs_out); break; case offsetof(struct bpf_tcp_sock, data_segs_out): BPF_TCP_SOCK_GET_COMMON(data_segs_out); break; case offsetof(struct bpf_tcp_sock, lost_out): BPF_TCP_SOCK_GET_COMMON(lost_out); break; case offsetof(struct bpf_tcp_sock, sacked_out): BPF_TCP_SOCK_GET_COMMON(sacked_out); break; case offsetof(struct bpf_tcp_sock, bytes_received): BPF_TCP_SOCK_GET_COMMON(bytes_received); break; case offsetof(struct bpf_tcp_sock, bytes_acked): BPF_TCP_SOCK_GET_COMMON(bytes_acked); break; case offsetof(struct bpf_tcp_sock, dsack_dups): BPF_TCP_SOCK_GET_COMMON(dsack_dups); break; case offsetof(struct bpf_tcp_sock, delivered): BPF_TCP_SOCK_GET_COMMON(delivered); break; case offsetof(struct bpf_tcp_sock, delivered_ce): BPF_TCP_SOCK_GET_COMMON(delivered_ce); break; case offsetof(struct bpf_tcp_sock, icsk_retransmits): BPF_INET_SOCK_GET_COMMON(icsk_retransmits); break; } return insn - insn_buf; } BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) { if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_tcp_sock_proto = { .func = bpf_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) { sk = sk_to_full_sk(sk); if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) return (unsigned long)sk; return (unsigned long)NULL; } static const struct bpf_func_proto bpf_get_listener_sock_proto = { .func = bpf_get_listener_sock, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) { unsigned int iphdr_len; switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): iphdr_len = sizeof(struct iphdr); break; case cpu_to_be16(ETH_P_IPV6): iphdr_len = sizeof(struct ipv6hdr); break; default: return 0; } if (skb_headlen(skb) < iphdr_len) return 0; if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) return 0; return INET_ECN_set_ce(skb); } bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) return false; if (off % size != 0) return false; switch (off) { default: return size == sizeof(__u32); } } u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #define BPF_XDP_SOCK_GET(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \ sizeof_field(struct bpf_xdp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ si->dst_reg, si->src_reg, \ offsetof(struct xdp_sock, FIELD)); \ } while (0) switch (si->off) { case offsetof(struct bpf_xdp_sock, queue_id): BPF_XDP_SOCK_GET(queue_id); break; } return insn - insn_buf; } static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .func = bpf_skb_ecn_set_ce, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len, struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES int ret; if (unlikely(!sk || th_len < sizeof(*th))) return -EINVAL; /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */ if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -EINVAL; if (!th->ack || th->rst || th->syn) return -ENOENT; if (unlikely(iph_len < sizeof(struct iphdr))) return -EINVAL; if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). */ switch (((struct iphdr *)iph)->version) { case 4: if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; ret = __cookie_v4_check((struct iphdr *)iph, th); break; #if IS_BUILTIN(CONFIG_IPV6) case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; if (sk->sk_family != AF_INET6) return -EINVAL; ret = __cookie_v6_check((struct ipv6hdr *)iph, th); break; #endif /* CONFIG_IPV6 */ default: return -EPROTONOSUPPORT; } if (ret > 0) return 0; return -ENOENT; #else return -ENOTSUPP; #endif } static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .func = bpf_tcp_check_syncookie, .gpl_only = true, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES u32 cookie; u16 mss; if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -ENOENT; if (!th->syn || th->ack || th->fin || th->rst) return -EINVAL; if (unlikely(iph_len < sizeof(struct iphdr))) return -EINVAL; /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). */ switch (((struct iphdr *)iph)->version) { case 4: if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); break; #if IS_BUILTIN(CONFIG_IPV6) case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; if (sk->sk_family != AF_INET6) return -EINVAL; mss = tcp_v6_get_syncookie(sk, iph, th, &cookie); break; #endif /* CONFIG_IPV6 */ default: return -EPROTONOSUPPORT; } if (mss == 0) return -ENOENT; return cookie | ((u64)mss << 32); #else return -EOPNOTSUPP; #endif /* CONFIG_SYN_COOKIES */ } static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .func = bpf_tcp_gen_syncookie, .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) { if (!sk || flags != 0) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EOPNOTSUPP; if (unlikely(dev_net(skb->dev) != sock_net(sk))) return -ENETUNREACH; if (sk_unhashed(sk)) return -EOPNOTSUPP; if (sk_is_refcounted(sk) && unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) return -ENOENT; skb_orphan(skb); skb->sk = sk; skb->destructor = sock_pfree; return 0; } static const struct bpf_func_proto bpf_sk_assign_proto = { .func = bpf_sk_assign, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg3_type = ARG_ANYTHING, }; static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend, u8 search_kind, const u8 *magic, u8 magic_len, bool *eol) { u8 kind, kind_len; *eol = false; while (op < opend) { kind = op[0]; if (kind == TCPOPT_EOL) { *eol = true; return ERR_PTR(-ENOMSG); } else if (kind == TCPOPT_NOP) { op++; continue; } if (opend - op < 2 || opend - op < op[1] || op[1] < 2) /* Something is wrong in the received header. * Follow the TCP stack's tcp_parse_options() * and just bail here. */ return ERR_PTR(-EFAULT); kind_len = op[1]; if (search_kind == kind) { if (!magic_len) return op; if (magic_len > kind_len - 2) return ERR_PTR(-ENOMSG); if (!memcmp(&op[2], magic, magic_len)) return op; } op += kind_len; } return ERR_PTR(-ENOMSG); } BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, void *, search_res, u32, len, u64, flags) { bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN; const u8 *op, *opend, *magic, *search = search_res; u8 search_kind, search_len, copy_len, magic_len; int ret; /* 2 byte is the minimal option len except TCPOPT_NOP and * TCPOPT_EOL which are useless for the bpf prog to learn * and this helper disallow loading them also. */ if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN) return -EINVAL; search_kind = search[0]; search_len = search[1]; if (search_len > len || search_kind == TCPOPT_NOP || search_kind == TCPOPT_EOL) return -EINVAL; if (search_kind == TCPOPT_EXP || search_kind == 253) { /* 16 or 32 bit magic. +2 for kind and kind length */ if (search_len != 4 && search_len != 6) return -EINVAL; magic = &search[2]; magic_len = search_len - 2; } else { if (search_len) return -EINVAL; magic = NULL; magic_len = 0; } if (load_syn) { ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op); if (ret < 0) return ret; opend = op + ret; op += sizeof(struct tcphdr); } else { if (!bpf_sock->skb || bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) /* This bpf_sock->op cannot call this helper */ return -EPERM; opend = bpf_sock->skb_data_end; op = bpf_sock->skb->data + sizeof(struct tcphdr); } op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len, &eol); if (IS_ERR(op)) return PTR_ERR(op); copy_len = op[1]; ret = copy_len; if (copy_len > len) { ret = -ENOSPC; copy_len = len; } memcpy(search_res, op, copy_len); return ret; } static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { .func = bpf_sock_ops_load_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, const void *, from, u32, len, u64, flags) { u8 new_kind, new_kind_len, magic_len = 0, *opend; const u8 *op, *new_op, *magic = NULL; struct sk_buff *skb; bool eol; if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB) return -EPERM; if (len < 2 || flags) return -EINVAL; new_op = from; new_kind = new_op[0]; new_kind_len = new_op[1]; if (new_kind_len > len || new_kind == TCPOPT_NOP || new_kind == TCPOPT_EOL) return -EINVAL; if (new_kind_len > bpf_sock->remaining_opt_len) return -ENOSPC; /* 253 is another experimental kind */ if (new_kind == TCPOPT_EXP || new_kind == 253) { if (new_kind_len < 4) return -EINVAL; /* Match for the 2 byte magic also. * RFC 6994: the magic could be 2 or 4 bytes. * Hence, matching by 2 byte only is on the * conservative side but it is the right * thing to do for the 'search-for-duplication' * purpose. */ magic = &new_op[2]; magic_len = 2; } /* Check for duplication */ skb = bpf_sock->skb; op = skb->data + sizeof(struct tcphdr); opend = bpf_sock->skb_data_end; op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len, &eol); if (!IS_ERR(op)) return -EEXIST; if (PTR_ERR(op) != -ENOMSG) return PTR_ERR(op); if (eol) /* The option has been ended. Treat it as no more * header option can be written. */ return -ENOSPC; /* No duplication found. Store the header option. */ memcpy(opend, from, new_kind_len); bpf_sock->remaining_opt_len -= new_kind_len; bpf_sock->skb_data_end += new_kind_len; return 0; } static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { .func = bpf_sock_ops_store_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, u32, len, u64, flags) { if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB) return -EPERM; if (flags || len < 2) return -EINVAL; if (len > bpf_sock->remaining_opt_len) return -ENOSPC; bpf_sock->remaining_opt_len -= len; return 0; } static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .func = bpf_sock_ops_reserve_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, u64, tstamp, u32, tstamp_type) { /* skb_clear_delivery_time() is done for inet protocol */ if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -EOPNOTSUPP; switch (tstamp_type) { case BPF_SKB_CLOCK_REALTIME: skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_REALTIME; break; case BPF_SKB_CLOCK_MONOTONIC: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_MONOTONIC; break; case BPF_SKB_CLOCK_TAI: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_TAI; break; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { .func = bpf_skb_set_tstamp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_SYN_COOKIES BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th, u32, th_len) { u32 cookie; u16 mss; if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT; cookie = __cookie_v4_init_sequence(iph, th, &mss); return cookie | ((u64)mss << 32); } static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { .func = bpf_tcp_raw_gen_syncookie_ipv4, .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct iphdr), .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th, u32, th_len) { #if IS_BUILTIN(CONFIG_IPV6) const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); u32 cookie; u16 mss; if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; mss = tcp_parse_mss_option(th, 0) ?: mss_clamp; cookie = __cookie_v6_init_sequence(iph, th, &mss); return cookie | ((u64)mss << 32); #else return -EPROTONOSUPPORT; #endif } static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { .func = bpf_tcp_raw_gen_syncookie_ipv6, .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct ipv6hdr), .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th) { if (__cookie_v4_check(iph, th) > 0) return 0; return -EACCES; } static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { .func = bpf_tcp_raw_check_syncookie_ipv4, .gpl_only = true, /* __cookie_v4_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct iphdr), .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg2_size = sizeof(struct tcphdr), }; BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th) { #if IS_BUILTIN(CONFIG_IPV6) if (__cookie_v6_check(iph, th) > 0) return 0; return -EACCES; #else return -EPROTONOSUPPORT; #endif } static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { .func = bpf_tcp_raw_check_syncookie_ipv6, .gpl_only = true, /* __cookie_v6_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct ipv6hdr), .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg2_size = sizeof(struct tcphdr), }; #endif /* CONFIG_SYN_COOKIES */ #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_clone_redirect: case BPF_FUNC_l3_csum_replace: case BPF_FUNC_l4_csum_replace: case BPF_FUNC_lwt_push_encap: case BPF_FUNC_lwt_seg6_action: case BPF_FUNC_lwt_seg6_adjust_srh: case BPF_FUNC_lwt_seg6_store_bytes: case BPF_FUNC_msg_pop_data: case BPF_FUNC_msg_pull_data: case BPF_FUNC_msg_push_data: case BPF_FUNC_skb_adjust_room: case BPF_FUNC_skb_change_head: case BPF_FUNC_skb_change_proto: case BPF_FUNC_skb_change_tail: case BPF_FUNC_skb_pull_data: case BPF_FUNC_skb_store_bytes: case BPF_FUNC_skb_vlan_pop: case BPF_FUNC_skb_vlan_push: case BPF_FUNC_store_hdr_opt: case BPF_FUNC_xdp_adjust_head: case BPF_FUNC_xdp_adjust_meta: case BPF_FUNC_xdp_adjust_tail: /* tail-called program could call any of the above */ case BPF_FUNC_tail_call: return true; default: return false; } } const struct bpf_func_proto bpf_event_output_data_proto __weak; const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; func_proto = cgroup_current_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; func_proto = cgroup_current_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_bind: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_bind_proto; default: return NULL; } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_addr_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sock_addr_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_sock_addr_skc_lookup_tcp_proto; #endif /* CONFIG_INET */ case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_setsockopt_proto; default: return NULL; } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_getsockopt_proto; default: return NULL; } default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_sk_storage_get_proto __weak; const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; case BPF_FUNC_sk_cgroup_id: return &bpf_sk_cgroup_id_proto; case BPF_FUNC_sk_ancestor_cgroup_id: return &bpf_sk_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; #endif default: return sk_filter_func_proto(func_id, prog); } } static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_csum_level: return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_skb_vlan_push: return &bpf_skb_vlan_push_proto; case BPF_FUNC_skb_vlan_pop: return &bpf_skb_vlan_pop_proto; case BPF_FUNC_skb_change_proto: return &bpf_skb_change_proto_proto; case BPF_FUNC_skb_change_type: return &bpf_skb_change_type_proto; case BPF_FUNC_skb_adjust_room: return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &bpf_skb_change_head_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_redirect_neigh: return &bpf_redirect_neigh_proto; case BPF_FUNC_redirect_peer: return &bpf_redirect_peer_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; case BPF_FUNC_set_hash: return &bpf_set_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; case BPF_FUNC_check_mtu: return &bpf_skb_check_mtu_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_skb_cgroup_classid: return &bpf_skb_cgroup_classid_proto; #endif #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_tc_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_tc_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_tc_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; case BPF_FUNC_skb_set_tstamp: return &bpf_skb_set_tstamp_proto; #ifdef CONFIG_SYN_COOKIES case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: return &bpf_tcp_raw_gen_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: return &bpf_tcp_raw_gen_syncookie_ipv6_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv4: return &bpf_tcp_raw_check_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv6: return &bpf_tcp_raw_check_syncookie_ipv6_proto; #endif #endif default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_xdp_get_buff_len: return &bpf_xdp_get_buff_len_proto; case BPF_FUNC_xdp_load_bytes: return &bpf_xdp_load_bytes_proto; case BPF_FUNC_xdp_store_bytes: return &bpf_xdp_store_bytes_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; case BPF_FUNC_check_mtu: return &bpf_xdp_check_mtu_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_udp: return &bpf_xdp_sk_lookup_udp_proto; case BPF_FUNC_sk_lookup_tcp: return &bpf_xdp_sk_lookup_tcp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_xdp_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; #ifdef CONFIG_SYN_COOKIES case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: return &bpf_tcp_raw_gen_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: return &bpf_tcp_raw_gen_syncookie_ipv6_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv4: return &bpf_tcp_raw_check_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv6: return &bpf_tcp_raw_check_syncookie_ipv6_proto; #endif #endif default: return bpf_sk_base_func_proto(func_id, prog); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The * kfuncs are defined in two different modules, and we want to be able * to use them interchangeably with the same BTF type ID. Because modules * can't de-duplicate BTF IDs between each other, we need the type to be * referenced in the vmlinux BTF or the verifier will get confused about * the different types. So we add this dummy type reference which will * be included in vmlinux BTF, allowing both modules to refer to the * same type ID. */ BTF_TYPE_EMIT(struct nf_conn___init); #endif } const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; case BPF_FUNC_sock_hash_update: return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_ops_proto; #ifdef CONFIG_INET case BPF_FUNC_load_hdr_opt: return &bpf_sock_ops_load_hdr_opt_proto; case BPF_FUNC_store_hdr_opt: return &bpf_sock_ops_store_hdr_opt_proto; case BPF_FUNC_reserve_hdr_opt: return &bpf_sock_ops_reserve_hdr_opt_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; static const struct bpf_func_proto * sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; case BPF_FUNC_msg_redirect_hash: return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; case BPF_FUNC_msg_pop_data: return &bpf_msg_pop_data_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sk_msg_proto; #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; static const struct bpf_func_proto * sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &sk_skb_change_head_proto; case BPF_FUNC_skb_adjust_room: return &sk_skb_adjust_room_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; #endif default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_lwt_push_encap: return &bpf_lwt_in_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &bpf_skb_change_head_proto; case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_csum_level: return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; case BPF_FUNC_lwt_push_encap: return &bpf_lwt_xmit_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_FUNC_lwt_seg6_store_bytes: return &bpf_lwt_seg6_store_bytes_proto; case BPF_FUNC_lwt_seg6_action: return &bpf_lwt_seg6_action_proto; case BPF_FUNC_lwt_seg6_adjust_srh: return &bpf_lwt_seg6_adjust_srh_proto; #endif default: return lwt_out_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct __sk_buff)) return false; /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (info->is_ldsx || size != size_default) return false; break; case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): if (size != size_default) return false; break; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; case bpf_ctx_range(struct __sk_buff, hwtstamp): if (type == BPF_WRITE || size != sizeof(__u64)) return false; break; case bpf_ctx_range(struct __sk_buff, tstamp): if (size != sizeof(__u64)) return false; break; case offsetof(struct __sk_buff, sk): if (type == BPF_WRITE || size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; case offsetof(struct __sk_buff, tstamp_type): return false; case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { if (size != size_default) return false; } else { bpf_ctx_record_field_size(info, size_default); if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } } return true; } static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool cg_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, wire_len): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } /* Attach type specific accesses */ static bool __sock_filter_check_attach_type(int off, enum bpf_access_type access_type, enum bpf_attach_type attach_type) { switch (off) { case offsetof(struct bpf_sock, bound_dev_if): case offsetof(struct bpf_sock, mark): case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: goto full_access; default: return false; } case bpf_ctx_range(struct bpf_sock, src_ip4): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): switch (attach_type) { case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range(struct bpf_sock, src_port): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } } read_only: return access_type == BPF_READ; full_access: return true; } bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range_till(struct bpf_sock, type, priority): return false; default: return bpf_sock_is_valid_access(off, size, type, info); } } bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct bpf_sock, state): case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case bpf_ctx_range(struct bpf_sock, dst_port): field_size = size == size_default ? size_default : sizeof_field(struct bpf_sock, dst_port); bpf_ctx_record_field_size(info, field_size); return bpf_ctx_narrow_access_ok(off, size, field_size); case offsetofend(struct bpf_sock, dst_port) ... offsetof(struct bpf_sock, dst_ip4) - 1: return false; } return size == size_default; } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (!bpf_sock_is_valid_access(off, size, type, info)) return false; return __sock_filter_check_attach_type(off, type, prog->expected_attach_type); } static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { /* Neither direct read nor direct write requires any preliminary * action. */ return 0; } static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { struct bpf_insn *insn = insn_buf; if (!direct_write) return 0; /* if (!skb->cloned) * goto start; * * (Fast-path, otherwise approximation that we might be * a clone, do the rest in helper.) */ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); /* ret = bpf_skb_pull_data(skb, 0); */ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_pull_data); /* if (!ret) * goto restore; * return TC_ACT_SHOT; */ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict); *insn++ = BPF_EXIT_INSN(); /* restore: */ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); /* start: */ *insn++ = prog->insnsi[0]; return insn - insn_buf; } static int bpf_gen_ld_abs(const struct bpf_insn *orig, struct bpf_insn *insn_buf) { bool indirect = BPF_MODE(orig->code) == BPF_IND; struct bpf_insn *insn = insn_buf; if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); } else { *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); if (orig->imm) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); } /* We're guaranteed here that CTX is in R6. */ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); switch (BPF_SIZE(orig->code)) { case BPF_B: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); break; case BPF_H: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); break; case BPF_W: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); break; } *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); *insn++ = BPF_EXIT_INSN(); return insn - insn_buf; } static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); } static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, queue_mapping): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; case offsetof(struct __sk_buff, tstamp_type): /* The convert_ctx_access() on reading and writing * __sk_buff->tstamp depends on whether the bpf prog * has used __sk_buff->tstamp_type or not. * Thus, we need to set prog->tstamp_type_access * earlier during is_valid_access() here. */ ((struct bpf_prog *)prog)->tstamp_type_access = 1; return size == sizeof(__u8); } return bpf_skb_is_valid_access(off, size, type, prog, info); } DEFINE_MUTEX(nf_conn_btf_access_lock); EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); EXPORT_SYMBOL_GPL(nfct_btf_struct_access); static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; } static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) return false; if (off % size != 0) return false; if (size != sizeof(__u32)) return false; return true; } static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (prog->expected_attach_type != BPF_XDP_DEVMAP) { switch (off) { case offsetof(struct xdp_md, egress_ifindex): return false; } } if (type == BPF_WRITE) { if (bpf_prog_is_offloaded(prog->aux)) { switch (off) { case offsetof(struct xdp_md, rx_queue_index): return __is_valid_xdp_access(off, size); } } return false; } else { switch (off) { case offsetof(struct xdp_md, data_meta): case offsetof(struct xdp_md, data): case offsetof(struct xdp_md, data_end): if (info->is_ldsx) return false; } } switch (off) { case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; case offsetof(struct xdp_md, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return __is_valid_xdp_access(off, size); } void bpf_warn_invalid_xdp_action(const struct net_device *dev, const struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n", act > act_max ? "Illegal" : "Driver unsupported", act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A"); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static int xdp_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; } static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_addr)) return false; if (off % size != 0) return false; /* Disallow access to fields not belonging to the attach type's address * family. */ switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; default: return false; } break; case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP4_SENDMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP6_SENDMSG: break; default: return false; } break; } switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, user_ip6)) return true; if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, msg_src_ip6)) return true; if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, user_ip6)) return true; if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, msg_src_ip6)) return true; if (size != size_default) return false; } break; case offsetof(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; default: if (type == BPF_READ) { if (size != size_default) return false; } else { return false; } } return true; } static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): case offsetof(struct bpf_sock_ops, sk_txhash): if (size != size_default) return false; break; default: return false; } } else { switch (off) { case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, bytes_acked): if (size != sizeof(__u64)) return false; break; case offsetof(struct bpf_sock_ops, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; case offsetof(struct bpf_sock_ops, skb_data): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; case offsetof(struct bpf_sock_ops, skb_data_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; break; case offsetof(struct bpf_sock_ops, skb_tcp_flags): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case offsetof(struct bpf_sock_ops, skb_hwtstamp): if (size != sizeof(__u64)) return false; break; default: if (size != size_default) return false; break; } } return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); } static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, mark): return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sk_msg_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; if (size != sizeof(__u64)) return false; break; case offsetof(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; if (size != sizeof(__u64)) return false; break; case offsetof(struct sk_msg_md, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct sk_msg_md, local_ip4): case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): case bpf_ctx_range(struct sk_msg_md, remote_port): case bpf_ctx_range(struct sk_msg_md, local_port): case bpf_ctx_range(struct sk_msg_md, size): if (size != sizeof(__u32)) return false; break; default: return false; } return true; } static bool flow_dissector_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct __sk_buff)) return false; if (type == BPF_WRITE) return false; switch (off) { case bpf_ctx_range(struct __sk_buff, data): if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET; return true; case bpf_ctx_range(struct __sk_buff, data_end): if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET_END; return true; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_FLOW_KEYS; return true; default: return false; } } static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, data)); break; case offsetof(struct __sk_buff, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, data_end)); break; case offsetof(struct __sk_buff, flow_keys): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, flow_keys)); break; } return insn - insn_buf; } static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); #else BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); #endif return insn; } static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg, struct bpf_insn *insn) { /* si->dst_reg = skb_shinfo(SKB); */ #ifdef NET_SKBUFF_DATA_USES_OFFSET *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), BPF_REG_AX, skb_reg, offsetof(struct sk_buff, end)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), dst_reg, skb_reg, offsetof(struct sk_buff, head)); *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX); #else *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), dst_reg, skb_reg, offsetof(struct sk_buff, end)); #endif return insn; } static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; #ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, read skb->tstamp as is if tstamp_type_access is true. */ if (!prog->tstamp_type_access) { /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /* check if ingress mask bits is set */ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); *insn++ = BPF_JMP_A(4); *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); *insn++ = BPF_JMP_A(2); /* skb->tc_at_ingress && skb->tstamp_type, * read 0 as the (rcv) timestamp. */ *insn++ = BPF_MOV64_IMM(value_reg, 0); *insn++ = BPF_JMP_A(1); } #endif *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, offsetof(struct sk_buff, tstamp)); return insn; } static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; #ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the * skb->tstamp_type bit also. */ if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /* Writing __sk_buff->tstamp as ingress, goto <clear> */ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); /* goto <store> */ *insn++ = BPF_JMP_A(2); /* <clear>: skb->tstamp_type */ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); } #endif /* <store>: skb->tstamp = tstamp */ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM, skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm); return insn; } #define BPF_EMIT_STORE(size, si, off) \ BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM, \ (si)->dst_reg, (si)->src_reg, (off), (si)->imm) static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, len): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, len, 4, target_size)); break; case offsetof(struct __sk_buff, protocol): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, protocol, 2, target_size)); break; case offsetof(struct __sk_buff, vlan_proto): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_proto, 2, target_size)); break; case offsetof(struct __sk_buff, priority): if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, bpf_target_off(struct sk_buff, priority, 4, target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, priority, 4, target_size)); break; case offsetof(struct __sk_buff, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, skb_iif, 4, target_size)); break; case offsetof(struct __sk_buff, ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct net_device, ifindex, 4, target_size)); break; case offsetof(struct __sk_buff, hash): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, hash, 4, target_size)); break; case offsetof(struct __sk_buff, mark): if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, bpf_target_off(struct sk_buff, mark, 4, target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, mark, 4, target_size)); break; case offsetof(struct __sk_buff, pkt_type): *target_size = 1; *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); #endif break; case offsetof(struct __sk_buff, queue_mapping): if (type == BPF_WRITE) { u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { *insn++ = BPF_JMP_A(0); /* noop */ break; } if (BPF_CLASS(si->code) == BPF_STX) *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); *insn++ = BPF_EMIT_STORE(BPF_H, si, off); } else { *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, queue_mapping, 2, target_size)); } break; case offsetof(struct __sk_buff, vlan_present): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_all, 4, target_size)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1); break; case offsetof(struct __sk_buff, vlan_tci): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); break; case offsetof(struct __sk_buff, cb[0]) ... offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data)) % sizeof(__u64)); prog->cb_access = 1; off = si->off; off -= offsetof(struct __sk_buff, cb[0]); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, tc_classid): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2); off = si->off; off -= offsetof(struct __sk_buff, tc_classid); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, tc_classid); *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_H, si, off); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), si->dst_reg, si->src_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct __sk_buff, data_meta): off = si->off; off -= offsetof(struct __sk_buff, data_meta); off += offsetof(struct sk_buff, cb); off += offsetof(struct bpf_skb_data_end, data_meta); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); off += offsetof(struct sk_buff, cb); off += offsetof(struct bpf_skb_data_end, data_end); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_H, si, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); #else *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, napi_id): #if defined(CONFIG_NET_RX_BUSY_POLL) *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, napi_id, 4, target_size)); *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else *target_size = 4; *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_family, 2, target_size)); break; case offsetof(struct __sk_buff, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_daddr, 4, target_size)); break; case offsetof(struct __sk_buff, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, 4, target_size)); break; case offsetof(struct __sk_buff, remote_ip6[0]) ... offsetof(struct __sk_buff, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct __sk_buff, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, local_ip6[0]) ... offsetof(struct __sk_buff, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct __sk_buff, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_dport, 2, target_size)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct __sk_buff, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_num, 2, target_size)); break; case offsetof(struct __sk_buff, tstamp): BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) insn = bpf_convert_tstamp_write(prog, si, insn); else insn = bpf_convert_tstamp_read(prog, si, insn); break; case offsetof(struct __sk_buff, tstamp_type): insn = bpf_convert_tstamp_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_segs, 2, target_size)); break; case offsetof(struct __sk_buff, gso_size): insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_size, 2, target_size)); break; case offsetof(struct __sk_buff, wire_len): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4); off = si->off; off -= offsetof(struct __sk_buff, wire_len); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, pkt_len); *target_size = 4; *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); break; case offsetof(struct __sk_buff, hwtstamp): BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8); BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0); insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, hwtstamps, 8, target_size)); break; } return insn - insn_buf; } u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_bound_dev_if)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); break; case offsetof(struct bpf_sock, mark): BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_mark)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_mark)); break; case offsetof(struct bpf_sock, priority): BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_priority)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_priority)); break; case offsetof(struct bpf_sock, family): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_family), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_family, sizeof_field(struct sock_common, skc_family), target_size)); break; case offsetof(struct bpf_sock, type): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_type), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_type, sizeof_field(struct sock, sk_type), target_size)); break; case offsetof(struct bpf_sock, protocol): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_protocol), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_protocol, sizeof_field(struct sock, sk_protocol), target_size)); break; case offsetof(struct bpf_sock, src_ip4): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, sizeof_field(struct sock_common, skc_rcv_saddr), target_size)); break; case offsetof(struct bpf_sock, dst_ip4): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_daddr, sizeof_field(struct sock_common, skc_daddr), target_size)); break; case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; off -= offsetof(struct bpf_sock, src_ip6[0]); *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off( struct sock_common, skc_v6_rcv_saddr.s6_addr32[0], sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]), target_size) + off); #else (void)off; *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; off -= offsetof(struct bpf_sock, dst_ip6[0]); *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_v6_daddr.s6_addr32[0], sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]), target_size) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); *target_size = 4; #endif break; case offsetof(struct bpf_sock, src_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_num), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_num, sizeof_field(struct sock_common, skc_num), target_size)); break; case offsetof(struct bpf_sock, dst_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_dport), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_dport, sizeof_field(struct sock_common, skc_dport), target_size)); break; case offsetof(struct bpf_sock, state): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_state), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_state, sizeof_field(struct sock_common, skc_state), target_size)); break; case offsetof(struct bpf_sock, rx_queue_mapping): #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_rx_queue_mapping, sizeof_field(struct sock, sk_rx_queue_mapping), target_size)); *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); #else *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); *target_size = 2; #endif break; } return insn - insn_buf; } static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct net_device, ifindex, 4, target_size)); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); } return insn - insn_buf; } static u32 xdp_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct xdp_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_meta): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_meta)); break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; case offsetof(struct xdp_md, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; case offsetof(struct xdp_md, rx_queue_index): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, queue_index)); break; case offsetof(struct xdp_md, egress_ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, txq)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), si->dst_reg, si->dst_reg, offsetof(struct xdp_txq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; } return insn - insn_buf; } /* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of * context Structure, F is Field in context structure that contains a pointer * to Nested Structure of type NS that has the field NF. * * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make * sure that SIZE is not greater than actual size of S.F.NF. * * If offset OFF is provided, the load happens from that offset relative to * offset of NF. */ #define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ do { \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ si->src_reg, offsetof(S, F)); \ *insn++ = BPF_LDX_MEM( \ SIZE, si->dst_reg, si->dst_reg, \ bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ + OFF); \ } while (0) #define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ BPF_FIELD_SIZEOF(NS, NF), 0) /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. * * In addition it uses Temporary Field TF (member of struct S) as the 3rd * "register" since two registers available in convert_ctx_access are not * enough: we can't override neither SRC, since it contains value to store, nor * DST since it contains pointer to context that may be used by later * instructions. But we need a temporary place to save pointer to nested * structure whose field we want to store to. */ #define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ do { \ int tmp_reg = BPF_REG_9; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ --tmp_reg; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ --tmp_reg; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ offsetof(S, TF)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \ tmp_reg, si->src_reg, \ bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ + OFF, \ si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ offsetof(S, TF)); \ } while (0) #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ TF) \ do { \ if (type == BPF_WRITE) { \ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ OFF, TF); \ } else { \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ S, NS, F, NF, SIZE, OFF); \ } \ } while (0) static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sockaddr, uaddr, sa_family); break; case offsetof(struct bpf_sock_addr, user_ip4): SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, sin_addr, BPF_SIZE(si->code), 0, tmp_reg); break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): off = si->off; off -= offsetof(struct bpf_sock_addr, user_ip6[0]); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; case offsetof(struct bpf_sock_addr, user_port): /* To get port we need to know sa_family first and then treat * sockaddr as either sockaddr_in or sockaddr_in6. * Though we can simplify since port field has same offset and * size in both structures. * Here we check this invariant and use just one of the * structures if it's true. */ BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); /* Account for sin6_port being smaller than user_port. */ port_size = min(port_size, BPF_LDST_BYTES(si)); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_family); break; case offsetof(struct bpf_sock_addr, type): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_type); break; case offsetof(struct bpf_sock_addr, protocol): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_protocol); break; case offsetof(struct bpf_sock_addr, msg_src_ip4): /* Treat t_ctx as struct in_addr for msg_src_ip4. */ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct in_addr, t_ctx, s_addr, BPF_SIZE(si->code), 0, tmp_reg); break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): off = si->off; off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); /* Treat t_ctx as struct in6_addr for msg_src_ip6. */ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct in6_addr, t_ctx, s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; case offsetof(struct bpf_sock_addr, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_addr_kern, sk)); break; } return insn - insn_buf; } static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; /* Helper macro for adding read access to tcp_sock or sock fields. */ #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ fullsock_reg = reg; \ jmp += 2; \ } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ OBJ_FIELD), \ si->dst_reg, si->dst_reg, \ offsetof(OBJ, OBJ_FIELD)); \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_JMP_A(1); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } \ } while (0) #define SOCK_OPS_GET_SK() \ do { \ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ fullsock_reg = reg; \ jmp += 2; \ } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_JMP_A(1); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } \ } while (0) #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) /* Helper macro for adding write access to tcp_sock or sock fields. * The macro is called with two registers, dst_reg which contains a pointer * to ctx (context) and src_reg which contains the value that should be * stored. However, we need an additional register since we cannot overwrite * dst_reg because it may be used later in the program. * Instead we "borrow" one of the other register. We first save its value * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore * it at the end of the macro. */ #define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int reg = BPF_REG_9; \ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \ BPF_MEM | BPF_CLASS(si->code), \ reg, si->src_reg, \ offsetof(OBJ, OBJ_FIELD), \ si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } while (0) #define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ do { \ if (TYPE == BPF_WRITE) \ SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ else \ SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) switch (si->off) { case offsetof(struct bpf_sock_ops, op): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, op), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, op)); break; case offsetof(struct bpf_sock_ops, replylong[0]) ... offsetof(struct bpf_sock_ops, replylong[3]): BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) != sizeof_field(struct bpf_sock_ops_kern, reply)); BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) != sizeof_field(struct bpf_sock_ops_kern, replylong)); off = si->off; off -= offsetof(struct bpf_sock_ops, replylong[0]); off += offsetof(struct bpf_sock_ops_kern, replylong[0]); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, off); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); break; case offsetof(struct bpf_sock_ops, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct bpf_sock_ops, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct bpf_sock_ops, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... offsetof(struct bpf_sock_ops, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, local_ip6[0]) ... offsetof(struct bpf_sock_ops, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct bpf_sock_ops, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; case offsetof(struct bpf_sock_ops, is_fullsock): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, is_fullsock), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, is_fullsock)); break; case offsetof(struct bpf_sock_ops, state): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_state)); break; case offsetof(struct bpf_sock_ops, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct tcp_sock, rtt_min) + sizeof_field(struct minmax_sample, t)); break; case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, sk_txhash): SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; case offsetof(struct bpf_sock_ops, snd_cwnd): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd); break; case offsetof(struct bpf_sock_ops, srtt_us): SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us); break; case offsetof(struct bpf_sock_ops, snd_ssthresh): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh); break; case offsetof(struct bpf_sock_ops, rcv_nxt): SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt); break; case offsetof(struct bpf_sock_ops, snd_nxt): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt); break; case offsetof(struct bpf_sock_ops, snd_una): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una); break; case offsetof(struct bpf_sock_ops, mss_cache): SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache); break; case offsetof(struct bpf_sock_ops, ecn_flags): SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags); break; case offsetof(struct bpf_sock_ops, rate_delivered): SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered); break; case offsetof(struct bpf_sock_ops, rate_interval_us): SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us); break; case offsetof(struct bpf_sock_ops, packets_out): SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out); break; case offsetof(struct bpf_sock_ops, retrans_out): SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out); break; case offsetof(struct bpf_sock_ops, total_retrans): SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans); break; case offsetof(struct bpf_sock_ops, segs_in): SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in); break; case offsetof(struct bpf_sock_ops, data_segs_in): SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in); break; case offsetof(struct bpf_sock_ops, segs_out): SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out); break; case offsetof(struct bpf_sock_ops, data_segs_out): SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out); break; case offsetof(struct bpf_sock_ops, lost_out): SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out); break; case offsetof(struct bpf_sock_ops, sacked_out): SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out); break; case offsetof(struct bpf_sock_ops, bytes_received): SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received); break; case offsetof(struct bpf_sock_ops, bytes_acked): SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); break; case offsetof(struct bpf_sock_ops, sk): SOCK_OPS_GET_SK(); break; case offsetof(struct bpf_sock_ops, skb_data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb_data_end), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb_data_end)); break; case offsetof(struct bpf_sock_ops, skb_data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), si->dst_reg, si->dst_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct bpf_sock_ops, skb_len): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), si->dst_reg, si->dst_reg, offsetof(struct sk_buff, len)); break; case offsetof(struct bpf_sock_ops, skb_tcp_flags): off = offsetof(struct sk_buff, cb); off += offsetof(struct tcp_skb_cb, tcp_flags); *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, tcp_flags), si->dst_reg, si->dst_reg, off); break; case offsetof(struct bpf_sock_ops, skb_hwtstamp): { struct bpf_insn *jmp_on_null_skb; *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); /* Reserve one insn to test skb == NULL */ jmp_on_null_skb = insn++; insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, hwtstamps, 8, target_size)); *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, insn - jmp_on_null_skb - 1); break; } } return insn - insn_buf; } /* data_end = skb->data + skb_headlen() */ static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, struct bpf_insn *insn) { int reg; int temp_reg_off = offsetof(struct sk_buff, cb) + offsetof(struct sk_skb_cb, temp_reg); if (si->src_reg == si->dst_reg) { /* We need an extra register, choose and save a register. */ reg = BPF_REG_9; if (si->src_reg == reg || si->dst_reg == reg) reg--; if (si->src_reg == reg || si->dst_reg == reg) reg--; *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off); } else { reg = si->dst_reg; } /* reg = skb->data */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), reg, si->src_reg, offsetof(struct sk_buff, data)); /* AX = skb->len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, len)); /* reg = skb->data + skb->len */ *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX); /* AX = skb->data_len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, data_len)); /* reg = skb->data + skb->len - skb->data_len */ *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX); if (si->src_reg == si->dst_reg) { /* Restore the saved register */ *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg); *insn++ = BPF_MOV64_REG(si->dst_reg, reg); *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off); } return insn; } static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): insn = bpf_convert_data_end_access(si, insn); break; case offsetof(struct __sk_buff, cb[0]) ... offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct sk_skb_cb, data)) % sizeof(__u64)); prog->cb_access = 1; off = si->off; off -= offsetof(struct __sk_buff, cb[0]); off += offsetof(struct sk_buff, cb); off += offsetof(struct sk_skb_cb, data); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); } return insn - insn_buf; } static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #if IS_ENABLED(CONFIG_IPV6) int off; #endif /* convert ctx uses the fact sg element is first in struct */ BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), si->dst_reg, si->src_reg, offsetof(struct sk_msg, data)); break; case offsetof(struct sk_msg_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end), si->dst_reg, si->src_reg, offsetof(struct sk_msg, data_end)); break; case offsetof(struct sk_msg_md, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct sk_msg_md, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct sk_msg_md, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct sk_msg_md, remote_ip6[0]) ... offsetof(struct sk_msg_md, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct sk_msg_md, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct sk_msg_md, local_ip6[0]) ... offsetof(struct sk_msg_md, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct sk_msg_md, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct sk_msg_md, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct sk_msg_md, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; case offsetof(struct sk_msg_md, size): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; case offsetof(struct sk_msg_md, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); break; } return insn - insn_buf; } const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops sk_filter_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, .btf_struct_access = tc_cls_act_btf_struct_access, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, .gen_prologue = bpf_noop_prologue, .btf_struct_access = xdp_btf_struct_access, }; const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = cg_skb_func_proto, .is_valid_access = cg_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_in_verifier_ops = { .get_func_proto = lwt_in_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_in_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_out_verifier_ops = { .get_func_proto = lwt_out_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_out_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_xmit_verifier_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, }; const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { .get_func_proto = lwt_seg6local_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_seg6local_prog_ops = { }; const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = bpf_sock_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_prog_ops = { }; const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { .get_func_proto = sock_addr_func_proto, .is_valid_access = sock_addr_is_valid_access, .convert_ctx_access = sock_addr_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_addr_prog_ops = { }; const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, .convert_ctx_access = sock_ops_convert_ctx_access, }; const struct bpf_prog_ops sock_ops_prog_ops = { }; const struct bpf_verifier_ops sk_skb_verifier_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = sk_skb_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; const struct bpf_prog_ops sk_skb_prog_ops = { }; const struct bpf_verifier_ops sk_msg_verifier_ops = { .get_func_proto = sk_msg_func_proto, .is_valid_access = sk_msg_is_valid_access, .convert_ctx_access = sk_msg_convert_ctx_access, .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops sk_msg_prog_ops = { }; const struct bpf_verifier_ops flow_dissector_verifier_ops = { .get_func_proto = flow_dissector_func_proto, .is_valid_access = flow_dissector_is_valid_access, .convert_ctx_access = flow_dissector_convert_ctx_access, }; const struct bpf_prog_ops flow_dissector_prog_ops = { .test_run = bpf_prog_test_run_flow_dissector, }; int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; struct sk_filter *filter; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (filter) { RCU_INIT_POINTER(sk->sk_filter, NULL); sk_filter_uncharge(sk, filter); ret = 0; } return ret; } EXPORT_SYMBOL_GPL(sk_detach_filter); int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) { struct sock_fprog_kern *fprog; struct sk_filter *filter; int ret = 0; sockopt_lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (!filter) goto out; /* We're copying the filter that has been originally attached, * so no conversion/decode needed anymore. eBPF programs that * have no original program cannot be dumped through this. */ ret = -EACCES; fprog = filter->prog->orig_prog; if (!fprog) goto out; ret = fprog->len; if (!len) /* User space only enquires number of filter blocks. */ goto out; ret = -EINVAL; if (len < fprog->len) goto out; ret = -EFAULT; if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number * of filter blocks. */ ret = fprog->len; out: sockopt_release_sock(sk); return ret; } #ifdef CONFIG_INET static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { reuse_kern->skb = skb; reuse_kern->sk = sk; reuse_kern->selected_sk = NULL; reuse_kern->migrating_sk = migrating_sk; reuse_kern->data_end = skb->data + skb_headlen(skb); reuse_kern->hash = hash; reuse_kern->reuseport_id = reuse->reuseport_id; reuse_kern->bind_inany = reuse->bind_inany; } struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { struct sk_reuseport_kern reuse_kern; enum sk_action action; bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); action = bpf_prog_run(prog, &reuse_kern); if (action == SK_PASS) return reuse_kern.selected_sk; else return ERR_PTR(-ECONNREFUSED); } BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, struct bpf_map *, map, void *, key, u32, flags) { bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; struct sock_reuseport *reuse; struct sock *selected_sk; int err; selected_sk = map->ops->map_lookup_elem(map, key); if (!selected_sk) return -ENOENT; reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. * * Other maps (e.g. sock_map) do not provide this guarantee and * the sk may never be in the reuseport group to begin with. */ err = is_sockarray ? -ENOENT : -EINVAL; goto error; } if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { struct sock *sk = reuse_kern->sk; if (sk->sk_protocol != selected_sk->sk_protocol) { err = -EPROTOTYPE; } else if (sk->sk_family != selected_sk->sk_family) { err = -EAFNOSUPPORT; } else { /* Catch all. Likely bound to a different sockaddr. */ err = -EBADFD; } goto error; } reuse_kern->selected_sk = selected_sk; return 0; error: /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ if (sk_is_refcounted(selected_sk)) sock_put(selected_sk); return err; } static const struct bpf_func_proto sk_select_reuseport_proto = { .func = sk_select_reuseport, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(sk_reuseport_load_bytes, const struct sk_reuseport_kern *, reuse_kern, u32, offset, void *, to, u32, len) { return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); } static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { .func = sk_reuseport_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_5(sk_reuseport_load_bytes_relative, const struct sk_reuseport_kern *, reuse_kern, u32, offset, void *, to, u32, len, u32, start_header) { return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, len, start_header); } static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { .func = sk_reuseport_load_bytes_relative, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto * sk_reuseport_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sk_select_reuseport: return &sk_select_reuseport_proto; case BPF_FUNC_skb_load_bytes: return &sk_reuseport_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &sk_reuseport_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } } static bool sk_reuseport_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const u32 size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct sk_reuseport_md) || off % size || type != BPF_READ) return false; switch (off) { case offsetof(struct sk_reuseport_md, data): info->reg_type = PTR_TO_PACKET; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, data_end): info->reg_type = PTR_TO_PACKET_END; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, hash): return size == size_default; case offsetof(struct sk_reuseport_md, sk): info->reg_type = PTR_TO_SOCKET; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, migrating_sk): info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; return size == sizeof(__u64); /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < sizeof_field(struct sk_buff, protocol)) return false; fallthrough; case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): case bpf_ctx_range(struct sk_reuseport_md, bind_inany): case bpf_ctx_range(struct sk_reuseport_md, len): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); default: return false; } } #define SK_REUSEPORT_LOAD_FIELD(F) ({ \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ si->dst_reg, si->src_reg, \ bpf_target_off(struct sk_reuseport_kern, F, \ sizeof_field(struct sk_reuseport_kern, F), \ target_size)); \ }) #define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ struct sk_buff, \ skb, \ SKB_FIELD) #define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ struct sock, \ sk, \ SK_FIELD) static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct sk_reuseport_md, data): SK_REUSEPORT_LOAD_SKB_FIELD(data); break; case offsetof(struct sk_reuseport_md, len): SK_REUSEPORT_LOAD_SKB_FIELD(len); break; case offsetof(struct sk_reuseport_md, eth_protocol): SK_REUSEPORT_LOAD_SKB_FIELD(protocol); break; case offsetof(struct sk_reuseport_md, ip_protocol): SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol); break; case offsetof(struct sk_reuseport_md, data_end): SK_REUSEPORT_LOAD_FIELD(data_end); break; case offsetof(struct sk_reuseport_md, hash): SK_REUSEPORT_LOAD_FIELD(hash); break; case offsetof(struct sk_reuseport_md, bind_inany): SK_REUSEPORT_LOAD_FIELD(bind_inany); break; case offsetof(struct sk_reuseport_md, sk): SK_REUSEPORT_LOAD_FIELD(sk); break; case offsetof(struct sk_reuseport_md, migrating_sk): SK_REUSEPORT_LOAD_FIELD(migrating_sk); break; } return insn - insn_buf; } const struct bpf_verifier_ops sk_reuseport_verifier_ops = { .get_func_proto = sk_reuseport_func_proto, .is_valid_access = sk_reuseport_is_valid_access, .convert_ctx_access = sk_reuseport_convert_ctx_access, }; const struct bpf_prog_ops sk_reuseport_prog_ops = { }; DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled); EXPORT_SYMBOL(bpf_sk_lookup_enabled); BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, struct sock *, sk, u64, flags) { if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | BPF_SK_LOOKUP_F_NO_REUSEPORT))) return -EINVAL; if (unlikely(sk && sk_is_refcounted(sk))) return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)) return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */ if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)) return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */ /* Check if socket is suitable for packet L3/L4 protocol */ if (sk && sk->sk_protocol != ctx->protocol) return -EPROTOTYPE; if (sk && sk->sk_family != ctx->family && (sk->sk_family == AF_INET || ipv6_only_sock(sk))) return -EAFNOSUPPORT; if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) return -EEXIST; /* Select socket as lookup result */ ctx->selected_sk = sk; ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; return 0; } static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { .func = bpf_sk_lookup_assign, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, .arg3_type = ARG_ANYTHING, }; static const struct bpf_func_proto * sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_assign: return &bpf_sk_lookup_assign_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static bool sk_lookup_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) return false; if (off % size != 0) return false; if (type != BPF_READ) return false; switch (off) { case offsetof(struct bpf_sk_lookup, sk): info->reg_type = PTR_TO_SOCKET_OR_NULL; return size == sizeof(__u64); case bpf_ctx_range(struct bpf_sk_lookup, family): case bpf_ctx_range(struct bpf_sk_lookup, protocol): case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): case bpf_ctx_range(struct bpf_sk_lookup, local_port): case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): bpf_ctx_record_field_size(info, sizeof(__u32)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); case bpf_ctx_range(struct bpf_sk_lookup, remote_port): /* Allow 4-byte access to 2-byte field for backward compatibility */ if (size == sizeof(__u32)) return true; bpf_ctx_record_field_size(info, sizeof(__be16)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16)); case offsetofend(struct bpf_sk_lookup, remote_port) ... offsetof(struct bpf_sk_lookup, local_ip4) - 1: /* Allow access to zero padding for backward compatibility */ bpf_ctx_record_field_size(info, sizeof(__u16)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16)); default: return false; } } static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_sk_lookup, sk): *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, selected_sk)); break; case offsetof(struct bpf_sk_lookup, family): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, family, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, protocol): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, protocol, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, remote_ip4): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, v4.saddr, 4, target_size)); break; case offsetof(struct bpf_sk_lookup, local_ip4): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, v4.daddr, 4, target_size)); break; case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): { #if IS_ENABLED(CONFIG_IPV6) int off = si->off; off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, v6.saddr)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; } case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): { #if IS_ENABLED(CONFIG_IPV6) int off = si->off; off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, v6.daddr)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; } case offsetof(struct bpf_sk_lookup, remote_port): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, sport, 2, target_size)); break; case offsetofend(struct bpf_sk_lookup, remote_port): *target_size = 2; *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); break; case offsetof(struct bpf_sk_lookup, local_port): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, dport, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, ingress_ifindex, 4, target_size)); break; } return insn - insn_buf; } const struct bpf_prog_ops sk_lookup_prog_ops = { .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { .get_func_proto = sk_lookup_func_proto, .is_valid_access = sk_lookup_is_valid_access, .convert_ctx_access = sk_lookup_convert_ctx_access, }; #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE) #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) BTF_SOCK_TYPE_xxx #undef BTF_SOCK_TYPE BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) { /* tcp6_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct tcp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .func = bpf_skc_to_tcp6_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) { if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { .func = bpf_skc_to_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], }; BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) { /* BTF types for tcp_timewait_sock and inet_timewait_sock are not * generated if CONFIG_INET=n. Trigger an explicit generation here. */ BTF_TYPE_EMIT(struct inet_timewait_sock); BTF_TYPE_EMIT(struct tcp_timewait_sock); #ifdef CONFIG_INET if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) return (unsigned long)sk; #endif #if IS_BUILTIN(CONFIG_IPV6) if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) return (unsigned long)sk; #endif return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { .func = bpf_skc_to_tcp_timewait_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], }; BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) { #ifdef CONFIG_INET if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) return (unsigned long)sk; #endif #if IS_BUILTIN(CONFIG_IPV6) if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) return (unsigned long)sk; #endif return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .func = bpf_skc_to_tcp_request_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) { /* udp6_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct udp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .func = bpf_skc_to_udp6_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk) { /* unix_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct unix_sock); if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { .func = bpf_skc_to_unix_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], }; BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) { BTF_TYPE_EMIT(struct mptcp_sock); return (unsigned long)bpf_mptcp_sock_from_subflow(sk); } const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { .func = bpf_skc_to_mptcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], }; BPF_CALL_1(bpf_sock_from_file, struct file *, file) { return (unsigned long)sock_from_file(file); } BTF_ID_LIST(bpf_sock_from_file_btf_ids) BTF_ID(struct, socket) BTF_ID(struct, file) const struct bpf_func_proto bpf_sock_from_file_proto = { .func = bpf_sock_from_file, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = &bpf_sock_from_file_btf_ids[0], .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], }; static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func; switch (func_id) { case BPF_FUNC_skc_to_tcp6_sock: func = &bpf_skc_to_tcp6_sock_proto; break; case BPF_FUNC_skc_to_tcp_sock: func = &bpf_skc_to_tcp_sock_proto; break; case BPF_FUNC_skc_to_tcp_timewait_sock: func = &bpf_skc_to_tcp_timewait_sock_proto; break; case BPF_FUNC_skc_to_tcp_request_sock: func = &bpf_skc_to_tcp_request_sock_proto; break; case BPF_FUNC_skc_to_udp6_sock: func = &bpf_skc_to_udp6_sock_proto; break; case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; case BPF_FUNC_skc_to_mptcp_sock: func = &bpf_skc_to_mptcp_sock_proto; break; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; return func; } __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct sk_buff *skb = (struct sk_buff *)s; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); return 0; } __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct xdp_buff *xdp = (struct xdp_buff *)x; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); return 0; } __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, const u8 *sun_path, u32 sun_path__sz) { struct sockaddr_un *un; if (sa_kern->sk->sk_family != AF_UNIX) return -EINVAL; /* We do not allow changing the address to unnamed or larger than the * maximum allowed address size for a unix sockaddr. */ if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX) return -EINVAL; un = (struct sockaddr_un *)sa_kern->uaddr; memcpy(un->sun_path, sun_path, sun_path__sz); sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz; return 0; } __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, struct bpf_tcp_req_attrs *attrs, int attrs__sz) { #if IS_ENABLED(CONFIG_SYN_COOKIES) struct sk_buff *skb = (struct sk_buff *)s; const struct request_sock_ops *ops; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct request_sock *req; struct net *net; __u16 min_mss; u32 tsoff = 0; if (attrs__sz != sizeof(*attrs) || attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2]) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EINVAL; net = dev_net(skb->dev); if (net != sock_net(sk)) return -ENETUNREACH; switch (skb->protocol) { case htons(ETH_P_IP): ops = &tcp_request_sock_ops; min_mss = 536; break; #if IS_BUILTIN(CONFIG_IPV6) case htons(ETH_P_IPV6): ops = &tcp6_request_sock_ops; min_mss = IPV6_MIN_MTU - 60; break; #endif default: return -EINVAL; } if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN || sk_is_mptcp(sk)) return -EINVAL; if (attrs->mss < min_mss) return -EINVAL; if (attrs->wscale_ok) { if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) return -EINVAL; if (attrs->snd_wscale > TCP_MAX_WSCALE || attrs->rcv_wscale > TCP_MAX_WSCALE) return -EINVAL; } if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) return -EINVAL; if (attrs->tstamp_ok) { if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) return -EINVAL; tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns()); } req = inet_reqsk_alloc(ops, sk, false); if (!req) return -ENOMEM; ireq = inet_rsk(req); treq = tcp_rsk(req); req->rsk_listener = sk; req->syncookie = 1; req->mss = attrs->mss; req->ts_recent = attrs->rcv_tsval; ireq->snd_wscale = attrs->snd_wscale; ireq->rcv_wscale = attrs->rcv_wscale; ireq->tstamp_ok = !!attrs->tstamp_ok; ireq->sack_ok = !!attrs->sack_ok; ireq->wscale_ok = !!attrs->wscale_ok; ireq->ecn_ok = !!attrs->ecn_ok; treq->req_usec_ts = !!attrs->usec_ts_ok; treq->ts_off = tsoff; skb_orphan(skb); skb->sk = req_to_sk(req); skb->destructor = sock_pfree; return 0; #else return -EOPNOTSUPP; #endif } __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; int err; err = bpf_dynptr_from_skb(skb, flags, ptr__uninit); if (err) return err; bpf_dynptr_set_rdonly(ptr); return 0; } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, }; static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_xdp, }; static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_sock_addr, }; static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_tcp_reqsk, }; static int __init bpf_kfunc_init(void) { int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); } late_initcall(bpf_kfunc_init); __bpf_kfunc_start_defs(); /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. * * The function expects a non-NULL pointer to a socket, and invokes the * protocol specific socket destroy handlers. * * The helper can only be called from BPF contexts that have acquired the socket * locks. * * Parameters: * @sock: Pointer to socket to be destroyed * * Return: * On error, may return EPROTONOSUPPORT, EINVAL. * EPROTONOSUPPORT if protocol specific destroy handler is not supported. * 0 otherwise */ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) { struct sock *sk = (struct sock *)sock; /* The locking semantics that allow for synchronous execution of the * destroy handlers are only supported for TCP and UDP. * Supporting protocols will need to acquire sock lock in the BPF context * prior to invoking this kfunc. */ if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_UDP)) return -EOPNOTSUPP; return sk->sk_prot->diag_destroy(sk, ECONNABORTED); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) { if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) && prog->expected_attach_type != BPF_TRACE_ITER) return -EACCES; return 0; } static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_sk_iter_kfunc_ids, .filter = tracing_iter_filter, }; static int init_subsystem(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set); } late_initcall(init_subsystem); |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/traps.h * * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_TRAP_H #define __ASM_TRAP_H #include <linux/list.h> #include <asm/esr.h> #include <asm/ptrace.h> #include <asm/sections.h> #ifdef CONFIG_ARMV8_DEPRECATED bool try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn); #else static inline bool try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn) { return false; } #endif /* CONFIG_ARMV8_DEPRECATED */ void force_signal_inject(int signal, int code, unsigned long address, unsigned long err); void arm64_notify_segfault(unsigned long addr); void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str); void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey); void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str); void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str); int early_brk64(unsigned long addr, unsigned long esr, struct pt_regs *regs); /* * Move regs->pc to next instruction and do necessary setup before it * is executed. */ void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size); static inline int __in_irqentry_text(unsigned long ptr) { return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } static inline int in_entry_text(unsigned long ptr) { return ptr >= (unsigned long)&__entry_text_start && ptr < (unsigned long)&__entry_text_end; } /* * CPUs with the RAS extensions have an Implementation-Defined-Syndrome bit * to indicate whether this ESR has a RAS encoding. CPUs without this feature * have a ISS-Valid bit in the same position. * If this bit is set, we know its not a RAS SError. * If its clear, we need to know if the CPU supports RAS. Uncategorized RAS * errors share the same encoding as an all-zeros encoding from a CPU that * doesn't support RAS. */ static inline bool arm64_is_ras_serror(unsigned long esr) { WARN_ON(preemptible()); if (esr & ESR_ELx_IDS) return false; if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) return true; else return false; } /* * Return the AET bits from a RAS SError's ESR. * * It is implementation defined whether Uncategorized errors are containable. * We treat them as Uncontainable. * Non-RAS SError's are reported as Uncontained/Uncategorized. */ static inline unsigned long arm64_ras_serror_get_severity(unsigned long esr) { unsigned long aet = esr & ESR_ELx_AET; if (!arm64_is_ras_serror(esr)) { /* Not a RAS error, we can't interpret the ESR. */ return ESR_ELx_AET_UC; } /* * AET is RES0 if 'the value returned in the DFSC field is not * [ESR_ELx_FSC_SERROR]' */ if ((esr & ESR_ELx_FSC) != ESR_ELx_FSC_SERROR) { /* No severity information : Uncategorized */ return ESR_ELx_AET_UC; } return aet; } bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr); void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr); static inline void arm64_mops_reset_regs(struct user_pt_regs *regs, unsigned long esr) { bool wrong_option = esr & ESR_ELx_MOPS_ISS_WRONG_OPTION; bool option_a = esr & ESR_ELx_MOPS_ISS_OPTION_A; int dstreg = ESR_ELx_MOPS_ISS_DESTREG(esr); int srcreg = ESR_ELx_MOPS_ISS_SRCREG(esr); int sizereg = ESR_ELx_MOPS_ISS_SIZEREG(esr); unsigned long dst, src, size; dst = regs->regs[dstreg]; src = regs->regs[srcreg]; size = regs->regs[sizereg]; /* * Put the registers back in the original format suitable for a * prologue instruction, using the generic return routine from the * Arm ARM (DDI 0487I.a) rules CNTMJ and MWFQH. */ if (esr & ESR_ELx_MOPS_ISS_MEM_INST) { /* SET* instruction */ if (option_a ^ wrong_option) { /* Format is from Option A; forward set */ regs->regs[dstreg] = dst + size; regs->regs[sizereg] = -size; } } else { /* CPY* instruction */ if (!(option_a ^ wrong_option)) { /* Format is from Option B */ if (regs->pstate & PSR_N_BIT) { /* Backward copy */ regs->regs[dstreg] = dst - size; regs->regs[srcreg] = src - size; } } else { /* Format is from Option A */ if (size & BIT(63)) { /* Forward copy */ regs->regs[dstreg] = dst + size; regs->regs[srcreg] = src + size; regs->regs[sizereg] = -size; } } } if (esr & ESR_ELx_MOPS_ISS_FROM_EPILOGUE) regs->pc -= 8; else regs->pc -= 4; } #endif |
| 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/netprio_cgroup.c Priority Control Group * * Authors: Neil Horman <nhorman@tuxdriver.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/atomic.h> #include <linux/sched/task.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> #include <net/netprio_cgroup.h> #include <linux/fdtable.h> /* * netprio allocates per-net_device priomap array which is indexed by * css->id. Limiting css ID to 16bits doesn't lose anything. */ #define NETPRIO_ID_MAX USHRT_MAX #define PRIOMAP_MIN_SZ 128 /* * Extend @dev->priomap so that it's large enough to accommodate * @target_idx. @dev->priomap.priomap_len > @target_idx after successful * return. Must be called under rtnl lock. */ static int extend_netdev_table(struct net_device *dev, u32 target_idx) { struct netprio_map *old, *new; size_t new_sz, new_len; /* is the existing priomap large enough? */ old = rtnl_dereference(dev->priomap); if (old && old->priomap_len > target_idx) return 0; /* * Determine the new size. Let's keep it power-of-two. We start * from PRIOMAP_MIN_SZ and double it until it's large enough to * accommodate @target_idx. */ new_sz = PRIOMAP_MIN_SZ; while (true) { new_len = (new_sz - offsetof(struct netprio_map, priomap)) / sizeof(new->priomap[0]); if (new_len > target_idx) break; new_sz *= 2; /* overflowed? */ if (WARN_ON(new_sz < PRIOMAP_MIN_SZ)) return -ENOSPC; } /* allocate & copy */ new = kzalloc(new_sz, GFP_KERNEL); if (!new) return -ENOMEM; if (old) memcpy(new->priomap, old->priomap, old->priomap_len * sizeof(old->priomap[0])); new->priomap_len = new_len; /* install the new priomap */ rcu_assign_pointer(dev->priomap, new); if (old) kfree_rcu(old, rcu); return 0; } /** * netprio_prio - return the effective netprio of a cgroup-net_device pair * @css: css part of the target pair * @dev: net_device part of the target pair * * Should be called under RCU read or rtnl lock. */ static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); int id = css->id; if (map && id < map->priomap_len) return map->priomap[id]; return 0; } /** * netprio_set_prio - set netprio on a cgroup-net_device pair * @css: css part of the target pair * @dev: net_device part of the target pair * @prio: prio to set * * Set netprio to @prio on @css-@dev pair. Should be called under rtnl * lock and may fail under memory pressure for non-zero @prio. */ static int netprio_set_prio(struct cgroup_subsys_state *css, struct net_device *dev, u32 prio) { struct netprio_map *map; int id = css->id; int ret; /* avoid extending priomap for zero writes */ map = rtnl_dereference(dev->priomap); if (!prio && (!map || map->priomap_len <= id)) return 0; ret = extend_netdev_table(dev, id); if (ret) return ret; map = rtnl_dereference(dev->priomap); map->priomap[id] = prio; return 0; } static struct cgroup_subsys_state * cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css; css = kzalloc(sizeof(*css), GFP_KERNEL); if (!css) return ERR_PTR(-ENOMEM); return css; } static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *parent_css = css->parent; struct net_device *dev; int ret = 0; if (css->id > NETPRIO_ID_MAX) return -ENOSPC; if (!parent_css) return 0; rtnl_lock(); /* * Inherit prios from the parent. As all prios are set during * onlining, there is no need to clear them on offline. */ for_each_netdev(&init_net, dev) { u32 prio = netprio_prio(parent_css, dev); ret = netprio_set_prio(css, dev, prio); if (ret) break; } rtnl_unlock(); return ret; } static void cgrp_css_free(struct cgroup_subsys_state *css) { kfree(css); } static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { return css->id; } static int read_priomap(struct seq_file *sf, void *v) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) seq_printf(sf, "%s %u\n", dev->name, netprio_prio(seq_css(sf), dev)); rcu_read_unlock(); return 0; } static ssize_t write_priomap(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; int ret; if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) return -EINVAL; dev = dev_get_by_name(&init_net, devname); if (!dev) return -ENODEV; rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); rtnl_unlock(); dev_put(dev); return ret ?: nbytes; } static int update_netprio(const void *v, struct file *file, unsigned n) { struct socket *sock = sock_from_file(file); if (sock) sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v); return 0; } static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; struct cgroup_subsys_state *css; cgroup_taskset_for_each(p, css, tset) { void *v = (void *)(unsigned long)css->id; task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); } } static struct cftype ss_files[] = { { .name = "prioidx", .read_u64 = read_prioidx, }, { .name = "ifpriomap", .seq_show = read_priomap, .write = write_priomap, }, { } /* terminate */ }; struct cgroup_subsys net_prio_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = net_prio_attach, .legacy_cftypes = ss_files, }; static int netprio_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netprio_map *old; /* * Note this is called with rtnl_lock held so we have update side * protection on our rcu assignments */ switch (event) { case NETDEV_UNREGISTER: old = rtnl_dereference(dev->priomap); RCU_INIT_POINTER(dev->priomap, NULL); if (old) kfree_rcu(old, rcu); break; } return NOTIFY_DONE; } static struct notifier_block netprio_device_notifier = { .notifier_call = netprio_device_event }; static int __init init_cgroup_netprio(void) { register_netdevice_notifier(&netprio_device_notifier); return 0; } subsys_initcall(init_cgroup_netprio); |
| 4 4 4 4 202 203 202 202 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 | // SPDX-License-Identifier: GPL-2.0 /* * Implementation of the multi-level security (MLS) policy. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * Support for enhanced MLS infrastructure. * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. * * Updated: Hewlett-Packard <paul@paul-moore.com> * Added support to import/export the MLS label from NetLabel * Copyright (C) Hewlett-Packard Development Company, L.P., 2006 */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/errno.h> #include <net/netlabel.h> #include "sidtab.h" #include "mls.h" #include "policydb.h" #include "services.h" /* * Return the length in bytes for the MLS fields of the * security context string representation of `context'. */ int mls_compute_context_len(struct policydb *p, struct context *context) { int i, l, len, head, prev; char *nm; struct ebitmap *e; struct ebitmap_node *node; if (!p->mls_enabled) return 0; len = 1; /* for the beginning ":" */ for (l = 0; l < 2; l++) { u32 index_sens = context->range.level[l].sens; len += strlen(sym_name(p, SYM_LEVELS, index_sens - 1)); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if (i - prev > 1) { /* one or more negative bits are skipped */ if (head != prev) { nm = sym_name(p, SYM_CATS, prev); len += strlen(nm) + 1; } nm = sym_name(p, SYM_CATS, i); len += strlen(nm) + 1; head = i; } prev = i; } if (prev != head) { nm = sym_name(p, SYM_CATS, prev); len += strlen(nm) + 1; } if (l == 0) { if (mls_level_eq(&context->range.level[0], &context->range.level[1])) break; else len++; } } return len; } /* * Write the security context string representation of * the MLS fields of `context' into the string `*scontext'. * Update `*scontext' to point to the end of the MLS fields. */ void mls_sid_to_context(struct policydb *p, struct context *context, char **scontext) { char *scontextp, *nm; int i, l, head, prev; struct ebitmap *e; struct ebitmap_node *node; if (!p->mls_enabled) return; scontextp = *scontext; *scontextp = ':'; scontextp++; for (l = 0; l < 2; l++) { strcpy(scontextp, sym_name(p, SYM_LEVELS, context->range.level[l].sens - 1)); scontextp += strlen(scontextp); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if (i - prev > 1) { /* one or more negative bits are skipped */ if (prev != head) { if (prev - head > 1) *scontextp++ = '.'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, prev); strcpy(scontextp, nm); scontextp += strlen(nm); } if (prev < 0) *scontextp++ = ':'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, i); strcpy(scontextp, nm); scontextp += strlen(nm); head = i; } prev = i; } if (prev != head) { if (prev - head > 1) *scontextp++ = '.'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, prev); strcpy(scontextp, nm); scontextp += strlen(nm); } if (l == 0) { if (mls_level_eq(&context->range.level[0], &context->range.level[1])) break; else *scontextp++ = '-'; } } *scontext = scontextp; } int mls_level_isvalid(struct policydb *p, struct mls_level *l) { struct level_datum *levdatum; if (!l->sens || l->sens > p->p_levels.nprim) return 0; levdatum = symtab_search(&p->p_levels, sym_name(p, SYM_LEVELS, l->sens - 1)); if (!levdatum) return 0; /* * Return 1 iff all the bits set in l->cat are also be set in * levdatum->level->cat and no bit in l->cat is larger than * p->p_cats.nprim. */ return ebitmap_contains(&levdatum->level.cat, &l->cat, p->p_cats.nprim); } int mls_range_isvalid(struct policydb *p, struct mls_range *r) { return (mls_level_isvalid(p, &r->level[0]) && mls_level_isvalid(p, &r->level[1]) && mls_level_dom(&r->level[1], &r->level[0])); } /* * Return 1 if the MLS fields in the security context * structure `c' are valid. Return 0 otherwise. */ int mls_context_isvalid(struct policydb *p, struct context *c) { struct user_datum *usrdatum; if (!p->mls_enabled) return 1; if (!mls_range_isvalid(p, &c->range)) return 0; if (c->role == OBJECT_R_VAL) return 1; /* * User must be authorized for the MLS range. */ if (!c->user || c->user > p->p_users.nprim) return 0; usrdatum = p->user_val_to_struct[c->user - 1]; if (!mls_range_contains(usrdatum->range, c->range)) return 0; /* user may not be associated with range */ return 1; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `scontext'. * * This function modifies the string in place, inserting * NULL characters to terminate the MLS fields. * * If a def_sid is provided and no MLS field is present, * copy the MLS field of the associated default context. * Used for upgraded to MLS systems where objects may lack * MLS fields. * * Policy read-lock must be held for sidtab lookup. * */ int mls_context_to_sid(struct policydb *pol, char oldc, char *scontext, struct context *context, struct sidtab *s, u32 def_sid) { char *sensitivity, *cur_cat, *next_cat, *rngptr; struct level_datum *levdatum; struct cat_datum *catdatum, *rngdatum; u32 i; int l, rc; char *rangep[2]; if (!pol->mls_enabled) { /* * With no MLS, only return -EINVAL if there is a MLS field * and it did not come from an xattr. */ if (oldc && def_sid == SECSID_NULL) return -EINVAL; return 0; } /* * No MLS component to the security context, try and map to * default if provided. */ if (!oldc) { struct context *defcon; if (def_sid == SECSID_NULL) return -EINVAL; defcon = sidtab_search(s, def_sid); if (!defcon) return -EINVAL; return mls_context_cpy(context, defcon); } /* * If we're dealing with a range, figure out where the two parts * of the range begin. */ rangep[0] = scontext; rangep[1] = strchr(scontext, '-'); if (rangep[1]) { rangep[1][0] = '\0'; rangep[1]++; } /* For each part of the range: */ for (l = 0; l < 2; l++) { /* Split sensitivity and category set. */ sensitivity = rangep[l]; if (sensitivity == NULL) break; next_cat = strchr(sensitivity, ':'); if (next_cat) *(next_cat++) = '\0'; /* Parse sensitivity. */ levdatum = symtab_search(&pol->p_levels, sensitivity); if (!levdatum) return -EINVAL; context->range.level[l].sens = levdatum->level.sens; /* Extract category set. */ while (next_cat != NULL) { cur_cat = next_cat; next_cat = strchr(next_cat, ','); if (next_cat != NULL) *(next_cat++) = '\0'; /* Separate into range if exists */ rngptr = strchr(cur_cat, '.'); if (rngptr != NULL) { /* Remove '.' */ *rngptr++ = '\0'; } catdatum = symtab_search(&pol->p_cats, cur_cat); if (!catdatum) return -EINVAL; rc = ebitmap_set_bit(&context->range.level[l].cat, catdatum->value - 1, 1); if (rc) return rc; /* If range, set all categories in range */ if (rngptr == NULL) continue; rngdatum = symtab_search(&pol->p_cats, rngptr); if (!rngdatum) return -EINVAL; if (catdatum->value >= rngdatum->value) return -EINVAL; for (i = catdatum->value; i < rngdatum->value; i++) { rc = ebitmap_set_bit( &context->range.level[l].cat, i, 1); if (rc) return rc; } } } /* If we didn't see a '-', the range start is also the range end. */ if (rangep[1] == NULL) { context->range.level[1].sens = context->range.level[0].sens; rc = ebitmap_cpy(&context->range.level[1].cat, &context->range.level[0].cat); if (rc) return rc; } return 0; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `str'. This function will allocate temporary memory with the * given constraints of gfp_mask. */ int mls_from_string(struct policydb *p, char *str, struct context *context, gfp_t gfp_mask) { char *tmpstr; int rc; if (!p->mls_enabled) return -EINVAL; tmpstr = kstrdup(str, gfp_mask); if (!tmpstr) { rc = -ENOMEM; } else { rc = mls_context_to_sid(p, ':', tmpstr, context, NULL, SECSID_NULL); kfree(tmpstr); } return rc; } /* * Copies the MLS range `range' into `context'. */ int mls_range_set(struct context *context, struct mls_range *range) { int l, rc = 0; /* Copy the MLS range into the context */ for (l = 0; l < 2; l++) { context->range.level[l].sens = range->level[l].sens; rc = ebitmap_cpy(&context->range.level[l].cat, &range->level[l].cat); if (rc) break; } return rc; } int mls_setup_user_range(struct policydb *p, struct context *fromcon, struct user_datum *user, struct context *usercon) { if (p->mls_enabled) { struct mls_level *fromcon_sen = &(fromcon->range.level[0]); struct mls_level *fromcon_clr = &(fromcon->range.level[1]); struct mls_level *user_low = &(user->range.level[0]); struct mls_level *user_clr = &(user->range.level[1]); struct mls_level *user_def = &(user->dfltlevel); struct mls_level *usercon_sen = &(usercon->range.level[0]); struct mls_level *usercon_clr = &(usercon->range.level[1]); /* Honor the user's default level if we can */ if (mls_level_between(user_def, fromcon_sen, fromcon_clr)) *usercon_sen = *user_def; else if (mls_level_between(fromcon_sen, user_def, user_clr)) *usercon_sen = *fromcon_sen; else if (mls_level_between(fromcon_clr, user_low, user_def)) *usercon_sen = *user_low; else return -EINVAL; /* Lower the clearance of available contexts if the clearance of "fromcon" is lower than that of the user's default clearance (but only if the "fromcon" clearance dominates the user's computed sensitivity level) */ if (mls_level_dom(user_clr, fromcon_clr)) *usercon_clr = *fromcon_clr; else if (mls_level_dom(fromcon_clr, user_clr)) *usercon_clr = *user_clr; else return -EINVAL; } return 0; } /* * Convert the MLS fields in the security context * structure `oldc' from the values specified in the * policy `oldp' to the values specified in the policy `newp', * storing the resulting context in `newc'. */ int mls_convert_context(struct policydb *oldp, struct policydb *newp, struct context *oldc, struct context *newc) { struct level_datum *levdatum; struct cat_datum *catdatum; struct ebitmap_node *node; u32 i; int l; if (!oldp->mls_enabled || !newp->mls_enabled) return 0; for (l = 0; l < 2; l++) { char *name = sym_name(oldp, SYM_LEVELS, oldc->range.level[l].sens - 1); levdatum = symtab_search(&newp->p_levels, name); if (!levdatum) return -EINVAL; newc->range.level[l].sens = levdatum->level.sens; ebitmap_for_each_positive_bit(&oldc->range.level[l].cat, node, i) { int rc; catdatum = symtab_search(&newp->p_cats, sym_name(oldp, SYM_CATS, i)); if (!catdatum) return -EINVAL; rc = ebitmap_set_bit(&newc->range.level[l].cat, catdatum->value - 1, 1); if (rc) return rc; } } return 0; } int mls_compute_sid(struct policydb *p, struct context *scontext, struct context *tcontext, u16 tclass, u32 specified, struct context *newcontext, bool sock) { struct range_trans rtr; struct mls_range *r; struct class_datum *cladatum; char default_range = 0; if (!p->mls_enabled) return 0; switch (specified) { case AVTAB_TRANSITION: /* Look for a range transition rule. */ rtr.source_type = scontext->type; rtr.target_type = tcontext->type; rtr.target_class = tclass; r = policydb_rangetr_search(p, &rtr); if (r) return mls_range_set(newcontext, r); if (tclass && tclass <= p->p_classes.nprim) { cladatum = p->class_val_to_struct[tclass - 1]; if (cladatum) default_range = cladatum->default_range; } switch (default_range) { case DEFAULT_SOURCE_LOW: return mls_context_cpy_low(newcontext, scontext); case DEFAULT_SOURCE_HIGH: return mls_context_cpy_high(newcontext, scontext); case DEFAULT_SOURCE_LOW_HIGH: return mls_context_cpy(newcontext, scontext); case DEFAULT_TARGET_LOW: return mls_context_cpy_low(newcontext, tcontext); case DEFAULT_TARGET_HIGH: return mls_context_cpy_high(newcontext, tcontext); case DEFAULT_TARGET_LOW_HIGH: return mls_context_cpy(newcontext, tcontext); case DEFAULT_GLBLUB: return mls_context_glblub(newcontext, scontext, tcontext); } fallthrough; case AVTAB_CHANGE: if ((tclass == p->process_class) || sock) /* Use the process MLS attributes. */ return mls_context_cpy(newcontext, scontext); else /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); case AVTAB_MEMBER: /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); } return -EINVAL; } #ifdef CONFIG_NETLABEL /** * mls_export_netlbl_lvl - Export the MLS sensitivity levels to NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context copy the low MLS sensitivity level into the * NetLabel MLS sensitivity level field. * */ void mls_export_netlbl_lvl(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { if (!p->mls_enabled) return; secattr->attr.mls.lvl = context->range.level[0].sens - 1; secattr->flags |= NETLBL_SECATTR_MLS_LVL; } /** * mls_import_netlbl_lvl - Import the NetLabel MLS sensitivity levels * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context and the NetLabel security attributes, copy the * NetLabel MLS sensitivity level into the context. * */ void mls_import_netlbl_lvl(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { if (!p->mls_enabled) return; context->range.level[0].sens = secattr->attr.mls.lvl + 1; context->range.level[1].sens = context->range.level[0].sens; } /** * mls_export_netlbl_cat - Export the MLS categories to NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context copy the low MLS categories into the NetLabel * MLS category field. Returns zero on success, negative values on failure. * */ int mls_export_netlbl_cat(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { int rc; if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_export(&context->range.level[0].cat, &secattr->attr.mls.cat); if (rc == 0 && secattr->attr.mls.cat != NULL) secattr->flags |= NETLBL_SECATTR_MLS_CAT; return rc; } /** * mls_import_netlbl_cat - Import the MLS categories from NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Copy the NetLabel security attributes into the SELinux context; since the * NetLabel security attribute only contains a single MLS category use it for * both the low and high categories of the context. Returns zero on success, * negative values on failure. * */ int mls_import_netlbl_cat(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { int rc; if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_import(&context->range.level[0].cat, secattr->attr.mls.cat); if (rc) goto import_netlbl_cat_failure; memcpy(&context->range.level[1].cat, &context->range.level[0].cat, sizeof(context->range.level[0].cat)); return 0; import_netlbl_cat_failure: ebitmap_destroy(&context->range.level[0].cat); return rc; } #endif /* CONFIG_NETLABEL */ |
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_TASK_H #define _LINUX_SCHED_TASK_H /* * Interface between the scheduler and various task lifetime (fork()/exit()) * functionality: */ #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/uaccess.h> struct task_struct; struct rusage; union thread_union; struct css_set; /* All the bits taken by the old clone syscall. */ #define CLONE_LEGACY_FLAGS 0xffffffffULL struct kernel_clone_args { u64 flags; int __user *pidfd; int __user *child_tid; int __user *parent_tid; const char *name; int exit_signal; u32 kthread:1; u32 io_thread:1; u32 user_worker:1; u32 no_files:1; unsigned long stack; unsigned long stack_size; unsigned long tls; pid_t *set_tid; /* Number of elements in *set_tid */ size_t set_tid_size; int cgroup; int idle; int (*fn)(void *); void *fn_arg; struct cgroup *cgrp; struct css_set *cset; }; /* * This serializes "schedule()" and also protects * the run-queue from deletions/modifications (but * _adding_ to the beginning of the run-queue has * a separate lock). */ extern rwlock_t tasklist_lock; extern spinlock_t mmlist_lock; extern union thread_union init_thread_union; extern struct task_struct init_task; extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); void __noreturn make_task_dead(int signr); extern void mm_cache_init(void); extern void proc_caches_init(void); extern void fork_init(void); extern void release_task(struct task_struct * p); extern int copy_thread(struct task_struct *, const struct kernel_clone_args *); extern void flush_thread(void); #ifdef CONFIG_HAVE_EXIT_THREAD extern void exit_thread(struct task_struct *tsk); #else static inline void exit_thread(struct task_struct *tsk) { } #endif extern __noreturn void do_group_exit(int); extern void exit_files(struct task_struct *); extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *copy_process(struct pid *pid, int trace, int node, struct kernel_clone_args *args); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags); extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); extern void free_task(struct task_struct *tsk); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP extern void sched_exec(void); #else #define sched_exec() {} #endif static inline struct task_struct *get_task_struct(struct task_struct *t) { refcount_inc(&t->usage); return t; } static inline struct task_struct *tryget_task_struct(struct task_struct *t) { return refcount_inc_not_zero(&t->usage) ? t : NULL; } extern void __put_task_struct(struct task_struct *t); extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); static inline void put_task_struct(struct task_struct *t) { if (!refcount_dec_and_test(&t->usage)) return; /* * In !RT, it is always safe to call __put_task_struct(). * Under RT, we can only call it in preemptible context. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP); lock_map_acquire_try(&put_task_map); __put_task_struct(t); lock_map_release(&put_task_map); return; } /* * under PREEMPT_RT, we can't call put_task_struct * in atomic context because it will indirectly * acquire sleeping locks. * * call_rcu() will schedule delayed_put_task_struct_rcu() * to be called in process context. * * __put_task_struct() is called when * refcount_dec_and_test(&t->usage) succeeds. * * This means that it can't "conflict" with * put_task_struct_rcu_user() which abuses ->rcu the same * way; rcu_users has a reference so task->usage can't be * zero after rcu_users 1 -> 0 transition. * * delayed_free_task() also uses ->rcu, but it is only called * when it fails to fork a process. Therefore, there is no * way it can conflict with put_task_struct(). */ call_rcu(&t->rcu, __put_task_struct_rcu_cb); } DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T)) static inline void put_task_struct_many(struct task_struct *t, int nr) { if (refcount_sub_and_test(nr, &t->usage)) __put_task_struct(t); } void put_task_struct_rcu_user(struct task_struct *task); /* Free all architecture-specific resources held by a thread. */ void release_thread(struct task_struct *dead_task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else # define arch_task_struct_size (sizeof(struct task_struct)) #endif #ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST /* * If an architecture has not declared a thread_struct whitelist we * must assume something there may need to be copied to userspace. */ static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = 0; /* Handle dynamically sized thread_struct. */ *size = arch_task_struct_size - offsetof(struct task_struct, thread); } #endif #ifdef CONFIG_VMAP_STACK static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return t->stack_vm_area; } #else static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return NULL; } #endif /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), * neither inside nor outside. */ static inline void task_lock(struct task_struct *p) { spin_lock(&p->alloc_lock); } static inline void task_unlock(struct task_struct *p) { spin_unlock(&p->alloc_lock); } DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T)) #endif /* _LINUX_SCHED_TASK_H */ |
| 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 47 48 48 48 47 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 47 48 47 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 | // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/signal.c * * Copyright (C) 1995-2009 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/cache.h> #include <linux/compat.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/freezer.h> #include <linux/stddef.h> #include <linux/uaccess.h> #include <linux/sizes.h> #include <linux/string.h> #include <linux/ratelimit.h> #include <linux/rseq.h> #include <linux/syscalls.h> #include <linux/pkeys.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/elf.h> #include <asm/exception.h> #include <asm/cacheflush.h> #include <asm/gcs.h> #include <asm/ucontext.h> #include <asm/unistd.h> #include <asm/fpsimd.h> #include <asm/ptrace.h> #include <asm/syscall.h> #include <asm/signal32.h> #include <asm/traps.h> #include <asm/vdso.h> #define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. */ struct rt_sigframe { struct siginfo info; struct ucontext uc; }; struct rt_sigframe_user_layout { struct rt_sigframe __user *sigframe; struct frame_record __user *next_frame; unsigned long size; /* size of allocated sigframe data */ unsigned long limit; /* largest allowed size */ unsigned long fpsimd_offset; unsigned long esr_offset; unsigned long gcs_offset; unsigned long sve_offset; unsigned long tpidr2_offset; unsigned long za_offset; unsigned long zt_offset; unsigned long fpmr_offset; unsigned long poe_offset; unsigned long extra_offset; unsigned long end_offset; }; /* * Holds any EL0-controlled state that influences unprivileged memory accesses. * This includes both accesses done in userspace and uaccess done in the kernel. * * This state needs to be carefully managed to ensure that it doesn't cause * uaccess to fail when setting up the signal frame, and the signal handler * itself also expects a well-defined state when entered. */ struct user_access_state { u64 por_el0; }; #define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16) #define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16) /* * Save the user access state into ua_state and reset it to disable any * restrictions. */ static void save_reset_user_access_state(struct user_access_state *ua_state) { if (system_supports_poe()) { u64 por_enable_all = 0; for (int pkey = 0; pkey < arch_max_pkey(); pkey++) por_enable_all |= POE_RXW << (pkey * POR_BITS_PER_PKEY); ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0); write_sysreg_s(por_enable_all, SYS_POR_EL0); /* Ensure that any subsequent uaccess observes the updated value */ isb(); } } /* * Set the user access state for invoking the signal handler. * * No uaccess should be done after that function is called. */ static void set_handler_user_access_state(void) { if (system_supports_poe()) write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); } /* * Restore the user access state to the values saved in ua_state. * * No uaccess should be done after that function is called. */ static void restore_user_access_state(const struct user_access_state *ua_state) { if (system_supports_poe()) write_sysreg_s(ua_state->por_el0, SYS_POR_EL0); } static void init_user_layout(struct rt_sigframe_user_layout *user) { const size_t reserved_size = sizeof(user->sigframe->uc.uc_mcontext.__reserved); memset(user, 0, sizeof(*user)); user->size = offsetof(struct rt_sigframe, uc.uc_mcontext.__reserved); user->limit = user->size + reserved_size; user->limit -= TERMINATOR_SIZE; user->limit -= EXTRA_CONTEXT_SIZE; /* Reserve space for extension and terminator ^ */ } static size_t sigframe_size(struct rt_sigframe_user_layout const *user) { return round_up(max(user->size, sizeof(struct rt_sigframe)), 16); } /* * Sanity limit on the approximate maximum size of signal frame we'll * try to generate. Stack alignment padding and the frame record are * not taken into account. This limit is not a guarantee and is * NOT ABI. */ #define SIGFRAME_MAXSZ SZ_256K static int __sigframe_alloc(struct rt_sigframe_user_layout *user, unsigned long *offset, size_t size, bool extend) { size_t padded_size = round_up(size, 16); if (padded_size > user->limit - user->size && !user->extra_offset && extend) { int ret; user->limit += EXTRA_CONTEXT_SIZE; ret = __sigframe_alloc(user, &user->extra_offset, sizeof(struct extra_context), false); if (ret) { user->limit -= EXTRA_CONTEXT_SIZE; return ret; } /* Reserve space for the __reserved[] terminator */ user->size += TERMINATOR_SIZE; /* * Allow expansion up to SIGFRAME_MAXSZ, ensuring space for * the terminator: */ user->limit = SIGFRAME_MAXSZ - TERMINATOR_SIZE; } /* Still not enough space? Bad luck! */ if (padded_size > user->limit - user->size) return -ENOMEM; *offset = user->size; user->size += padded_size; return 0; } /* * Allocate space for an optional record of <size> bytes in the user * signal frame. The offset from the signal frame base address to the * allocated block is assigned to *offset. */ static int sigframe_alloc(struct rt_sigframe_user_layout *user, unsigned long *offset, size_t size) { return __sigframe_alloc(user, offset, size, true); } /* Allocate the null terminator record and prevent further allocations */ static int sigframe_alloc_end(struct rt_sigframe_user_layout *user) { int ret; /* Un-reserve the space reserved for the terminator: */ user->limit += TERMINATOR_SIZE; ret = sigframe_alloc(user, &user->end_offset, sizeof(struct _aarch64_ctx)); if (ret) return ret; /* Prevent further allocation: */ user->limit = user->size; return 0; } static void __user *apply_user_offset( struct rt_sigframe_user_layout const *user, unsigned long offset) { char __user *base = (char __user *)user->sigframe; return base + offset; } struct user_ctxs { struct fpsimd_context __user *fpsimd; u32 fpsimd_size; struct sve_context __user *sve; u32 sve_size; struct tpidr2_context __user *tpidr2; u32 tpidr2_size; struct za_context __user *za; u32 za_size; struct zt_context __user *zt; u32 zt_size; struct fpmr_context __user *fpmr; u32 fpmr_size; struct poe_context __user *poe; u32 poe_size; struct gcs_context __user *gcs; u32 gcs_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) { struct user_fpsimd_state const *fpsimd = ¤t->thread.uw.fpsimd_state; int err; /* copy the FP and status/control registers */ err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); __put_user_error(fpsimd->fpsr, &ctx->fpsr, err); __put_user_error(fpsimd->fpcr, &ctx->fpcr, err); /* copy the magic/size information */ __put_user_error(FPSIMD_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(struct fpsimd_context), &ctx->head.size, err); return err ? -EFAULT : 0; } static int restore_fpsimd_context(struct user_ctxs *user) { struct user_fpsimd_state fpsimd; int err = 0; /* check the size information */ if (user->fpsimd_size != sizeof(struct fpsimd_context)) return -EINVAL; /* copy the FP and status/control registers */ err = __copy_from_user(fpsimd.vregs, &(user->fpsimd->vregs), sizeof(fpsimd.vregs)); __get_user_error(fpsimd.fpsr, &(user->fpsimd->fpsr), err); __get_user_error(fpsimd.fpcr, &(user->fpsimd->fpcr), err); clear_thread_flag(TIF_SVE); current->thread.fp_type = FP_STATE_FPSIMD; /* load the hardware registers from the fpsimd_state structure */ if (!err) fpsimd_update_current_state(&fpsimd); return err ? -EFAULT : 0; } static int preserve_fpmr_context(struct fpmr_context __user *ctx) { int err = 0; current->thread.uw.fpmr = read_sysreg_s(SYS_FPMR); __put_user_error(FPMR_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err); return err; } static int restore_fpmr_context(struct user_ctxs *user) { u64 fpmr; int err = 0; if (user->fpmr_size != sizeof(*user->fpmr)) return -EINVAL; __get_user_error(fpmr, &user->fpmr->fpmr, err); if (!err) write_sysreg_s(fpmr, SYS_FPMR); return err; } static int preserve_poe_context(struct poe_context __user *ctx, const struct user_access_state *ua_state) { int err = 0; __put_user_error(POE_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(ua_state->por_el0, &ctx->por_el0, err); return err; } static int restore_poe_context(struct user_ctxs *user, struct user_access_state *ua_state) { u64 por_el0; int err = 0; if (user->poe_size != sizeof(*user->poe)) return -EINVAL; __get_user_error(por_el0, &(user->poe->por_el0), err); if (!err) ua_state->por_el0 = por_el0; return err; } #ifdef CONFIG_ARM64_SVE static int preserve_sve_context(struct sve_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; u16 flags = 0; unsigned int vl = task_get_sve_vl(current); unsigned int vq = 0; if (thread_sm_enabled(¤t->thread)) { vl = task_get_sme_vl(current); vq = sve_vq_from_vl(vl); flags |= SVE_SIG_FLAG_SM; } else if (current->thread.fp_type == FP_STATE_SVE) { vq = sve_vq_from_vl(vl); } memset(reserved, 0, sizeof(reserved)); __put_user_error(SVE_MAGIC, &ctx->head.magic, err); __put_user_error(round_up(SVE_SIG_CONTEXT_SIZE(vq), 16), &ctx->head.size, err); __put_user_error(vl, &ctx->vl, err); __put_user_error(flags, &ctx->flags, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { /* * This assumes that the SVE state has already been saved to * the task struct by calling the function * fpsimd_signal_preserve_current_state(). */ err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET, current->thread.sve_state, SVE_SIG_REGS_SIZE(vq)); } return err ? -EFAULT : 0; } static int restore_sve_fpsimd_context(struct user_ctxs *user) { int err = 0; unsigned int vl, vq; struct user_fpsimd_state fpsimd; u16 user_vl, flags; if (user->sve_size < sizeof(*user->sve)) return -EINVAL; __get_user_error(user_vl, &(user->sve->vl), err); __get_user_error(flags, &(user->sve->flags), err); if (err) return err; if (flags & SVE_SIG_FLAG_SM) { if (!system_supports_sme()) return -EINVAL; vl = task_get_sme_vl(current); } else { /* * A SME only system use SVE for streaming mode so can * have a SVE formatted context with a zero VL and no * payload data. */ if (!system_supports_sve() && !system_supports_sme()) return -EINVAL; vl = task_get_sve_vl(current); } if (user_vl != vl) return -EINVAL; if (user->sve_size == sizeof(*user->sve)) { clear_thread_flag(TIF_SVE); current->thread.svcr &= ~SVCR_SM_MASK; current->thread.fp_type = FP_STATE_FPSIMD; goto fpsimd_only; } vq = sve_vq_from_vl(vl); if (user->sve_size < SVE_SIG_CONTEXT_SIZE(vq)) return -EINVAL; /* * Careful: we are about __copy_from_user() directly into * thread.sve_state with preemption enabled, so protection is * needed to prevent a racing context switch from writing stale * registers back over the new data. */ fpsimd_flush_task_state(current); /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ sve_alloc(current, true); if (!current->thread.sve_state) { clear_thread_flag(TIF_SVE); return -ENOMEM; } err = __copy_from_user(current->thread.sve_state, (char __user const *)user->sve + SVE_SIG_REGS_OFFSET, SVE_SIG_REGS_SIZE(vq)); if (err) return -EFAULT; if (flags & SVE_SIG_FLAG_SM) current->thread.svcr |= SVCR_SM_MASK; else set_thread_flag(TIF_SVE); current->thread.fp_type = FP_STATE_SVE; fpsimd_only: /* copy the FP and status/control registers */ /* restore_sigframe() already checked that user->fpsimd != NULL. */ err = __copy_from_user(fpsimd.vregs, user->fpsimd->vregs, sizeof(fpsimd.vregs)); __get_user_error(fpsimd.fpsr, &user->fpsimd->fpsr, err); __get_user_error(fpsimd.fpcr, &user->fpsimd->fpcr, err); /* load the hardware registers from the fpsimd_state structure */ if (!err) fpsimd_update_current_state(&fpsimd); return err ? -EFAULT : 0; } #else /* ! CONFIG_ARM64_SVE */ static int restore_sve_fpsimd_context(struct user_ctxs *user) { WARN_ON_ONCE(1); return -EINVAL; } /* Turn any non-optimised out attempts to use this into a link error: */ extern int preserve_sve_context(void __user *ctx); #endif /* ! CONFIG_ARM64_SVE */ #ifdef CONFIG_ARM64_SME static int preserve_tpidr2_context(struct tpidr2_context __user *ctx) { int err = 0; current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); __put_user_error(TPIDR2_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(current->thread.tpidr2_el0, &ctx->tpidr2, err); return err; } static int restore_tpidr2_context(struct user_ctxs *user) { u64 tpidr2_el0; int err = 0; if (user->tpidr2_size != sizeof(*user->tpidr2)) return -EINVAL; __get_user_error(tpidr2_el0, &user->tpidr2->tpidr2, err); if (!err) write_sysreg_s(tpidr2_el0, SYS_TPIDR2_EL0); return err; } static int preserve_za_context(struct za_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; unsigned int vl = task_get_sme_vl(current); unsigned int vq; if (thread_za_enabled(¤t->thread)) vq = sve_vq_from_vl(vl); else vq = 0; memset(reserved, 0, sizeof(reserved)); __put_user_error(ZA_MAGIC, &ctx->head.magic, err); __put_user_error(round_up(ZA_SIG_CONTEXT_SIZE(vq), 16), &ctx->head.size, err); __put_user_error(vl, &ctx->vl, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { /* * This assumes that the ZA state has already been saved to * the task struct by calling the function * fpsimd_signal_preserve_current_state(). */ err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, current->thread.sme_state, ZA_SIG_REGS_SIZE(vq)); } return err ? -EFAULT : 0; } static int restore_za_context(struct user_ctxs *user) { int err = 0; unsigned int vq; u16 user_vl; if (user->za_size < sizeof(*user->za)) return -EINVAL; __get_user_error(user_vl, &(user->za->vl), err); if (err) return err; if (user_vl != task_get_sme_vl(current)) return -EINVAL; if (user->za_size == sizeof(*user->za)) { current->thread.svcr &= ~SVCR_ZA_MASK; return 0; } vq = sve_vq_from_vl(user_vl); if (user->za_size < ZA_SIG_CONTEXT_SIZE(vq)) return -EINVAL; /* * Careful: we are about __copy_from_user() directly into * thread.sme_state with preemption enabled, so protection is * needed to prevent a racing context switch from writing stale * registers back over the new data. */ fpsimd_flush_task_state(current); /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ sme_alloc(current, true); if (!current->thread.sme_state) { current->thread.svcr &= ~SVCR_ZA_MASK; clear_thread_flag(TIF_SME); return -ENOMEM; } err = __copy_from_user(current->thread.sme_state, (char __user const *)user->za + ZA_SIG_REGS_OFFSET, ZA_SIG_REGS_SIZE(vq)); if (err) return -EFAULT; set_thread_flag(TIF_SME); current->thread.svcr |= SVCR_ZA_MASK; return 0; } static int preserve_zt_context(struct zt_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; if (WARN_ON(!thread_za_enabled(¤t->thread))) return -EINVAL; memset(reserved, 0, sizeof(reserved)); __put_user_error(ZT_MAGIC, &ctx->head.magic, err); __put_user_error(round_up(ZT_SIG_CONTEXT_SIZE(1), 16), &ctx->head.size, err); __put_user_error(1, &ctx->nregs, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); /* * This assumes that the ZT state has already been saved to * the task struct by calling the function * fpsimd_signal_preserve_current_state(). */ err |= __copy_to_user((char __user *)ctx + ZT_SIG_REGS_OFFSET, thread_zt_state(¤t->thread), ZT_SIG_REGS_SIZE(1)); return err ? -EFAULT : 0; } static int restore_zt_context(struct user_ctxs *user) { int err; u16 nregs; /* ZA must be restored first for this check to be valid */ if (!thread_za_enabled(¤t->thread)) return -EINVAL; if (user->zt_size != ZT_SIG_CONTEXT_SIZE(1)) return -EINVAL; if (__copy_from_user(&nregs, &(user->zt->nregs), sizeof(nregs))) return -EFAULT; if (nregs != 1) return -EINVAL; /* * Careful: we are about __copy_from_user() directly into * thread.zt_state with preemption enabled, so protection is * needed to prevent a racing context switch from writing stale * registers back over the new data. */ fpsimd_flush_task_state(current); /* From now, fpsimd_thread_switch() won't touch ZT in thread state */ err = __copy_from_user(thread_zt_state(¤t->thread), (char __user const *)user->zt + ZT_SIG_REGS_OFFSET, ZT_SIG_REGS_SIZE(1)); if (err) return -EFAULT; return 0; } #else /* ! CONFIG_ARM64_SME */ /* Turn any non-optimised out attempts to use these into a link error: */ extern int preserve_tpidr2_context(void __user *ctx); extern int restore_tpidr2_context(struct user_ctxs *user); extern int preserve_za_context(void __user *ctx); extern int restore_za_context(struct user_ctxs *user); extern int preserve_zt_context(void __user *ctx); extern int restore_zt_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SME */ #ifdef CONFIG_ARM64_GCS static int preserve_gcs_context(struct gcs_context __user *ctx) { int err = 0; u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); /* * If GCS is enabled we will add a cap token to the frame, * include it in the GCSPR_EL0 we report to support stack * switching via sigreturn if GCS is enabled. We do not allow * enabling via sigreturn so the token is only relevant for * threads with GCS enabled. */ if (task_gcs_el0_enabled(current)) gcspr -= 8; __put_user_error(GCS_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(gcspr, &ctx->gcspr, err); __put_user_error(0, &ctx->reserved, err); __put_user_error(current->thread.gcs_el0_mode, &ctx->features_enabled, err); return err; } static int restore_gcs_context(struct user_ctxs *user) { u64 gcspr, enabled; int err = 0; if (user->gcs_size != sizeof(*user->gcs)) return -EINVAL; __get_user_error(gcspr, &user->gcs->gcspr, err); __get_user_error(enabled, &user->gcs->features_enabled, err); if (err) return err; /* Don't allow unknown modes */ if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) return -EINVAL; err = gcs_check_locked(current, enabled); if (err != 0) return err; /* Don't allow enabling */ if (!task_gcs_el0_enabled(current) && (enabled & PR_SHADOW_STACK_ENABLE)) return -EINVAL; /* If we are disabling disable everything */ if (!(enabled & PR_SHADOW_STACK_ENABLE)) enabled = 0; current->thread.gcs_el0_mode = enabled; /* * We let userspace set GCSPR_EL0 to anything here, we will * validate later in gcs_restore_signal(). */ write_sysreg_s(gcspr, SYS_GCSPR_EL0); return 0; } #else /* ! CONFIG_ARM64_GCS */ /* Turn any non-optimised out attempts to use these into a link error: */ extern int preserve_gcs_context(void __user *ctx); extern int restore_gcs_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_GCS */ static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) { struct sigcontext __user *const sc = &sf->uc.uc_mcontext; struct _aarch64_ctx __user *head; char __user *base = (char __user *)&sc->__reserved; size_t offset = 0; size_t limit = sizeof(sc->__reserved); bool have_extra_context = false; char const __user *const sfp = (char const __user *)sf; user->fpsimd = NULL; user->sve = NULL; user->tpidr2 = NULL; user->za = NULL; user->zt = NULL; user->fpmr = NULL; user->poe = NULL; user->gcs = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; while (1) { int err = 0; u32 magic, size; char const __user *userp; struct extra_context const __user *extra; u64 extra_datap; u32 extra_size; struct _aarch64_ctx const __user *end; u32 end_magic, end_size; if (limit - offset < sizeof(*head)) goto invalid; if (!IS_ALIGNED(offset, 16)) goto invalid; head = (struct _aarch64_ctx __user *)(base + offset); __get_user_error(magic, &head->magic, err); __get_user_error(size, &head->size, err); if (err) return err; if (limit - offset < size) goto invalid; switch (magic) { case 0: if (size) goto invalid; goto done; case FPSIMD_MAGIC: if (!system_supports_fpsimd()) goto invalid; if (user->fpsimd) goto invalid; user->fpsimd = (struct fpsimd_context __user *)head; user->fpsimd_size = size; break; case ESR_MAGIC: /* ignore */ break; case POE_MAGIC: if (!system_supports_poe()) goto invalid; if (user->poe) goto invalid; user->poe = (struct poe_context __user *)head; user->poe_size = size; break; case SVE_MAGIC: if (!system_supports_sve() && !system_supports_sme()) goto invalid; if (user->sve) goto invalid; user->sve = (struct sve_context __user *)head; user->sve_size = size; break; case TPIDR2_MAGIC: if (!system_supports_tpidr2()) goto invalid; if (user->tpidr2) goto invalid; user->tpidr2 = (struct tpidr2_context __user *)head; user->tpidr2_size = size; break; case ZA_MAGIC: if (!system_supports_sme()) goto invalid; if (user->za) goto invalid; user->za = (struct za_context __user *)head; user->za_size = size; break; case ZT_MAGIC: if (!system_supports_sme2()) goto invalid; if (user->zt) goto invalid; user->zt = (struct zt_context __user *)head; user->zt_size = size; break; case FPMR_MAGIC: if (!system_supports_fpmr()) goto invalid; if (user->fpmr) goto invalid; user->fpmr = (struct fpmr_context __user *)head; user->fpmr_size = size; break; case GCS_MAGIC: if (!system_supports_gcs()) goto invalid; if (user->gcs) goto invalid; user->gcs = (struct gcs_context __user *)head; user->gcs_size = size; break; case EXTRA_MAGIC: if (have_extra_context) goto invalid; if (size < sizeof(*extra)) goto invalid; userp = (char const __user *)head; extra = (struct extra_context const __user *)userp; userp += size; __get_user_error(extra_datap, &extra->datap, err); __get_user_error(extra_size, &extra->size, err); if (err) return err; /* Check for the dummy terminator in __reserved[]: */ if (limit - offset - size < TERMINATOR_SIZE) goto invalid; end = (struct _aarch64_ctx const __user *)userp; userp += TERMINATOR_SIZE; __get_user_error(end_magic, &end->magic, err); __get_user_error(end_size, &end->size, err); if (err) return err; if (end_magic || end_size) goto invalid; /* Prevent looping/repeated parsing of extra_context */ have_extra_context = true; base = (__force void __user *)extra_datap; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; if (!IS_ALIGNED(extra_size, 16)) goto invalid; if (base != userp) goto invalid; /* Reject "unreasonably large" frames: */ if (extra_size > sfp + SIGFRAME_MAXSZ - userp) goto invalid; /* * Ignore trailing terminator in __reserved[] * and start parsing extra data: */ offset = 0; limit = extra_size; if (!access_ok(base, limit)) goto invalid; continue; default: goto invalid; } if (size < sizeof(*head)) goto invalid; if (limit - offset < size) goto invalid; offset += size; } done: return 0; invalid: return -EINVAL; } static int restore_sigframe(struct pt_regs *regs, struct rt_sigframe __user *sf, struct user_access_state *ua_state) { sigset_t set; int i, err; struct user_ctxs user; err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); if (err == 0) set_current_blocked(&set); for (i = 0; i < 31; i++) __get_user_error(regs->regs[i], &sf->uc.uc_mcontext.regs[i], err); __get_user_error(regs->sp, &sf->uc.uc_mcontext.sp, err); __get_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err); __get_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err); /* * Avoid sys_rt_sigreturn() restarting. */ forget_syscall(regs); err |= !valid_user_regs(®s->user_regs, current); if (err == 0) err = parse_user_sigframe(&user, sf); if (err == 0 && system_supports_fpsimd()) { if (!user.fpsimd) return -EINVAL; if (user.sve) err = restore_sve_fpsimd_context(&user); else err = restore_fpsimd_context(&user); } if (err == 0 && system_supports_gcs() && user.gcs) err = restore_gcs_context(&user); if (err == 0 && system_supports_tpidr2() && user.tpidr2) err = restore_tpidr2_context(&user); if (err == 0 && system_supports_fpmr() && user.fpmr) err = restore_fpmr_context(&user); if (err == 0 && system_supports_sme() && user.za) err = restore_za_context(&user); if (err == 0 && system_supports_sme2() && user.zt) err = restore_zt_context(&user); if (err == 0 && system_supports_poe() && user.poe) err = restore_poe_context(&user, ua_state); return err; } #ifdef CONFIG_ARM64_GCS static int gcs_restore_signal(void) { u64 gcspr_el0, cap; int ret; if (!system_supports_gcs()) return 0; if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) return 0; gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); /* * Ensure that any changes to the GCS done via GCS operations * are visible to the normal reads we do to validate the * token. */ gcsb_dsync(); /* * GCSPR_EL0 should be pointing at a capped GCS, read the cap. * We don't enforce that this is in a GCS page, if it is not * then faults will be generated on GCS operations - the main * concern is to protect GCS pages. */ ret = copy_from_user(&cap, (unsigned long __user *)gcspr_el0, sizeof(cap)); if (ret) return -EFAULT; /* * Check that the cap is the actual GCS before replacing it. */ if (cap != GCS_SIGNAL_CAP(gcspr_el0)) return -EINVAL; /* Invalidate the token to prevent reuse */ put_user_gcs(0, (unsigned long __user *)gcspr_el0, &ret); if (ret != 0) return -EFAULT; write_sysreg_s(gcspr_el0 + 8, SYS_GCSPR_EL0); return 0; } #else static int gcs_restore_signal(void) { return 0; } #endif SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; struct user_access_state ua_state; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 128-bit boundary, then 'sp' should * be word aligned here. */ if (regs->sp & 15) goto badframe; frame = (struct rt_sigframe __user *)regs->sp; if (!access_ok(frame, sizeof (*frame))) goto badframe; if (restore_sigframe(regs, frame, &ua_state)) goto badframe; if (gcs_restore_signal()) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; restore_user_access_state(&ua_state); return regs->regs[0]; badframe: arm64_notify_segfault(regs->sp); return 0; } /* * Determine the layout of optional records in the signal frame * * add_all: if true, lays out the biggest possible signal frame for * this task; otherwise, generates a layout for the current state * of the task. */ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, bool add_all) { int err; if (system_supports_fpsimd()) { err = sigframe_alloc(user, &user->fpsimd_offset, sizeof(struct fpsimd_context)); if (err) return err; } /* fault information, if valid */ if (add_all || current->thread.fault_code) { err = sigframe_alloc(user, &user->esr_offset, sizeof(struct esr_context)); if (err) return err; } #ifdef CONFIG_ARM64_GCS if (system_supports_gcs() && (add_all || current->thread.gcspr_el0)) { err = sigframe_alloc(user, &user->gcs_offset, sizeof(struct gcs_context)); if (err) return err; } #endif if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; if (add_all || current->thread.fp_type == FP_STATE_SVE || thread_sm_enabled(¤t->thread)) { int vl = max(sve_max_vl(), sme_max_vl()); if (!add_all) vl = thread_get_cur_vl(¤t->thread); vq = sve_vq_from_vl(vl); } err = sigframe_alloc(user, &user->sve_offset, SVE_SIG_CONTEXT_SIZE(vq)); if (err) return err; } if (system_supports_tpidr2()) { err = sigframe_alloc(user, &user->tpidr2_offset, sizeof(struct tpidr2_context)); if (err) return err; } if (system_supports_sme()) { unsigned int vl; unsigned int vq = 0; if (add_all) vl = sme_max_vl(); else vl = task_get_sme_vl(current); if (thread_za_enabled(¤t->thread)) vq = sve_vq_from_vl(vl); err = sigframe_alloc(user, &user->za_offset, ZA_SIG_CONTEXT_SIZE(vq)); if (err) return err; } if (system_supports_sme2()) { if (add_all || thread_za_enabled(¤t->thread)) { err = sigframe_alloc(user, &user->zt_offset, ZT_SIG_CONTEXT_SIZE(1)); if (err) return err; } } if (system_supports_fpmr()) { err = sigframe_alloc(user, &user->fpmr_offset, sizeof(struct fpmr_context)); if (err) return err; } if (system_supports_poe()) { err = sigframe_alloc(user, &user->poe_offset, sizeof(struct poe_context)); if (err) return err; } return sigframe_alloc_end(user); } static int setup_sigframe(struct rt_sigframe_user_layout *user, struct pt_regs *regs, sigset_t *set, const struct user_access_state *ua_state) { int i, err = 0; struct rt_sigframe __user *sf = user->sigframe; /* set up the stack frame for unwinding */ __put_user_error(regs->regs[29], &user->next_frame->fp, err); __put_user_error(regs->regs[30], &user->next_frame->lr, err); for (i = 0; i < 31; i++) __put_user_error(regs->regs[i], &sf->uc.uc_mcontext.regs[i], err); __put_user_error(regs->sp, &sf->uc.uc_mcontext.sp, err); __put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err); __put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err); __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err); err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); if (err == 0 && system_supports_fpsimd()) { struct fpsimd_context __user *fpsimd_ctx = apply_user_offset(user, user->fpsimd_offset); err |= preserve_fpsimd_context(fpsimd_ctx); } /* fault information, if valid */ if (err == 0 && user->esr_offset) { struct esr_context __user *esr_ctx = apply_user_offset(user, user->esr_offset); __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err); __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err); __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } if (system_supports_gcs() && err == 0 && user->gcs_offset) { struct gcs_context __user *gcs_ctx = apply_user_offset(user, user->gcs_offset); err |= preserve_gcs_context(gcs_ctx); } /* Scalable Vector Extension state (including streaming), if present */ if ((system_supports_sve() || system_supports_sme()) && err == 0 && user->sve_offset) { struct sve_context __user *sve_ctx = apply_user_offset(user, user->sve_offset); err |= preserve_sve_context(sve_ctx); } /* TPIDR2 if supported */ if (system_supports_tpidr2() && err == 0) { struct tpidr2_context __user *tpidr2_ctx = apply_user_offset(user, user->tpidr2_offset); err |= preserve_tpidr2_context(tpidr2_ctx); } /* FPMR if supported */ if (system_supports_fpmr() && err == 0) { struct fpmr_context __user *fpmr_ctx = apply_user_offset(user, user->fpmr_offset); err |= preserve_fpmr_context(fpmr_ctx); } if (system_supports_poe() && err == 0) { struct poe_context __user *poe_ctx = apply_user_offset(user, user->poe_offset); err |= preserve_poe_context(poe_ctx, ua_state); } /* ZA state if present */ if (system_supports_sme() && err == 0 && user->za_offset) { struct za_context __user *za_ctx = apply_user_offset(user, user->za_offset); err |= preserve_za_context(za_ctx); } /* ZT state if present */ if (system_supports_sme2() && err == 0 && user->zt_offset) { struct zt_context __user *zt_ctx = apply_user_offset(user, user->zt_offset); err |= preserve_zt_context(zt_ctx); } if (err == 0 && user->extra_offset) { char __user *sfp = (char __user *)user->sigframe; char __user *userp = apply_user_offset(user, user->extra_offset); struct extra_context __user *extra; struct _aarch64_ctx __user *end; u64 extra_datap; u32 extra_size; extra = (struct extra_context __user *)userp; userp += EXTRA_CONTEXT_SIZE; end = (struct _aarch64_ctx __user *)userp; userp += TERMINATOR_SIZE; /* * extra_datap is just written to the signal frame. * The value gets cast back to a void __user * * during sigreturn. */ extra_datap = (__force u64)userp; extra_size = sfp + round_up(user->size, 16) - userp; __put_user_error(EXTRA_MAGIC, &extra->head.magic, err); __put_user_error(EXTRA_CONTEXT_SIZE, &extra->head.size, err); __put_user_error(extra_datap, &extra->datap, err); __put_user_error(extra_size, &extra->size, err); /* Add the terminator */ __put_user_error(0, &end->magic, err); __put_user_error(0, &end->size, err); } /* set the "end" magic */ if (err == 0) { struct _aarch64_ctx __user *end = apply_user_offset(user, user->end_offset); __put_user_error(0, &end->magic, err); __put_user_error(0, &end->size, err); } return err; } static int get_sigframe(struct rt_sigframe_user_layout *user, struct ksignal *ksig, struct pt_regs *regs) { unsigned long sp, sp_top; int err; init_user_layout(user); err = setup_sigframe_layout(user, false); if (err) return err; sp = sp_top = sigsp(regs->sp, ksig); sp = round_down(sp - sizeof(struct frame_record), 16); user->next_frame = (struct frame_record __user *)sp; sp = round_down(sp, 16) - sigframe_size(user); user->sigframe = (struct rt_sigframe __user *)sp; /* * Check that we can actually write to the signal frame. */ if (!access_ok(user->sigframe, sp_top - sp)) return -EFAULT; return 0; } #ifdef CONFIG_ARM64_GCS static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) { u64 gcspr_el0; int ret = 0; if (!system_supports_gcs()) return 0; if (!task_gcs_el0_enabled(current)) return 0; /* * We are entering a signal handler, current register state is * active. */ gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); /* * Push a cap and the GCS entry for the trampoline onto the GCS. */ put_user_gcs((unsigned long)sigtramp, (unsigned long __user *)(gcspr_el0 - 16), &ret); put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 8), (unsigned long __user *)(gcspr_el0 - 8), &ret); if (ret != 0) return ret; gcspr_el0 -= 16; write_sysreg_s(gcspr_el0, SYS_GCSPR_EL0); return 0; } #else static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) { return 0; } #endif static int setup_return(struct pt_regs *regs, struct ksignal *ksig, struct rt_sigframe_user_layout *user, int usig) { __sigrestore_t sigtramp; int err; if (ksig->ka.sa.sa_flags & SA_RESTORER) sigtramp = ksig->ka.sa.sa_restorer; else sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); err = gcs_signal_entry(sigtramp, ksig); if (err) return err; /* * We must not fail from this point onwards. We are going to update * registers, including SP, in order to invoke the signal handler. If * we failed and attempted to deliver a nested SIGSEGV to a handler * after that point, the subsequent sigreturn would end up restoring * the (partial) state for the original signal handler. */ regs->regs[0] = usig; if (ksig->ka.sa.sa_flags & SA_SIGINFO) { regs->regs[1] = (unsigned long)&user->sigframe->info; regs->regs[2] = (unsigned long)&user->sigframe->uc; } regs->sp = (unsigned long)user->sigframe; regs->regs[29] = (unsigned long)&user->next_frame->fp; regs->regs[30] = (unsigned long)sigtramp; regs->pc = (unsigned long)ksig->ka.sa.sa_handler; /* * Signal delivery is a (wacky) indirect function call in * userspace, so simulate the same setting of BTYPE as a BLR * <register containing the signal handler entry point>. * Signal delivery to a location in a PROT_BTI guarded page * that is not a function entry point will now trigger a * SIGILL in userspace. * * If the signal handler entry point is not in a PROT_BTI * guarded page, this is harmless. */ if (system_supports_bti()) { regs->pstate &= ~PSR_BTYPE_MASK; regs->pstate |= PSR_BTYPE_C; } /* TCO (Tag Check Override) always cleared for signal handlers */ regs->pstate &= ~PSR_TCO_BIT; /* Signal handlers are invoked with ZA and streaming mode disabled */ if (system_supports_sme()) { /* * If we were in streaming mode the saved register * state was SVE but we will exit SM and use the * FPSIMD register state - flush the saved FPSIMD * register state in case it gets loaded. */ if (current->thread.svcr & SVCR_SM_MASK) { memset(¤t->thread.uw.fpsimd_state, 0, sizeof(current->thread.uw.fpsimd_state)); current->thread.fp_type = FP_STATE_FPSIMD; } current->thread.svcr &= ~(SVCR_ZA_MASK | SVCR_SM_MASK); sme_smstop(); } return 0; } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe_user_layout user; struct rt_sigframe __user *frame; struct user_access_state ua_state; int err = 0; fpsimd_signal_preserve_current_state(); if (get_sigframe(&user, ksig, regs)) return 1; save_reset_user_access_state(&ua_state); frame = user.sigframe; __put_user_error(0, &frame->uc.uc_flags, err); __put_user_error(NULL, &frame->uc.uc_link, err); err |= __save_altstack(&frame->uc.uc_stack, regs->sp); err |= setup_sigframe(&user, regs, set, &ua_state); if (ksig->ka.sa.sa_flags & SA_SIGINFO) err |= copy_siginfo_to_user(&frame->info, &ksig->info); if (err == 0) err = setup_return(regs, ksig, &user, usig); /* * We must not fail if setup_return() succeeded - see comment at the * beginning of setup_return(). */ if (err == 0) set_handler_user_access_state(); else restore_user_access_state(&ua_state); return err; } static void setup_restart_syscall(struct pt_regs *regs) { if (is_compat_task()) compat_setup_restart_syscall(regs); else regs->regs[8] = __NR_restart_syscall; } /* * OK, we're invoking a handler */ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { sigset_t *oldset = sigmask_to_save(); int usig = ksig->sig; int ret; rseq_signal_deliver(ksig, regs); /* * Set up the stack frame */ if (is_compat_task()) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) ret = compat_setup_rt_frame(usig, ksig, oldset, regs); else ret = compat_setup_frame(usig, ksig, oldset, regs); } else { ret = setup_rt_frame(usig, ksig, oldset, regs); } /* * Check that the resulting registers are actually sane. */ ret |= !valid_user_regs(®s->user_regs, current); /* Step into the signal handler if we are stepping */ signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLESTEP)); } /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. * * Note that we go through the signals twice: once to check the signals that * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ void do_signal(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; struct ksignal ksig; bool syscall = in_syscall(regs); /* * If we were from a system call, check for system call restarting... */ if (syscall) { continue_addr = regs->pc; restart_addr = continue_addr - (compat_thumb_mode(regs) ? 2 : 4); retval = regs->regs[0]; /* * Avoid additional syscall restarting via ret_to_user. */ forget_syscall(regs); /* * Prepare for system call restart. We do this here so that a * debugger will see the already changed PC. */ switch (retval) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTART_RESTARTBLOCK: regs->regs[0] = regs->orig_x0; regs->pc = restart_addr; break; } } /* * Get the signal to deliver. When running under ptrace, at this point * the debugger may change all of our registers. */ if (get_signal(&ksig)) { /* * Depending on the signal settings, we may need to revert the * decision to restart the system call, but skip this if a * debugger has chosen to restart at a different PC. */ if (regs->pc == restart_addr && (retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK || (retval == -ERESTARTSYS && !(ksig.ka.sa.sa_flags & SA_RESTART)))) { syscall_set_return_value(current, regs, -EINTR, 0); regs->pc = continue_addr; } handle_signal(&ksig, regs); return; } /* * Handle restarting a different system call. As above, if a debugger * has chosen to restart at a different PC, ignore the restart. */ if (syscall && regs->pc == restart_addr) { if (retval == -ERESTART_RESTARTBLOCK) setup_restart_syscall(regs); user_rewind_single_step(current); } restore_saved_sigmask(); } unsigned long __ro_after_init signal_minsigstksz; /* * Determine the stack space required for guaranteed signal devliery. * This function is used to populate AT_MINSIGSTKSZ at process startup. * cpufeatures setup is assumed to be complete. */ void __init minsigstksz_setup(void) { struct rt_sigframe_user_layout user; init_user_layout(&user); /* * If this fails, SIGFRAME_MAXSZ needs to be enlarged. It won't * be big enough, but it's our best guess: */ if (WARN_ON(setup_sigframe_layout(&user, true))) return; signal_minsigstksz = sigframe_size(&user) + round_up(sizeof(struct frame_record), 16) + 16; /* max alignment padding */ } /* * Compile-time assertions for siginfo_t offsets. Check NSIG* as well, as * changes likely come with new fields that should be added below. */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); static_assert(NSIGSYS == 2); static_assert(sizeof(siginfo_t) == 128); static_assert(__alignof__(siginfo_t) == 8); static_assert(offsetof(siginfo_t, si_signo) == 0x00); static_assert(offsetof(siginfo_t, si_errno) == 0x04); static_assert(offsetof(siginfo_t, si_code) == 0x08); static_assert(offsetof(siginfo_t, si_pid) == 0x10); static_assert(offsetof(siginfo_t, si_uid) == 0x14); static_assert(offsetof(siginfo_t, si_tid) == 0x10); static_assert(offsetof(siginfo_t, si_overrun) == 0x14); static_assert(offsetof(siginfo_t, si_status) == 0x18); static_assert(offsetof(siginfo_t, si_utime) == 0x20); static_assert(offsetof(siginfo_t, si_stime) == 0x28); static_assert(offsetof(siginfo_t, si_value) == 0x18); static_assert(offsetof(siginfo_t, si_int) == 0x18); static_assert(offsetof(siginfo_t, si_ptr) == 0x18); static_assert(offsetof(siginfo_t, si_addr) == 0x10); static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18); static_assert(offsetof(siginfo_t, si_lower) == 0x20); static_assert(offsetof(siginfo_t, si_upper) == 0x28); static_assert(offsetof(siginfo_t, si_pkey) == 0x20); static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); static_assert(offsetof(siginfo_t, si_band) == 0x10); static_assert(offsetof(siginfo_t, si_fd) == 0x18); static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); static_assert(offsetof(siginfo_t, si_syscall) == 0x18); static_assert(offsetof(siginfo_t, si_arch) == 0x1c); |
| 864 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/fault-inject.h> #include <linux/fault-inject-usercopy.h> static struct { struct fault_attr attr; } fail_usercopy = { .attr = FAULT_ATTR_INITIALIZER, }; static int __init setup_fail_usercopy(char *str) { return setup_fault_attr(&fail_usercopy.attr, str); } __setup("fail_usercopy=", setup_fail_usercopy); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_usercopy_debugfs(void) { struct dentry *dir; dir = fault_create_debugfs_attr("fail_usercopy", NULL, &fail_usercopy.attr); if (IS_ERR(dir)) return PTR_ERR(dir); return 0; } late_initcall(fail_usercopy_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ bool should_fail_usercopy(void) { return should_fail(&fail_usercopy.attr, 1); } EXPORT_SYMBOL_GPL(should_fail_usercopy); |
| 483 418 97 481 482 482 483 482 146 147 147 147 147 147 147 17 132 17 17 17 257 257 238 238 18 293 293 47 47 47 147 147 482 147 132 147 147 147 147 147 131 17 147 147 147 416 417 20 417 417 1 482 482 259 258 60 146 60 107 60 146 158 30 147 147 183 2 43 44 44 44 44 183 183 44 218 184 184 184 184 184 32 183 183 183 4 2 2 22 22 1 22 22 22 220 199 30 221 44 44 44 44 44 44 1 44 259 259 8 8 438 440 520 520 438 518 519 137 137 137 2 137 137 137 3 3 20 18 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 | // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1997 Linus Torvalds * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation) */ #include <linux/export.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/mm.h> #include <linux/backing-dev.h> #include <linux/hash.h> #include <linux/swap.h> #include <linux/security.h> #include <linux/cdev.h> #include <linux/memblock.h> #include <linux/fsnotify.h> #include <linux/mount.h> #include <linux/posix_acl.h> #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> #include <linux/list_lru.h> #include <linux/iversion.h> #include <linux/rw_hint.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <trace/events/writeback.h> #define CREATE_TRACE_POINTS #include <trace/events/timestamp.h> #include "internal.h" /* * Inode locking rules: * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget(), inode->i_io_list * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode->i_sb->s_inode_list_lock protects: * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * * Lock ordering: * * inode->i_sb->s_inode_list_lock * inode->i_lock * Inode LRU list locks * * bdi->wb.list_lock * inode->i_lock * * inode_hash_lock * inode->i_sb->s_inode_list_lock * inode->i_lock * * iunique_lock * inode_hash_lock */ static unsigned int i_hash_mask __ro_after_init; static unsigned int i_hash_shift __ro_after_init; static struct hlist_head *inode_hashtable __ro_after_init; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); /* * Empty aops. Can be used for the cases where the user does not * define any of the address_space operations. */ const struct address_space_operations empty_aops = { }; EXPORT_SYMBOL(empty_aops); static DEFINE_PER_CPU(unsigned long, nr_inodes); static DEFINE_PER_CPU(unsigned long, nr_unused); static struct kmem_cache *inode_cachep __ro_after_init; static long get_nr_inodes(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 0 : sum; } static inline long get_nr_inodes_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum; } long get_nr_dirty_inodes(void) { /* not actually dirty inodes, but a wild approximation */ long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; } #ifdef CONFIG_DEBUG_FS static DEFINE_PER_CPU(long, mg_ctime_updates); static DEFINE_PER_CPU(long, mg_fine_stamps); static DEFINE_PER_CPU(long, mg_ctime_swaps); static unsigned long get_mg_ctime_updates(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_ctime_updates, i)); return sum; } static unsigned long get_mg_fine_stamps(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_fine_stamps, i)); return sum; } static unsigned long get_mg_ctime_swaps(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_ctime_swaps, i)); return sum; } #define mgtime_counter_inc(__var) this_cpu_inc(__var) static int mgts_show(struct seq_file *s, void *p) { unsigned long ctime_updates = get_mg_ctime_updates(); unsigned long ctime_swaps = get_mg_ctime_swaps(); unsigned long fine_stamps = get_mg_fine_stamps(); unsigned long floor_swaps = timekeeping_get_mg_floor_swaps(); seq_printf(s, "%lu %lu %lu %lu\n", ctime_updates, ctime_swaps, fine_stamps, floor_swaps); return 0; } DEFINE_SHOW_ATTRIBUTE(mgts); static int __init mg_debugfs_init(void) { debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops); return 0; } late_initcall(mg_debugfs_init); #else /* ! CONFIG_DEBUG_FS */ #define mgtime_counter_inc(__var) do { } while (0) #endif /* CONFIG_DEBUG_FS */ /* * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL /* * Statistics gathering.. */ static struct inodes_stat_t inodes_stat; static int proc_nr_inodes(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table inodes_sysctls[] = { { .procname = "inode-nr", .data = &inodes_stat, .maxlen = 2*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, { .procname = "inode-state", .data = &inodes_stat, .maxlen = 7*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, }; static int __init init_fs_inode_sysctls(void) { register_sysctl_init("fs", inodes_sysctls); return 0; } early_initcall(init_fs_inode_sysctls); #endif static int no_open(struct inode *inode, struct file *file) { return -ENXIO; } /** * inode_init_always_gfp - perform inode structure initialisation * @sb: superblock inode belongs to * @inode: inode to initialise * @gfp: allocation flags * * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. * If there are additional allocations required @gfp is used. */ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp) { static const struct inode_operations empty_iops; static const struct file_operations no_open_fops = {.open = no_open}; struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; inode->i_state = 0; atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; inode->i_ino = 0; inode->__i_nlink = 1; inode->i_opflags = 0; if (sb->s_xattr) inode->i_opflags |= IOP_XATTR; if (sb->s_type->fs_flags & FS_MGTIME) inode->i_opflags |= IOP_MGTIME; i_uid_write(inode, 0); i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; inode->i_write_hint = WRITE_LIFE_NOT_SET; inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; inode->i_pipe = NULL; inode->i_cdev = NULL; inode->i_link = NULL; inode->i_dir_seq = 0; inode->i_rdev = 0; inode->dirtied_when = 0; #ifdef CONFIG_CGROUP_WRITEBACK inode->i_wb_frn_winner = 0; inode->i_wb_frn_avg_time = 0; inode->i_wb_frn_history = 0; #endif spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key); atomic_set(&inode->i_dio_count, 0); mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); #ifdef CONFIG_READ_ONLY_THP_FOR_FS atomic_set(&mapping->nr_thps, 0); #endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->i_private_data = NULL; mapping->writeback_index = 0; init_rwsem(&mapping->invalidate_lock); lockdep_set_class_and_name(&mapping->invalidate_lock, &sb->s_type->invalidate_lock_key, "mapping.invalidate_lock"); if (sb->s_iflags & SB_I_STABLE_WRITES) mapping_set_stable_writes(mapping); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; #endif #ifdef CONFIG_FSNOTIFY inode->i_fsnotify_mask = 0; #endif inode->i_flctx = NULL; if (unlikely(security_inode_alloc(inode, gfp))) return -ENOMEM; this_cpu_inc(nr_inodes); return 0; } EXPORT_SYMBOL(inode_init_always_gfp); void free_inode_nonrcu(struct inode *inode) { kmem_cache_free(inode_cachep, inode); } EXPORT_SYMBOL(free_inode_nonrcu); static void i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); if (inode->free_inode) inode->free_inode(inode); else free_inode_nonrcu(inode); } static struct inode *alloc_inode(struct super_block *sb) { const struct super_operations *ops = sb->s_op; struct inode *inode; if (ops->alloc_inode) inode = ops->alloc_inode(sb); else inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL); if (!inode) return NULL; if (unlikely(inode_init_always(sb, inode))) { if (ops->destroy_inode) { ops->destroy_inode(inode); if (!ops->free_inode) return NULL; } inode->free_inode = ops->free_inode; i_callback(&inode->i_rcu); return NULL; } return inode; } void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); locks_free_lock_context(inode); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); atomic_long_dec(&inode->i_sb->s_remove_count); } #ifdef CONFIG_FS_POSIX_ACL if (inode->i_acl && !is_uncached_acl(inode->i_acl)) posix_acl_release(inode->i_acl); if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl)) posix_acl_release(inode->i_default_acl); #endif this_cpu_dec(nr_inodes); } EXPORT_SYMBOL(__destroy_inode); static void destroy_inode(struct inode *inode) { const struct super_operations *ops = inode->i_sb->s_op; BUG_ON(!list_empty(&inode->i_lru)); __destroy_inode(inode); if (ops->destroy_inode) { ops->destroy_inode(inode); if (!ops->free_inode) return; } inode->free_inode = ops->free_inode; call_rcu(&inode->i_rcu, i_callback); } /** * drop_nlink - directly drop an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. In cases * where we are attempting to track writes to the * filesystem, a decrement to zero means an imminent * write when the file is truncated and actually unlinked * on the filesystem. */ void drop_nlink(struct inode *inode) { WARN_ON(inode->i_nlink == 0); inode->__i_nlink--; if (!inode->i_nlink) atomic_long_inc(&inode->i_sb->s_remove_count); } EXPORT_SYMBOL(drop_nlink); /** * clear_nlink - directly zero an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. See * drop_nlink() for why we care about i_nlink hitting zero. */ void clear_nlink(struct inode *inode) { if (inode->i_nlink) { inode->__i_nlink = 0; atomic_long_inc(&inode->i_sb->s_remove_count); } } EXPORT_SYMBOL(clear_nlink); /** * set_nlink - directly set an inode's link count * @inode: inode * @nlink: new nlink (should be non-zero) * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. */ void set_nlink(struct inode *inode, unsigned int nlink) { if (!nlink) { clear_nlink(inode); } else { /* Yes, some filesystems do change nlink from zero to one */ if (inode->i_nlink == 0) atomic_long_dec(&inode->i_sb->s_remove_count); inode->__i_nlink = nlink; } } EXPORT_SYMBOL(set_nlink); /** * inc_nlink - directly increment an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. Currently, * it is only here for parity with dec_nlink(). */ void inc_nlink(struct inode *inode) { if (unlikely(inode->i_nlink == 0)) { WARN_ON(!(inode->i_state & I_LINKABLE)); atomic_long_dec(&inode->i_sb->s_remove_count); } inode->__i_nlink++; } EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->i_private_list); spin_lock_init(&mapping->i_private_lock); mapping->i_mmap = RB_ROOT_CACHED; } void address_space_init_once(struct address_space *mapping) { memset(mapping, 0, sizeof(*mapping)); __address_space_init_once(mapping); } EXPORT_SYMBOL(address_space_init_once); /* * These are initializations that only need to be done * once, because the fields are idempotent across use * of the inode, so let the slab aware of that. */ void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); INIT_LIST_HEAD(&inode->i_sb_list); __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } EXPORT_SYMBOL(inode_init_once); static void init_once(void *foo) { struct inode *inode = (struct inode *) foo; inode_init_once(inode); } /* * get additional reference to inode; caller must already hold one. */ void ihold(struct inode *inode) { WARN_ON(atomic_inc_return(&inode->i_count) < 2); } EXPORT_SYMBOL(ihold); static void __inode_add_lru(struct inode *inode, bool rotate) { if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; if (atomic_read(&inode->i_count)) return; if (!(inode->i_sb->s_flags & SB_ACTIVE)) return; if (!mapping_shrinkable(&inode->i_data)) return; if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); else if (rotate) inode->i_state |= I_REFERENCED; } struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit) { void *bit_address; bit_address = inode_state_wait_address(inode, bit); init_wait_var_entry(wqe, bit_address, 0); return __var_waitqueue(bit_address); } EXPORT_SYMBOL(inode_bit_waitqueue); /* * Add inode to LRU if needed (inode is unused and clean). * * Needs inode->i_lock held. */ void inode_add_lru(struct inode *inode) { __inode_add_lru(inode, false); } static void inode_lru_list_del(struct inode *inode) { if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } static void inode_pin_lru_isolating(struct inode *inode) { lockdep_assert_held(&inode->i_lock); WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); inode->i_state |= I_LRU_ISOLATING; } static void inode_unpin_lru_isolating(struct inode *inode) { spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); inode->i_state &= ~I_LRU_ISOLATING; /* Called with inode->i_lock which ensures memory ordering. */ inode_wake_up_bit(inode, __I_LRU_ISOLATING); spin_unlock(&inode->i_lock); } static void inode_wait_for_lru_isolating(struct inode *inode) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; lockdep_assert_held(&inode->i_lock); if (!(inode->i_state & I_LRU_ISOLATING)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING); for (;;) { prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* * Checking I_LRU_ISOLATING with inode->i_lock guarantees * memory ordering. */ if (!(inode->i_state & I_LRU_ISOLATING)) break; spin_unlock(&inode->i_lock); schedule(); spin_lock(&inode->i_lock); } finish_wait(wq_head, &wqe.wq_entry); WARN_ON(inode->i_state & I_LRU_ISOLATING); } /** * inode_sb_list_add - add inode to the superblock list of inodes * @inode: inode to add */ void inode_sb_list_add(struct inode *inode) { spin_lock(&inode->i_sb->s_inode_list_lock); list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); spin_unlock(&inode->i_sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { if (!list_empty(&inode->i_sb_list)) { spin_lock(&inode->i_sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); spin_unlock(&inode->i_sb->s_inode_list_lock); } } static unsigned long hash(struct super_block *sb, unsigned long hashval) { unsigned long tmp; tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / L1_CACHE_BYTES; tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); return tmp & i_hash_mask; } /** * __insert_inode_hash - hash an inode * @inode: unhashed inode * @hashval: unsigned long value used to locate this object in the * inode_hashtable. * * Add an inode to the inode hash for this superblock. */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_add_head_rcu(&inode->i_hash, b); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__insert_inode_hash); /** * __remove_inode_hash - remove an inode from the hash * @inode: inode to unhash * * Remove an inode from the superblock. */ void __remove_inode_hash(struct inode *inode) { spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_del_init_rcu(&inode->i_hash); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__remove_inode_hash); void dump_mapping(const struct address_space *mapping) { struct inode *host; const struct address_space_operations *a_ops; struct hlist_node *dentry_first; struct dentry *dentry_ptr; struct dentry dentry; char fname[64] = {}; unsigned long ino; /* * If mapping is an invalid pointer, we don't want to crash * accessing it, so probe everything depending on it carefully. */ if (get_kernel_nofault(host, &mapping->host) || get_kernel_nofault(a_ops, &mapping->a_ops)) { pr_warn("invalid mapping:%px\n", mapping); return; } if (!host) { pr_warn("aops:%ps\n", a_ops); return; } if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || get_kernel_nofault(ino, &host->i_ino)) { pr_warn("aops:%ps invalid inode:%px\n", a_ops, host); return; } if (!dentry_first) { pr_warn("aops:%ps ino:%lx\n", a_ops, ino); return; } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); if (get_kernel_nofault(dentry, dentry_ptr) || !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; } if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0) strscpy(fname, "<invalid>"); /* * Even if strncpy_from_kernel_nofault() succeeded, * the fname could be unreliable */ pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n", a_ops, ino, fname); } void clear_inode(struct inode *inode) { /* * We have to cycle the i_pages lock here because reclaim can be in the * process of removing the last page (in __filemap_remove_folio()) * and we must not free the mapping under it. */ xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); /* * Almost always, mapping_empty(&inode->i_data) here; but there are * two known and long-standing ways in which nodes may get left behind * (when deep radix-tree node allocation failed partway; or when THP * collapse_file() failed). Until those two known cases are cleaned up, * or a cleanup function is called here, do not BUG_ON(!mapping_empty), * nor even WARN_ON(!mapping_empty). */ xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.i_private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } EXPORT_SYMBOL(clear_inode); /* * Free the inode passed in, removing it from the lists it is still connected * to. We remove any pages still attached to the inode and wait for any IO that * is still in progress before finally destroying the inode. * * An inode must already be marked I_FREEING so that we avoid the inode being * moved back onto lists if we race with other code that manipulates the lists * (e.g. writeback_single_inode). The caller is responsible for setting this. * * An inode must already be removed from the LRU list before being evicted from * the cache. This should occur atomically with setting the I_FREEING state * flag, so no inodes here should ever be on the LRU when being evicted. */ static void evict(struct inode *inode) { const struct super_operations *op = inode->i_sb->s_op; BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); if (!list_empty(&inode->i_io_list)) inode_io_list_del(inode); inode_sb_list_del(inode); spin_lock(&inode->i_lock); inode_wait_for_lru_isolating(inode); /* * Wait for flusher thread to be done with the inode so that filesystem * does not start destroying it while writeback is still running. Since * the inode has I_FREEING set, flusher thread won't start new work on * the inode. We just have to wait for running writeback to finish. */ inode_wait_for_writeback(inode); spin_unlock(&inode->i_lock); if (op->evict_inode) { op->evict_inode(inode); } else { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); remove_inode_hash(inode); /* * Wake up waiters in __wait_on_freeing_inode(). * * Lockless hash lookup may end up finding the inode before we removed * it above, but only lock it *after* we are done with the wakeup below. * In this case the potential waiter cannot safely block. * * The inode being unhashed after the call to remove_inode_hash() is * used as an indicator whether blocking on it is safe. */ spin_lock(&inode->i_lock); /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb__after_spinlock(); inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); spin_unlock(&inode->i_lock); destroy_inode(inode); } /* * dispose_list - dispose of the contents of a local list * @head: the head of the list to free * * Dispose-list gets a local list with local inodes in it, so it doesn't * need to worry about list corruption and SMP locks. */ static void dispose_list(struct list_head *head) { while (!list_empty(head)) { struct inode *inode; inode = list_first_entry(head, struct inode, i_lru); list_del_init(&inode->i_lru); evict(inode); cond_resched(); } } /** * evict_inodes - evict all evictable inodes for a superblock * @sb: superblock to operate on * * Make sure that no inodes with zero refcount are retained. This is * called by superblock shutdown after having SB_ACTIVE flag removed, * so any inode reaching zero refcount during or after that call will * be immediately evicted. */ void evict_inodes(struct super_block *sb) { struct inode *inode, *next; LIST_HEAD(dispose); again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; spin_lock(&inode->i_lock); if (atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); continue; } if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; } inode->i_state |= I_FREEING; inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); /* * We can have a ton of inodes to evict at unmount time given * enough memory, check to see if we need to go to sleep for a * bit so we don't livelock. */ if (need_resched()) { spin_unlock(&sb->s_inode_list_lock); cond_resched(); dispose_list(&dispose); goto again; } } spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock * @sb: superblock to operate on * * Attempts to free all inodes (including dirty inodes) for a given superblock. */ void invalidate_inodes(struct super_block *sb) { struct inode *inode, *next; LIST_HEAD(dispose); again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; } if (atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); continue; } inode->i_state |= I_FREEING; inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); if (need_resched()) { spin_unlock(&sb->s_inode_list_lock); cond_resched(); dispose_list(&dispose); goto again; } } spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } /* * Isolate the inode from the LRU in preparation for freeing it. * * If the inode has the I_REFERENCED flag set, then it means that it has been * used recently - the flag is set in iput_final(). When we encounter such an * inode, clear the flag and move it to the back of the LRU so it gets another * pass through the LRU before it gets reclaimed. This is necessary because of * the fact we are doing lazy LRU updates to minimise lock contention so the * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ static enum lru_status inode_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); /* * We are inverting the lru lock/inode->i_lock here, so use a * trylock. If we fail to get the lock, just skip it. */ if (!spin_trylock(&inode->i_lock)) return LRU_SKIP; /* * Inodes can get referenced, redirtied, or repopulated while * they're already on the LRU, and this can make them * unreclaimable for a while. Remove them lazily here; iput, * sync, or the last page cache deletion will requeue them. */ if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } /* Recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; } /* * On highmem systems, mapping_shrinkable() permits dropping * page cache in order to free up struct inodes: lowmem might * be under pressure before the cache inside the highmem zone. */ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { inode_pin_lru_isolating(inode); spin_unlock(&inode->i_lock); spin_unlock(&lru->lock); if (remove_inode_buffers(inode)) { unsigned long reap; reap = invalidate_mapping_pages(&inode->i_data, 0, -1); if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); mm_account_reclaimed_pages(reap); } inode_unpin_lru_isolating(inode); return LRU_RETRY; } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } /* * Walk the superblock inode LRU for freeable inodes and attempt to free them. * This is called from the superblock shrinker function with a number of inodes * to trim from the LRU. Inodes to be freed are moved to a temporary list and * then are freed outside inode_lock by dispose_list(). */ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(freeable); long freed; freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, inode_lru_isolate, &freeable); dispose_list(&freeable); return freed; } static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked); /* * Called with the inode lock held. */ static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data, bool is_inode_hash_locked) { struct inode *inode = NULL; if (is_inode_hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); rcu_read_lock(); repeat: hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb != sb) continue; if (!test(inode, data)) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; } rcu_read_unlock(); return NULL; } /* * find_inode_fast is the fast path version of find_inode, see the comment at * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino, bool is_inode_hash_locked) { struct inode *inode = NULL; if (is_inode_hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); rcu_read_lock(); repeat: hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; } rcu_read_unlock(); return NULL; } /* * Each cpu owns a range of LAST_INO_BATCH numbers. * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, * to renew the exhausted range. * * This does not significantly increase overflow rate because every CPU can * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the * 2^32 range, and is a worst-case. Even a 50% wastage would only increase * overflow rate by 2x, which does not seem too significant. * * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. */ #define LAST_INO_BATCH 1024 static DEFINE_PER_CPU(unsigned int, last_ino); unsigned int get_next_ino(void) { unsigned int *p = &get_cpu_var(last_ino); unsigned int res = *p; #ifdef CONFIG_SMP if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { static atomic_t shared_last_ino; int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); res = next - LAST_INO_BATCH; } #endif res++; /* get_next_ino should not provide a 0 inode number */ if (unlikely(!res)) res++; *p = res; put_cpu_var(last_ino); return res; } EXPORT_SYMBOL(get_next_ino); /** * new_inode_pseudo - obtain an inode * @sb: superblock * * Allocates a new inode for given superblock. * Inode wont be chained in superblock s_inodes list * This means : * - fs can't be unmount * - quotas, fsnotify, writeback can't work */ struct inode *new_inode_pseudo(struct super_block *sb) { return alloc_inode(sb); } /** * new_inode - obtain an inode * @sb: superblock * * Allocates a new inode for given superblock. The default gfp_mask * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. * If HIGHMEM pages are unsuitable or it is known that pages allocated * for the page cache are not reclaimable or migratable, * mapping_set_gfp_mask() must be called with suitable flags on the * newly created inode's mapping * */ struct inode *new_inode(struct super_block *sb) { struct inode *inode; inode = new_inode_pseudo(sb); if (inode) inode_sb_list_add(inode); return inode; } EXPORT_SYMBOL(new_inode); #ifdef CONFIG_DEBUG_LOCK_ALLOC void lockdep_annotate_inode_mutex_key(struct inode *inode) { if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) { /* * ensure nobody is actually holding i_mutex */ // mutex_destroy(&inode->i_mutex); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key); } } } EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); #endif /** * unlock_new_inode - clear the I_NEW state and wake up any waiters * @inode: new inode to unlock * * Called when the inode is fully initialised to clear the new state of the * inode and wake up anyone waiting for the inode to finish initialisation. */ void unlock_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); void discard_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); iput(inode); } EXPORT_SYMBOL(discard_new_inode); /** * lock_two_nondirectories - take two i_mutexes on non-directory objects * * Lock any non-NULL argument. Passed objects must not be directories. * Zero, one or two objects may be locked by this function. * * @inode1: first inode to lock * @inode2: second inode to lock */ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1) WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); if (inode2) WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); if (inode1 > inode2) swap(inode1, inode2); if (inode1) inode_lock(inode1); if (inode2 && inode2 != inode1) inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); /** * unlock_two_nondirectories - release locks from lock_two_nondirectories() * @inode1: first inode to unlock * @inode2: second inode to unlock */ void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1) { WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); inode_unlock(inode1); } if (inode2 && inode2 != inode1) { WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); inode_unlock(inode2); } } EXPORT_SYMBOL(unlock_two_nondirectories); /** * inode_insert5 - obtain an inode from a mounted file system * @inode: pre-allocated inode to use for insert to cache * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. This is a * variant of iget5_locked() that doesn't allocate an inode. * * If the inode is not present in the cache, insert the pre-allocated inode and * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; again: spin_lock(&inode_hash_lock); old = find_inode(inode->i_sb, head, test, data, true); if (unlikely(old)) { /* * Uhhuh, somebody else created the same inode under us. * Use the old inode instead of the preallocated one. */ spin_unlock(&inode_hash_lock); if (IS_ERR(old)) return NULL; wait_on_inode(old); if (unlikely(inode_unhashed(old))) { iput(old); goto again; } return old; } if (set && unlikely(set(inode, data))) { inode = NULL; goto unlock; } /* * Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); /* * Add inode to the sb list if it's not already. It has I_NEW at this * point, so it should be safe to test i_sb_list locklessly. */ if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); unlock: spin_unlock(&inode_hash_lock); return inode; } EXPORT_SYMBOL(inode_insert5); /** * iget5_locked - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. This is a * generalized version of iget_locked() for file systems where the inode * number is not sufficient for unique identification of an inode. * * If the inode is not present in the cache, allocate and insert a new inode * and return it locked, hashed, and with the I_NEW flag set. The file system * gets to fill it in before unlocking it via unlock_new_inode(). * * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct inode *inode = ilookup5(sb, hashval, test, data); if (!inode) { struct inode *new = alloc_inode(sb); if (new) { inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) destroy_inode(new); } } return inode; } EXPORT_SYMBOL(iget5_locked); /** * iget5_locked_rcu - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * This is equivalent to iget5_locked, except the @test callback must * tolerate the inode not being stable, including being mid-teardown. */ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; again: inode = find_inode(sb, head, test, data, false); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } return inode; } new = alloc_inode(sb); if (new) { inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) destroy_inode(new); } return inode; } EXPORT_SYMBOL_GPL(iget5_locked_rcu); /** * iget_locked - obtain an inode from a mounted file system * @sb: super block of file system * @ino: inode number to get * * Search for the inode specified by @ino in the inode cache and if present * return it with an increased reference count. This is for file systems * where the inode number is sufficient for unique identification of an inode. * * If the inode is not in cache, allocate a new inode and return it locked, * hashed, and with the I_NEW flag set. The file system gets to fill it in * before unlocking it via unlock_new_inode(). */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; again: inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } return inode; } inode = alloc_inode(sb); if (inode) { struct inode *old; spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode_fast(sb, head, ino, true); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); inode->i_state = I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); inode_sb_list_add(inode); spin_unlock(&inode_hash_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents */ return inode; } /* * Uhhuh, somebody else created the same inode under * us. Use the old inode instead of the one we just * allocated. */ spin_unlock(&inode_hash_lock); destroy_inode(inode); if (IS_ERR(old)) return NULL; inode = old; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(iget_locked); /* * search the inode cache for a matching inode number. * If we find one, then the inode number we are trying to * allocate is not unique and so we should not use it. * * Returns 1 if the inode number is unique, 0 if it is not. */ static int test_inode_iunique(struct super_block *sb, unsigned long ino) { struct hlist_head *b = inode_hashtable + hash(sb, ino); struct inode *inode; hlist_for_each_entry_rcu(inode, b, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb) return 0; } return 1; } /** * iunique - get a unique inode number * @sb: superblock * @max_reserved: highest reserved inode number * * Obtain an inode number that is unique on the system for a given * superblock. This is used by file systems that have no natural * permanent inode numbering system. An inode number is returned that * is higher than the reserved limit but unique. * * BUGS: * With a large number of inodes live on the file system this function * currently becomes quite slow. */ ino_t iunique(struct super_block *sb, ino_t max_reserved) { /* * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. */ static DEFINE_SPINLOCK(iunique_lock); static unsigned int counter; ino_t res; rcu_read_lock(); spin_lock(&iunique_lock); do { if (counter <= max_reserved) counter = max_reserved + 1; res = counter++; } while (!test_inode_iunique(sb, res)); spin_unlock(&iunique_lock); rcu_read_unlock(); return res; } EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode->i_lock); if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { __iget(inode); spin_unlock(&inode->i_lock); } else { spin_unlock(&inode->i_lock); /* * Handle the case where s_op->clear_inode is not been * called yet, and somebody is calling igrab * while the inode is getting freed. */ inode = NULL; } return inode; } EXPORT_SYMBOL(igrab); /** * ilookup5_nowait - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * * Search for the inode specified by @hashval and @data in the inode cache. * If the inode is in the cache, the inode is returned with an incremented * reference count. * * Note: I_NEW is not waited upon so you have to be very careful what you do * with the returned inode. You probably should be using ilookup5() instead. * * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; spin_lock(&inode_hash_lock); inode = find_inode(sb, head, test, data, true); spin_unlock(&inode_hash_lock); return IS_ERR(inode) ? NULL : inode; } EXPORT_SYMBOL(ilookup5_nowait); /** * ilookup5 - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * * Search for the inode specified by @hashval and @data in the inode cache, * and if the inode is in the cache, return the inode with an incremented * reference count. Waits on I_NEW before returning the inode. * returned with an incremented reference count. * * This is a generalized version of ilookup() for file systems where the * inode number is not sufficient for unique identification of an inode. * * Note: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; again: inode = ilookup5_nowait(sb, hashval, test, data); if (inode) { wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(ilookup5); /** * ilookup - search for an inode in the inode cache * @sb: super block of file system to search * @ino: inode number to search for * * Search for the inode @ino in the inode cache, and if the inode is in the * cache, the inode is returned with an incremented reference count. */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; again: inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(ilookup); /** * find_inode_nowait - find an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @match: callback used for comparisons between inodes * @data: opaque data pointer to pass to @match * * Search for the inode specified by @hashval and @data in the inode * cache, where the helper function @match will return 0 if the inode * does not match, 1 if the inode does match, and -1 if the search * should be stopped. The @match function must be responsible for * taking the i_lock spin_lock and checking i_state for an inode being * freed or being initialized, and incrementing the reference count * before returning 1. It also must not sleep, since it is called with * the inode_hash_lock spinlock held. * * This is a even more generalized version of ilookup5() when the * function must never block --- find_inode() can block in * __wait_on_freeing_inode() --- or when the caller can not increment * the reference count because the resulting iput() might cause an * inode eviction. The tradeoff is that the @match funtion must be * very carefully implemented. */ struct inode *find_inode_nowait(struct super_block *sb, unsigned long hashval, int (*match)(struct inode *, unsigned long, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *ret_inode = NULL; int mval; spin_lock(&inode_hash_lock); hlist_for_each_entry(inode, head, i_hash) { if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); if (mval == 0) continue; if (mval == 1) ret_inode = inode; goto out; } out: spin_unlock(&inode_hash_lock); return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); /** * find_inode_rcu - find an inode in the inode cache * @sb: Super block of file system to search * @hashval: Key to hash * @test: Function to test match on an inode * @data: Data for test function * * Search for the inode specified by @hashval and @data in the inode cache, * where the helper function @test will return 0 if the inode does not match * and 1 if it does. The @test function must be responsible for taking the * i_lock spin_lock and checking i_state for an inode being freed or being * initialized. * * If successful, this will return the inode for which the @test function * returned 1 and NULL otherwise. * * The @test function is not permitted to take a ref on any inode presented. * It is also not permitted to sleep. * * The caller must hold the RCU read lock. */ struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_rcu() usage"); hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) return inode; } return NULL; } EXPORT_SYMBOL(find_inode_rcu); /** * find_inode_by_ino_rcu - Find an inode in the inode cache * @sb: Super block of file system to search * @ino: The inode number to match * * Search for the inode specified by @hashval and @data in the inode cache, * where the helper function @test will return 0 if the inode does not match * and 1 if it does. The @test function must be responsible for taking the * i_lock spin_lock and checking i_state for an inode being freed or being * initialized. * * If successful, this will return the inode for which the @test function * returned 1 and NULL otherwise. * * The @test function is not permitted to take a ref on any inode presented. * It is also not permitted to sleep. * * The caller must hold the RCU read lock. */ struct inode *find_inode_by_ino_rcu(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_by_ino_rcu() usage"); hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) return inode; } return NULL; } EXPORT_SYMBOL(find_inode_by_ino_rcu); int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); while (1) { struct inode *old = NULL; spin_lock(&inode_hash_lock); hlist_for_each_entry(old, head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) continue; spin_lock(&old->i_lock); if (old->i_state & (I_FREEING|I_WILL_FREE)) { spin_unlock(&old->i_lock); continue; } break; } if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW | I_CREATING; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); return 0; } if (unlikely(old->i_state & I_CREATING)) { spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); return -EBUSY; } __iget(old); spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); return -EBUSY; } iput(old); } } EXPORT_SYMBOL(insert_inode_locked); int insert_inode_locked4(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *old; inode->i_state |= I_CREATING; old = inode_insert5(inode, hashval, test, NULL, data); if (old != inode) { iput(old); return -EBUSY; } return 0; } EXPORT_SYMBOL(insert_inode_locked4); int generic_delete_inode(struct inode *inode) { return 1; } EXPORT_SYMBOL(generic_delete_inode); /* * Called when we're dropping the last reference * to an inode. * * Call the FS "drop_inode()" function, defaulting to * the legacy UNIX filesystem behaviour. If it tells * us to evict inode, do so. Otherwise, retain inode * in cache if fs is alive, sync and evict if fs is * shutting down. */ static void iput_final(struct inode *inode) { struct super_block *sb = inode->i_sb; const struct super_operations *op = inode->i_sb->s_op; unsigned long state; int drop; WARN_ON(inode->i_state & I_NEW); if (op->drop_inode) drop = op->drop_inode(inode); else drop = generic_drop_inode(inode); if (!drop && !(inode->i_state & I_DONTCACHE) && (sb->s_flags & SB_ACTIVE)) { __inode_add_lru(inode, true); spin_unlock(&inode->i_lock); return; } state = inode->i_state; if (!drop) { WRITE_ONCE(inode->i_state, state | I_WILL_FREE); spin_unlock(&inode->i_lock); write_inode_now(inode, 1); spin_lock(&inode->i_lock); state = inode->i_state; WARN_ON(state & I_NEW); state &= ~I_WILL_FREE; } WRITE_ONCE(inode->i_state, state | I_FREEING); if (!list_empty(&inode->i_lru)) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); evict(inode); } /** * iput - put an inode * @inode: inode to put * * Puts an inode, dropping its usage count. If the inode use count hits * zero, the inode is then freed and may also be destroyed. * * Consequently, iput() can sleep. */ void iput(struct inode *inode) { if (!inode) return; BUG_ON(inode->i_state & I_CLEAR); retry: if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); trace_writeback_lazytime_iput(inode); mark_inode_dirty_sync(inode); goto retry; } iput_final(inode); } } EXPORT_SYMBOL(iput); #ifdef CONFIG_BLOCK /** * bmap - find a block number in a file * @inode: inode owning the block number being requested * @block: pointer containing the block to find * * Replaces the value in ``*block`` with the block number on the device holding * corresponding to the requested block number in the file. * That is, asked for block 4 of inode 1 the function will replace the * 4 in ``*block``, with disk block relative to the disk start that holds that * block of the file. * * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a * hole, returns 0 and ``*block`` is also set to 0. */ int bmap(struct inode *inode, sector_t *block) { if (!inode->i_mapping->a_ops->bmap) return -EINVAL; *block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block); return 0; } EXPORT_SYMBOL(bmap); #endif /* * With relative atime, only update atime if the previous atime is * earlier than or equal to either the ctime or mtime, * or if at least a day has passed since the last atime update. */ static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { struct timespec64 atime, mtime, ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) return true; /* * Is mtime younger than or equal to atime? If yes, update atime: */ atime = inode_get_atime(inode); mtime = inode_get_mtime(inode); if (timespec64_compare(&mtime, &atime) >= 0) return true; /* * Is ctime younger than or equal to atime? If yes, update atime: */ ctime = inode_get_ctime(inode); if (timespec64_compare(&ctime, &atime) >= 0) return true; /* * Is the previous atime value older than a day? If yes, * update atime: */ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60) return true; /* * Good, we can skip the atime update: */ return false; } /** * inode_update_timestamps - update the timestamps on the inode * @inode: inode to be updated * @flags: S_* flags that needed to be updated * * The update_time function is called when an inode's timestamps need to be * updated for a read or write operation. This function handles updating the * actual timestamps. It's up to the caller to ensure that the inode is marked * dirty appropriately. * * In the case where any of S_MTIME, S_CTIME, or S_VERSION need to be updated, * attempt to update all three of them. S_ATIME updates can be handled * independently of the rest. * * Returns a set of S_* flags indicating which values changed. */ int inode_update_timestamps(struct inode *inode, int flags) { int updated = 0; struct timespec64 now; if (flags & (S_MTIME|S_CTIME|S_VERSION)) { struct timespec64 ctime = inode_get_ctime(inode); struct timespec64 mtime = inode_get_mtime(inode); now = inode_set_ctime_current(inode); if (!timespec64_equal(&now, &ctime)) updated |= S_CTIME; if (!timespec64_equal(&now, &mtime)) { inode_set_mtime_to_ts(inode, now); updated |= S_MTIME; } if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated)) updated |= S_VERSION; } else { now = current_time(inode); } if (flags & S_ATIME) { struct timespec64 atime = inode_get_atime(inode); if (!timespec64_equal(&now, &atime)) { inode_set_atime_to_ts(inode, now); updated |= S_ATIME; } } return updated; } EXPORT_SYMBOL(inode_update_timestamps); /** * generic_update_time - update the timestamps on the inode * @inode: inode to be updated * @flags: S_* flags that needed to be updated * * The update_time function is called when an inode's timestamps need to be * updated for a read or write operation. In the case where any of S_MTIME, S_CTIME, * or S_VERSION need to be updated we attempt to update all three of them. S_ATIME * updates can be handled done independently of the rest. * * Returns a S_* mask indicating which fields were updated. */ int generic_update_time(struct inode *inode, int flags) { int updated = inode_update_timestamps(inode, flags); int dirty_flags = 0; if (updated & (S_ATIME|S_MTIME|S_CTIME)) dirty_flags = inode->i_sb->s_flags & SB_LAZYTIME ? I_DIRTY_TIME : I_DIRTY_SYNC; if (updated & S_VERSION) dirty_flags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, dirty_flags); return updated; } EXPORT_SYMBOL(generic_update_time); /* * This does the actual work of updating an inodes time or version. Must have * had called mnt_want_write() before calling this. */ int inode_update_time(struct inode *inode, int flags) { if (inode->i_op->update_time) return inode->i_op->update_time(inode, flags); generic_update_time(inode, flags); return 0; } EXPORT_SYMBOL(inode_update_time); /** * atime_needs_update - update the access time * @path: the &struct path to update * @inode: inode to update * * Update the accessed time on an inode and mark it for writeback. * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ bool atime_needs_update(const struct path *path, struct inode *inode) { struct vfsmount *mnt = path->mnt; struct timespec64 now, atime; if (inode->i_flags & S_NOATIME) return false; /* Atime updates will likely cause i_uid and i_gid to be written * back improprely if their true value is unknown to the vfs. */ if (HAS_UNMAPPED_ID(mnt_idmap(mnt), inode)) return false; if (IS_NOATIME(inode)) return false; if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; if (mnt->mnt_flags & MNT_NOATIME) return false; if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; now = current_time(inode); if (!relatime_need_update(mnt, inode, now)) return false; atime = inode_get_atime(inode); if (timespec64_equal(&atime, &now)) return false; return true; } void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = d_inode(path->dentry); if (!atime_needs_update(path, inode)) return; if (!sb_start_write_trylock(inode->i_sb)) return; if (mnt_get_write_access(mnt) != 0) goto skip_update; /* * File systems can error out when updating inodes if they need to * allocate new space to modify an inode (such is the case for * Btrfs), but since we touch atime while walking down the path we * really don't care if we failed to update the atime of the file, * so just ignore the return value. * We may also fail on filesystems that have the ability to make parts * of the fs read only, e.g. subvolumes in Btrfs. */ inode_update_time(inode, S_ATIME); mnt_put_write_access(mnt); skip_update: sb_end_write(inode->i_sb); } EXPORT_SYMBOL(touch_atime); /* * Return mask of changes for notify_change() that need to be done as a * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). */ int dentry_needs_remove_privs(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int mask = 0; int ret; if (IS_NOSEC(inode)) return 0; mask = setattr_should_drop_suidgid(idmap, inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; if (ret) mask |= ATTR_KILL_PRIV; return mask; } static int __remove_privs(struct mnt_idmap *idmap, struct dentry *dentry, int kill) { struct iattr newattrs; newattrs.ia_valid = ATTR_FORCE | kill; /* * Note we call this on write, so notify_change will not * encounter any conflicting delegations: */ return notify_change(idmap, dentry, &newattrs, NULL); } int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); int error = 0; int kill; if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(file_mnt_idmap(file), dentry); if (kill < 0) return kill; if (kill) { if (flags & IOCB_NOWAIT) return -EAGAIN; error = __remove_privs(file_mnt_idmap(file), dentry, kill); } if (!error) inode_has_no_xattr(inode); return error; } EXPORT_SYMBOL_GPL(file_remove_privs_flags); /** * file_remove_privs - remove special file privileges (suid, capabilities) * @file: file to remove privileges from * * When file is modified by a write or truncation ensure that special * file privileges are removed. * * Return: 0 on success, negative errno on failure. */ int file_remove_privs(struct file *file) { return file_remove_privs_flags(file, 0); } EXPORT_SYMBOL(file_remove_privs); /** * current_time - Return FS time (possibly fine-grained) * @inode: inode. * * Return the current time truncated to the time granularity supported by * the fs, as suitable for a ctime/mtime change. If the ctime is flagged * as having been QUERIED, get a fine-grained timestamp, but don't update * the floor. * * For a multigrain inode, this is effectively an estimate of the timestamp * that a file would receive. An actual update must go through * inode_set_ctime_current(). */ struct timespec64 current_time(struct inode *inode) { struct timespec64 now; u32 cns; ktime_get_coarse_real_ts64_mg(&now); if (!is_mgtime(inode)) goto out; /* If nothing has queried it, then coarse time is fine */ cns = smp_load_acquire(&inode->i_ctime_nsec); if (cns & I_CTIME_QUERIED) { /* * If there is no apparent change, then get a fine-grained * timestamp. */ if (now.tv_nsec == (cns & ~I_CTIME_QUERIED)) ktime_get_real_ts64(&now); } out: return timestamp_truncate(now, inode); } EXPORT_SYMBOL(current_time); static int inode_needs_update_time(struct inode *inode) { struct timespec64 now, ts; int sync_it = 0; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; now = current_time(inode); ts = inode_get_mtime(inode); if (!timespec64_equal(&ts, &now)) sync_it |= S_MTIME; ts = inode_get_ctime(inode); if (!timespec64_equal(&ts, &now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) sync_it |= S_VERSION; return sync_it; } static int __file_update_time(struct file *file, int sync_mode) { int ret = 0; struct inode *inode = file_inode(file); /* try to update time settings */ if (!mnt_get_write_access_file(file)) { ret = inode_update_time(inode, sync_mode); mnt_put_write_access_file(file); } return ret; } /** * file_update_time - update mtime and ctime time * @file: file accessed * * Update the mtime and ctime members of an inode and mark the inode for * writeback. Note that this function is meant exclusively for usage in * the file write path of filesystems, and filesystems may choose to * explicitly ignore updates via this function with the _NOCMTIME inode * flag, e.g. for network filesystem where these imestamps are handled * by the server. This can return an error for file systems who need to * allocate space in order to update an inode. * * Return: 0 on success, negative errno on failure. */ int file_update_time(struct file *file) { int ret; struct inode *inode = file_inode(file); ret = inode_needs_update_time(inode); if (ret <= 0) return ret; return __file_update_time(file, ret); } EXPORT_SYMBOL(file_update_time); /** * file_modified_flags - handle mandated vfs changes when modifying a file * @file: file that was modified * @flags: kiocb flags * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * If IOCB_NOWAIT is set, special file privileges will not be removed and * time settings will not be updated. It will return -EAGAIN. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ static int file_modified_flags(struct file *file, int flags) { int ret; struct inode *inode = file_inode(file); /* * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. */ ret = file_remove_privs_flags(file, flags); if (ret) return ret; if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; ret = inode_needs_update_time(inode); if (ret <= 0) return ret; if (flags & IOCB_NOWAIT) return -EAGAIN; return __file_update_time(file, ret); } /** * file_modified - handle mandated vfs changes when modifying a file * @file: file that was modified * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ int file_modified(struct file *file) { return file_modified_flags(file, 0); } EXPORT_SYMBOL(file_modified); /** * kiocb_modified - handle mandated vfs changes when modifying a file * @iocb: iocb that was modified * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ int kiocb_modified(struct kiocb *iocb) { return file_modified_flags(iocb->ki_filp, iocb->ki_flags); } EXPORT_SYMBOL_GPL(kiocb_modified); int inode_needs_sync(struct inode *inode) { if (IS_SYNC(inode)) return 1; if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) return 1; return 0; } EXPORT_SYMBOL(inode_needs_sync); /* * If we try to find an inode in the inode hash while it is being * deleted, we have to wait until the filesystem completes its * deletion before reporting that it isn't found. This function waits * until the deletion _might_ have completed. Callers are responsible * to recheck inode state. * * It doesn't matter if I_NEW is not set initially, a call to * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. */ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; /* * Handle racing against evict(), see that routine for more details. */ if (unlikely(inode_unhashed(inode))) { WARN_ON(is_inode_hash_locked); spin_unlock(&inode->i_lock); return; } wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); rcu_read_unlock(); if (is_inode_hash_locked) spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq_head, &wqe.wq_entry); if (is_inode_hash_locked) spin_lock(&inode_hash_lock); rcu_read_lock(); } static __initdata unsigned long ihash_entries; static int __init set_ihash_entries(char *str) { if (!str) return 0; ihash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("ihash_entries=", set_ihash_entries); /* * Initialize the waitqueues and inode hash table. */ void __init inode_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ if (hashdist) return; inode_hashtable = alloc_large_system_hash("Inode-cache", sizeof(struct hlist_head), ihash_entries, 14, HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); } void __init inode_init(void) { /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ if (!hashdist) return; inode_hashtable = alloc_large_system_hash("Inode-cache", sizeof(struct hlist_head), ihash_entries, 14, HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_mode = mode; if (S_ISCHR(mode)) { inode->i_fop = &def_chr_fops; inode->i_rdev = rdev; } else if (S_ISBLK(mode)) { if (IS_ENABLED(CONFIG_BLOCK)) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; } else if (S_ISFIFO(mode)) inode->i_fop = &pipefifo_fops; else if (S_ISSOCK(mode)) ; /* leave it no_open_fops */ else printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" " inode %s:%lu\n", mode, inode->i_sb->s_id, inode->i_ino); } EXPORT_SYMBOL(init_special_inode); /** * inode_init_owner - Init uid,gid,mode for new inode according to posix standards * @idmap: idmap of the mount the inode was created from * @inode: New inode * @dir: Directory inode * @mode: mode of the new inode * * If the inode has been created through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions * and initializing i_uid and i_gid. On non-idmapped mounts or if permission * checking is to be performed on the raw inode simply pass @nop_mnt_idmap. */ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode) { inode_fsuid_set(inode, idmap); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; /* Directories are special, and always inherit S_ISGID */ if (S_ISDIR(mode)) mode |= S_ISGID; } else inode_fsgid_set(inode, idmap); inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); /** * inode_owner_or_capable - check current task permissions to inode * @idmap: idmap of the mount the inode was found from * @inode: inode being checked * * Return true if current either has CAP_FOWNER in a namespace with the * inode owner uid mapped, or owns the file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode) { vfsuid_t vfsuid; struct user_namespace *ns; vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return true; ns = current_user_ns(); if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } EXPORT_SYMBOL(inode_owner_or_capable); /* * Direct i/o helper functions */ bool inode_dio_finished(const struct inode *inode) { return atomic_read(&inode->i_dio_count) == 0; } EXPORT_SYMBOL(inode_dio_finished); /** * inode_dio_wait - wait for outstanding DIO requests to finish * @inode: inode to wait for * * Waits for all pending direct I/O requests to finish so that we can * proceed with a truncate or equivalent operation. * * Must be called under a lock that serializes taking new references * to i_dio_count, usually by inode->i_mutex. */ void inode_dio_wait(struct inode *inode) { wait_var_event(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait); void inode_dio_wait_interruptible(struct inode *inode) { wait_var_event_interruptible(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait_interruptible); /* * inode_set_flags - atomically set some inode flags * * Note: the caller should be holding i_mutex, or else be sure that * they have exclusive access to the inode structure (i.e., while the * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify * i_flags actually followed this rule, is that there is at least one * code path which doesn't today so we use cmpxchg() out of an abundance * of caution. * * In the long run, i_mutex is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure * it is so documented in include/linux/fs.h and that all code follows * the locking convention!! */ void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask) { WARN_ON_ONCE(flags & ~mask); set_mask_bits(&inode->i_flags, mask, flags); } EXPORT_SYMBOL(inode_set_flags); void inode_nohighmem(struct inode *inode) { mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } EXPORT_SYMBOL(inode_nohighmem); struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) { trace_inode_set_ctime_to_ts(inode, &ts); set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec); inode->i_ctime_sec = ts.tv_sec; inode->i_ctime_nsec = ts.tv_nsec; return ts; } EXPORT_SYMBOL(inode_set_ctime_to_ts); /** * timestamp_truncate - Truncate timespec to a granularity * @t: Timespec * @inode: inode being updated * * Truncate a timespec to the granularity supported by the fs * containing the inode. Always rounds down. gran must * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). */ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned int gran = sb->s_time_gran; t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max); if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min)) t.tv_nsec = 0; /* Avoid division in the common cases 1 ns and 1 s. */ if (gran == 1) ; /* nothing */ else if (gran == NSEC_PER_SEC) t.tv_nsec = 0; else if (gran > 1 && gran < NSEC_PER_SEC) t.tv_nsec -= t.tv_nsec % gran; else WARN(1, "invalid file time granularity: %u", gran); return t; } EXPORT_SYMBOL(timestamp_truncate); /** * inode_set_ctime_current - set the ctime to current_time * @inode: inode * * Set the inode's ctime to the current value for the inode. Returns the * current value that was assigned. If this is not a multigrain inode, then we * set it to the later of the coarse time and floor value. * * If it is multigrain, then we first see if the coarse-grained timestamp is * distinct from what is already there. If so, then use that. Otherwise, get a * fine-grained timestamp. * * After that, try to swap the new value into i_ctime_nsec. Accept the * resulting ctime, regardless of the outcome of the swap. If it has * already been replaced, then that timestamp is later than the earlier * unacceptable one, and is thus acceptable. */ struct timespec64 inode_set_ctime_current(struct inode *inode) { struct timespec64 now; u32 cns, cur; ktime_get_coarse_real_ts64_mg(&now); now = timestamp_truncate(now, inode); /* Just return that if this is not a multigrain fs */ if (!is_mgtime(inode)) { inode_set_ctime_to_ts(inode, now); goto out; } /* * A fine-grained time is only needed if someone has queried * for timestamps, and the current coarse grained time isn't * later than what's already there. */ cns = smp_load_acquire(&inode->i_ctime_nsec); if (cns & I_CTIME_QUERIED) { struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec, .tv_nsec = cns & ~I_CTIME_QUERIED }; if (timespec64_compare(&now, &ctime) <= 0) { ktime_get_real_ts64_mg(&now); now = timestamp_truncate(now, inode); mgtime_counter_inc(mg_fine_stamps); } } mgtime_counter_inc(mg_ctime_updates); /* No need to cmpxchg if it's exactly the same */ if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) { trace_ctime_xchg_skip(inode, &now); goto out; } cur = cns; retry: /* Try to swap the nsec value into place. */ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) { /* If swap occurred, then we're (mostly) done */ inode->i_ctime_sec = now.tv_sec; trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur); mgtime_counter_inc(mg_ctime_swaps); } else { /* * Was the change due to someone marking the old ctime QUERIED? * If so then retry the swap. This can only happen once since * the only way to clear I_CTIME_QUERIED is to stamp the inode * with a new ctime. */ if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) { cns = cur; goto retry; } /* Otherwise, keep the existing ctime */ now.tv_sec = inode->i_ctime_sec; now.tv_nsec = cur & ~I_CTIME_QUERIED; } out: return now; } EXPORT_SYMBOL(inode_set_ctime_current); /** * inode_set_ctime_deleg - try to update the ctime on a delegated inode * @inode: inode to update * @update: timespec64 to set the ctime * * Attempt to atomically update the ctime on behalf of a delegation holder. * * The nfs server can call back the holder of a delegation to get updated * inode attributes, including the mtime. When updating the mtime, update * the ctime to a value at least equal to that. * * This can race with concurrent updates to the inode, in which * case the update is skipped. * * Note that this works even when multigrain timestamps are not enabled, * so it is used in either case. */ struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update) { struct timespec64 now, cur_ts; u32 cur, old; /* pairs with try_cmpxchg below */ cur = smp_load_acquire(&inode->i_ctime_nsec); cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; cur_ts.tv_sec = inode->i_ctime_sec; /* If the update is older than the existing value, skip it. */ if (timespec64_compare(&update, &cur_ts) <= 0) return cur_ts; ktime_get_coarse_real_ts64_mg(&now); /* Clamp the update to "now" if it's in the future */ if (timespec64_compare(&update, &now) > 0) update = now; update = timestamp_truncate(update, inode); /* No need to update if the values are already the same */ if (timespec64_equal(&update, &cur_ts)) return cur_ts; /* * Try to swap the nsec value into place. If it fails, that means * it raced with an update due to a write or similar activity. That * stamp takes precedence, so just skip the update. */ retry: old = cur; if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) { inode->i_ctime_sec = update.tv_sec; mgtime_counter_inc(mg_ctime_swaps); return update; } /* * Was the change due to another task marking the old ctime QUERIED? * * If so, then retry the swap. This can only happen once since * the only way to clear I_CTIME_QUERIED is to stamp the inode * with a new ctime. */ if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED))) goto retry; /* Otherwise, it was a new timestamp. */ cur_ts.tv_sec = inode->i_ctime_sec; cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; return cur_ts; } EXPORT_SYMBOL(inode_set_ctime_deleg); /** * in_group_or_capable - check whether caller is CAP_FSETID privileged * @idmap: idmap of the mount @inode was found from * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * * Check whether @vfsgid is in the caller's group list or if the caller is * privileged with CAP_FSETID over @inode. This can be used to determine * whether the setgid bit can be kept or must be dropped. * * Return: true if the caller is sufficiently privileged, false if not. */ bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid) { if (vfsgid_in_group_p(vfsgid)) return true; if (capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) return true; return false; } EXPORT_SYMBOL(in_group_or_capable); /** * mode_strip_sgid - handle the sgid bit for non-directories * @idmap: idmap of the mount the inode was created from * @dir: parent directory inode * @mode: mode of the file to be created in @dir * * If the @mode of the new file has both the S_ISGID and S_IXGRP bit * raised and @dir has the S_ISGID bit raised ensure that the caller is * either in the group of the parent directory or they have CAP_FSETID * in their user namespace and are privileged over the parent directory. * In all other cases, strip the S_ISGID bit from @mode. * * Return: the new mode to use for the file */ umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode) { if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) return mode; if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) return mode; if (in_group_or_capable(idmap, dir, i_gid_into_vfsgid(idmap, dir))) return mode; return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid); |
| 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_POLL_H #define _LINUX_POLL_H #include <linux/compiler.h> #include <linux/ktime.h> #include <linux/wait.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <uapi/linux/poll.h> #include <uapi/linux/eventpoll.h> /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ #define MAX_STACK_ALLOC 832 #define FRONTEND_STACK_ALLOC 256 #define SELECT_STACK_ALLOC FRONTEND_STACK_ALLOC #define POLL_STACK_ALLOC FRONTEND_STACK_ALLOC #define WQUEUES_STACK_ALLOC (MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC) #define N_INLINE_POLL_ENTRIES (WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry)) #define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM) struct poll_table_struct; /* * structures and helpers for f_op->poll implementations */ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *); /* * Do not touch the structure directly, use the access function * poll_requested_events() instead. */ typedef struct poll_table_struct { poll_queue_proc _qproc; __poll_t _key; } poll_table; static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { if (p && p->_qproc) { p->_qproc(filp, wait_address, p); /* * This memory barrier is paired in the wq_has_sleeper(). * See the comment above prepare_to_wait(), we need to * ensure that subsequent tests in this thread can't be * reordered with __add_wait_queue() in _qproc() paths. */ smp_mb(); } } /* * Return the set of events that the application wants to poll for. * This is useful for drivers that need to know whether a DMA transfer has * to be started implicitly on poll(). You typically only want to do that * if the application is actually polling for POLLIN and/or POLLOUT. */ static inline __poll_t poll_requested_events(const poll_table *p) { return p ? p->_key : ~(__poll_t)0; } static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) { pt->_qproc = qproc; pt->_key = ~(__poll_t)0; /* all events enabled */ } static inline bool file_can_poll(struct file *file) { return file->f_op->poll; } static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) { if (unlikely(!file->f_op->poll)) return DEFAULT_POLLMASK; return file->f_op->poll(file, pt); } struct poll_table_entry { struct file *filp; __poll_t key; wait_queue_entry_t wait; wait_queue_head_t *wait_address; }; /* * Structures and helpers for select/poll syscall */ struct poll_wqueues { poll_table pt; struct poll_table_page *table; struct task_struct *polling_task; int triggered; int error; int inline_index; struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES]; }; extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern u64 select_estimate_accuracy(struct timespec64 *tv); #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1) extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time); extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec); #define __MAP(v, from, to) \ (from < to ? (v & from) * (to/from) : (v & from) / (from/to)) static inline __u16 mangle_poll(__poll_t val) { __u16 v = (__force __u16)val; #define M(X) __MAP(v, (__force __u16)EPOLL##X, POLL##X) return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) | M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) | M(HUP) | M(RDHUP) | M(MSG); #undef M } static inline __poll_t demangle_poll(u16 val) { #define M(X) (__force __poll_t)__MAP(val, POLL##X, (__force __u16)EPOLL##X) return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) | M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) | M(HUP) | M(RDHUP) | M(MSG); #undef M } #undef __MAP #endif /* _LINUX_POLL_H */ |
| 242 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2015, 2016 ARM Ltd. */ #ifndef __KVM_ARM_VGIC_H #define __KVM_ARM_VGIC_H #include <linux/bits.h> #include <linux/kvm.h> #include <linux/irqreturn.h> #include <linux/kref.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/static_key.h> #include <linux/types.h> #include <linux/xarray.h> #include <kvm/iodev.h> #include <linux/list.h> #include <linux/jump_label.h> #include <linux/irqchip/arm-gic-v4.h> #define VGIC_V3_MAX_CPUS 512 #define VGIC_V2_MAX_CPUS 8 #define VGIC_NR_IRQS_LEGACY 256 #define VGIC_NR_SGIS 16 #define VGIC_NR_PPIS 16 #define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS) #define VGIC_MAX_SPI 1019 #define VGIC_MAX_RESERVED 1023 #define VGIC_MIN_LPI 8192 #define KVM_IRQCHIP_NUM_PINS (1020 - 32) #define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS) #define irq_is_spi(irq) ((irq) >= VGIC_NR_PRIVATE_IRQS && \ (irq) <= VGIC_MAX_SPI) enum vgic_type { VGIC_V2, /* Good ol' GICv2 */ VGIC_V3, /* New fancy GICv3 */ }; /* same for all guests, as depending only on the _host's_ GIC model */ struct vgic_global { /* type of the host GIC */ enum vgic_type type; /* Physical address of vgic virtual cpu interface */ phys_addr_t vcpu_base; /* GICV mapping, kernel VA */ void __iomem *vcpu_base_va; /* GICV mapping, HYP VA */ void __iomem *vcpu_hyp_va; /* virtual control interface mapping, kernel VA */ void __iomem *vctrl_base; /* virtual control interface mapping, HYP VA */ void __iomem *vctrl_hyp; /* Number of implemented list registers */ int nr_lr; /* Maintenance IRQ number */ unsigned int maint_irq; /* maximum number of VCPUs allowed (GICv2 limits us to 8) */ int max_gic_vcpus; /* Only needed for the legacy KVM_CREATE_IRQCHIP */ bool can_emulate_gicv2; /* Hardware has GICv4? */ bool has_gicv4; bool has_gicv4_1; /* Pseudo GICv3 from outer space */ bool no_hw_deactivation; /* GIC system register CPU interface */ struct static_key_false gicv3_cpuif; u32 ich_vtr_el2; }; extern struct vgic_global kvm_vgic_global_state; #define VGIC_V2_MAX_LRS (1 << 6) #define VGIC_V3_MAX_LRS 16 #define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr) enum vgic_irq_config { VGIC_CONFIG_EDGE = 0, VGIC_CONFIG_LEVEL }; /* * Per-irq ops overriding some common behavious. * * Always called in non-preemptible section and the functions can use * kvm_arm_get_running_vcpu() to get the vcpu pointer for private IRQs. */ struct irq_ops { /* Per interrupt flags for special-cased interrupts */ unsigned long flags; #define VGIC_IRQ_SW_RESAMPLE BIT(0) /* Clear the active state for resampling */ /* * Callback function pointer to in-kernel devices that can tell us the * state of the input level of mapped level-triggered IRQ faster than * peaking into the physical GIC. */ bool (*get_input_level)(int vintid); }; struct vgic_irq { raw_spinlock_t irq_lock; /* Protects the content of the struct */ struct rcu_head rcu; struct list_head ap_list; struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU * SPIs and LPIs: The VCPU whose ap_list * this is queued on. */ struct kvm_vcpu *target_vcpu; /* The VCPU that this interrupt should * be sent to, as a result of the * targets reg (v2) or the * affinity reg (v3). */ u32 intid; /* Guest visible INTID */ bool line_level; /* Level only */ bool pending_latch; /* The pending latch state used to calculate * the pending state for both level * and edge triggered IRQs. */ bool active; /* not used for LPIs */ bool enabled; bool hw; /* Tied to HW IRQ */ struct kref refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ unsigned int host_irq; /* linux irq corresponding to hwintid */ union { u8 targets; /* GICv2 target VCPUs mask */ u32 mpidr; /* GICv3 target VCPU */ }; u8 source; /* GICv2 SGIs only */ u8 active_source; /* GICv2 SGIs only */ u8 priority; u8 group; /* 0 == group 0, 1 == group 1 */ enum vgic_irq_config config; /* Level or edge */ struct irq_ops *ops; void *owner; /* Opaque pointer to reserve an interrupt for in-kernel devices. */ }; static inline bool vgic_irq_needs_resampling(struct vgic_irq *irq) { return irq->ops && (irq->ops->flags & VGIC_IRQ_SW_RESAMPLE); } struct vgic_register_region; struct vgic_its; enum iodev_type { IODEV_CPUIF, IODEV_DIST, IODEV_REDIST, IODEV_ITS }; struct vgic_io_device { gpa_t base_addr; union { struct kvm_vcpu *redist_vcpu; struct vgic_its *its; }; const struct vgic_register_region *regions; enum iodev_type iodev_type; int nr_regions; struct kvm_io_device dev; }; struct vgic_its { /* The base address of the ITS control register frame */ gpa_t vgic_its_base; bool enabled; struct vgic_io_device iodev; struct kvm_device *dev; /* These registers correspond to GITS_BASER{0,1} */ u64 baser_device_table; u64 baser_coll_table; /* Protects the command queue */ struct mutex cmd_lock; u64 cbaser; u32 creadr; u32 cwriter; /* migration ABI revision in use */ u32 abi_rev; /* Protects the device and collection lists */ struct mutex its_lock; struct list_head device_list; struct list_head collection_list; /* * Caches the (device_id, event_id) -> vgic_irq translation for * LPIs that are mapped and enabled. */ struct xarray translation_cache; }; struct vgic_state_iter; struct vgic_redist_region { u32 index; gpa_t base; u32 count; /* number of redistributors or 0 if single region */ u32 free_index; /* index of the next free redistributor */ struct list_head list; }; struct vgic_dist { bool in_kernel; bool ready; bool initialized; /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */ u32 vgic_model; /* Implementation revision as reported in the GICD_IIDR */ u32 implementation_rev; #define KVM_VGIC_IMP_REV_2 2 /* GICv2 restorable groups */ #define KVM_VGIC_IMP_REV_3 3 /* GICv3 GICR_CTLR.{IW,CES,RWP} */ #define KVM_VGIC_IMP_REV_LATEST KVM_VGIC_IMP_REV_3 /* Userspace can write to GICv2 IGROUPR */ bool v2_groups_user_writable; /* Do injected MSIs require an additional device ID? */ bool msis_require_devid; int nr_spis; /* base addresses in guest physical address space: */ gpa_t vgic_dist_base; /* distributor */ union { /* either a GICv2 CPU interface */ gpa_t vgic_cpu_base; /* or a number of GICv3 redistributor regions */ struct list_head rd_regions; }; /* distributor enabled */ bool enabled; /* Wants SGIs without active state */ bool nassgireq; struct vgic_irq *spis; struct vgic_io_device dist_iodev; bool has_its; bool table_write_in_progress; /* * Contains the attributes and gpa of the LPI configuration table. * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share * one address across all redistributors. * GICv3 spec: IHI 0069E 6.1.1 "LPI Configuration tables" */ u64 propbaser; #define LPI_XA_MARK_DEBUG_ITER XA_MARK_0 struct xarray lpi_xa; /* used by vgic-debug */ struct vgic_state_iter *iter; /* * GICv4 ITS per-VM data, containing the IRQ domain, the VPE * array, the property table pointer as well as allocation * data. This essentially ties the Linux IRQ core and ITS * together, and avoids leaking KVM's data structures anywhere * else. */ struct its_vm its_vm; }; struct vgic_v2_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; u32 vgic_apr; u32 vgic_lr[VGIC_V2_MAX_LRS]; unsigned int used_lrs; }; struct vgic_v3_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; u32 vgic_sre; /* Restored only, change ignored */ u32 vgic_ap0r[4]; u32 vgic_ap1r[4]; u64 vgic_lr[VGIC_V3_MAX_LRS]; /* * GICv4 ITS per-VPE data, containing the doorbell IRQ, the * pending table pointer, the its_vm pointer and a few other * HW specific things. As for the its_vm structure, this is * linking the Linux IRQ subsystem and the ITS together. */ struct its_vpe its_vpe; unsigned int used_lrs; }; struct vgic_cpu { /* CPU vif control registers for world switch */ union { struct vgic_v2_cpu_if vgic_v2; struct vgic_v3_cpu_if vgic_v3; }; struct vgic_irq *private_irqs; raw_spinlock_t ap_list_lock; /* Protects the ap_list */ /* * List of IRQs that this VCPU should consider because they are either * Active or Pending (hence the name; AP list), or because they recently * were one of the two and need to be migrated off this list to another * VCPU. */ struct list_head ap_list_head; /* * Members below are used with GICv3 emulation only and represent * parts of the redistributor. */ struct vgic_io_device rd_iodev; struct vgic_redist_region *rdreg; u32 rdreg_index; atomic_t syncr_busy; /* Contains the attributes and gpa of the LPI pending tables. */ u64 pendbaser; /* GICR_CTLR.{ENABLE_LPIS,RWP} */ atomic_t ctlr; /* Cache guest priority bits */ u32 num_pri_bits; /* Cache guest interrupt ID bits */ u32 num_id_bits; }; extern struct static_key_false vgic_v2_cpuif_trap; extern struct static_key_false vgic_v3_cpuif_trap; int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr); void kvm_vgic_early_init(struct kvm *kvm); int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu); int kvm_vgic_create(struct kvm *kvm, u32 type); void kvm_vgic_destroy(struct kvm *kvm); void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); int kvm_vgic_map_resources(struct kvm *kvm); int kvm_vgic_hyp_init(void); void kvm_vgic_init_cpu_hardware(void); int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, unsigned int intid, bool level, void *owner); int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, u32 vintid, struct irq_ops *ops); int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid); int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid); bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid); int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); void kvm_vgic_load(struct kvm_vcpu *vcpu); void kvm_vgic_put(struct kvm_vcpu *vcpu); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) #define vgic_ready(k) ((k)->arch.vgic.ready) #define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \ ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid); void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1); /** * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW * * The host's GIC naturally limits the maximum amount of VCPUs a guest * can use. */ static inline int kvm_vgic_get_max_vcpus(void) { return kvm_vgic_global_state.max_gic_vcpus; } /** * kvm_vgic_setup_default_irq_routing: * Setup a default flat gsi routing table mapping all SPIs */ int kvm_vgic_setup_default_irq_routing(struct kvm *kvm); int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner); struct kvm_kernel_irq_routing_entry; int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq, struct kvm_kernel_irq_routing_entry *irq_entry); int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int irq, struct kvm_kernel_irq_routing_entry *irq_entry); int vgic_v4_load(struct kvm_vcpu *vcpu); void vgic_v4_commit(struct kvm_vcpu *vcpu); int vgic_v4_put(struct kvm_vcpu *vcpu); /* CPU HP callbacks */ void kvm_vgic_cpu_up(void); void kvm_vgic_cpu_down(void); #endif /* __KVM_ARM_VGIC_H */ |
| 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 171 171 170 171 171 171 170 171 170 170 170 171 170 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 | // SPDX-License-Identifier: GPL-2.0-only /* * Generic helpers for smp ipi calls * * (C) Jens Axboe <jens.axboe@oracle.com> 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/irq_work.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/sched.h> #include <linux/sched/idle.h> #include <linux/hypervisor.h> #include <linux/sched/clock.h> #include <linux/nmi.h> #include <linux/sched/debug.h> #include <linux/jump_label.h> #include <linux/string_choices.h> #include <trace/events/ipi.h> #define CREATE_TRACE_POINTS #include <trace/events/csd.h> #undef CREATE_TRACE_POINTS #include "smpboot.h" #include "sched/smp.h" #define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1); static void __flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return -ENOMEM; if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, cpu_to_node(cpu))) { free_cpumask_var(cfd->cpumask); return -ENOMEM; } cfd->csd = alloc_percpu(call_single_data_t); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; } return 0; } int smpcfd_dead_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); return 0; } int smpcfd_dying_cpu(unsigned int cpu) { /* * The IPIs for the smp-call-function callbacks queued by other * CPUs might arrive late, either due to hardware latencies or * because this CPU disabled interrupts (inside stop-machine) * before the IPIs were sent. So flush out any pending callbacks * explicitly (without waiting for the IPIs to arrive), to * ensure that the outgoing CPU doesn't go offline with work * still pending. */ __flush_smp_call_function_queue(false); irq_work_run(); return 0; } void __init call_function_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(call_single_queue, i)); smpcfd_prepare_cpu(smp_processor_id()); } static __always_inline void send_call_function_single_ipi(int cpu) { if (call_function_single_prep_ipi(cpu)) { trace_ipi_send_cpu(cpu, _RET_IP_, generic_smp_call_function_single_interrupt); arch_send_call_function_single_ipi(cpu); } } static __always_inline void send_call_function_ipi_mask(struct cpumask *mask) { trace_ipi_send_cpumask(mask, _RET_IP_, generic_smp_call_function_single_interrupt); arch_send_call_function_ipi_mask(mask); } static __always_inline void csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd) { trace_csd_function_entry(func, csd); func(info); trace_csd_function_exit(func, csd); } #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled); /* * Parse the csdlock_debug= kernel boot parameter. * * If you need to restore the old "ext" value that once provided * additional debugging information, reapply the following commits: * * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging") * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging") */ static int __init csdlock_debug(char *str) { int ret; unsigned int val = 0; ret = get_option(&str, &val); if (ret) { if (val) static_branch_enable(&csdlock_debug_enabled); else static_branch_disable(&csdlock_debug_enabled); } return 1; } __setup("csdlock_debug=", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ module_param(csd_lock_timeout, ulong, 0644); static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */ module_param(panic_on_ipistall, int, 0644); static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ static void __csd_lock_record(call_single_data_t *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ __this_cpu_write(cur_csd, NULL); return; } __this_cpu_write(cur_csd_func, csd->func); __this_cpu_write(cur_csd_info, csd->info); smp_wmb(); /* func and info before csd. */ __this_cpu_write(cur_csd, csd); smp_mb(); /* Update cur_csd before function call. */ /* Or before unlock, as the case may be. */ } static __always_inline void csd_lock_record(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) __csd_lock_record(csd); } static int csd_lock_wait_getcpu(call_single_data_t *csd) { unsigned int csd_type; csd_type = CSD_TYPE(csd); if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */ return -1; } static atomic_t n_csd_lock_stuck; /** * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long? * * Returns @true if a CSD-lock acquisition is stuck and has been stuck * long enough for a "non-responsive CSD lock" message to be printed. */ bool csd_lock_is_stuck(void) { return !!atomic_read(&n_csd_lock_stuck); } /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages) { int cpu = -1; int cpux; bool firsttime; u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC; if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) return true; cpu = csd_lock_wait_getcpu(csd); pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", *bug_id, raw_smp_processor_id(), cpu); atomic_dec(&n_csd_lock_stuck); return true; } ts2 = ktime_get_mono_fast_ns(); /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) * (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) || csd_lock_timeout_ns == 0)) return false; if (ts0 > ts2) { /* Our own sched_clock went backward; don't blame another CPU. */ ts_delta = ts0 - ts2; pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta); *ts1 = ts2; return false; } firsttime = !*bug_id; if (firsttime) *bug_id = atomic_inc_return(&csd_bug_count); cpu = csd_lock_wait_getcpu(csd); if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu)) cpux = 0; else cpux = cpu; cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ /* How long since this CSD lock was stuck. */ ts_delta = ts2 - ts0; pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); (*nmessages)++; if (firsttime) atomic_inc(&n_csd_lock_stuck); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely * to become unstuck. Use a signed comparison to avoid triggering * on underflows when the TSC is out of sync between sockets. */ BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC)); if (cpu_cur_csd && csd != cpu_cur_csd) { pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), READ_ONCE(per_cpu(cur_csd_info, cpux))); } else { pr_alert("\tcsd: CSD lock (#%d) %s.\n", *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); } if (cpu >= 0) { if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0)) dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); arch_send_call_function_single_ipi(cpu); } } if (firsttime) dump_stack(); *ts1 = ts2; return false; } /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * * For non-synchronous ipi calls the csd can still be in use by the * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ static void __csd_lock_wait(call_single_data_t *csd) { unsigned long nmessages = 0; int bug_id = 0; u64 ts0, ts1; ts1 = ts0 = ktime_get_mono_fast_ns(); for (;;) { if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages)) break; cpu_relax(); } smp_acquire__after_ctrl_dep(); } static __always_inline void csd_lock_wait(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) { __csd_lock_wait(csd); return; } smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #else static void csd_lock_record(call_single_data_t *csd) { } static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); csd->node.u_flags |= CSD_FLAG_LOCK; /* * prevent CPU from reordering the above assignment * to ->flags with any subsequent assignments to other * fields of the specified call_single_data_t structure: */ smp_wmb(); } static __always_inline void csd_unlock(call_single_data_t *csd) { WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ smp_store_release(&csd->node.u_flags, 0); } static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); void __smp_call_single_queue(int cpu, struct llist_node *node) { /* * We have to check the type of the CSD before queueing it, because * once queued it can have its flags cleared by * flush_smp_call_function_queue() * even if we haven't sent the smp_call IPI yet (e.g. the stopper * executes migration_cpu_stop() on the remote CPU). */ if (trace_csd_queue_cpu_enabled()) { call_single_data_t *csd; smp_call_func_t func; csd = container_of(node, call_single_data_t, node.llist); func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? sched_ttwu_pending : csd->func; trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); } /* * The list addition should be visible to the target CPU when it pops * the head of the list to pull the entry off it in the IPI handler * because of normal cache coherency rules implied by the underlying * llist ops. * * If IPIs can go out of order to the cache coherency protocol * in an architecture, sufficient synchronisation should be added * to arch code to make it appear to obey cache coherency WRT * locking and barrier primitives. Generic code isn't really * equipped to do the right thing... */ if (llist_add(node, &per_cpu(call_single_queue, cpu))) send_call_function_single_ipi(cpu); } /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ static int generic_exec_single(int cpu, call_single_data_t *csd) { if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; void *info = csd->info; unsigned long flags; /* * We can unlock early even for the synchronous on-stack case, * since we're doing this from the same CPU.. */ csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); csd_do_func(func, info, NULL); csd_lock_record(NULL); local_irq_restore(flags); return 0; } if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { csd_unlock(csd); return -ENXIO; } __smp_call_single_queue(cpu, &csd->node.llist); return 0; } /** * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks * * Invoked by arch to handle an IPI for call function single. * Must be called with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { __flush_smp_call_function_queue(true); } /** * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks * * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an * offline CPU. Skip this check if set to 'false'. * * Flush any pending smp-call-function callbacks queued on this CPU. This is * invoked by the generic IPI handler, as well as by a CPU about to go offline, * to ensure that all pending IPI callbacks are run before it goes completely * offline. * * Loop through the call_single_queue and run all the queued callbacks. * Must be called with interrupts disabled. */ static void __flush_smp_call_function_queue(bool warn_cpu_offline) { call_single_data_t *csd, *csd_next; struct llist_node *entry, *prev; struct llist_head *head; static bool warned; atomic_t *tbt; lockdep_assert_irqs_disabled(); /* Allow waiters to send backtrace NMI from here onwards */ tbt = this_cpu_ptr(&trigger_backtrace); atomic_set_release(tbt, 1); head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); entry = llist_reverse_order(entry); /* There shouldn't be any pending callbacks on an offline CPU. */ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && !warned && entry != NULL)) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); /* * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ llist_for_each_entry(csd, entry, node.llist) { switch (CSD_TYPE(csd)) { case CSD_TYPE_ASYNC: case CSD_TYPE_SYNC: case CSD_TYPE_IRQ_WORK: pr_warn("IPI callback %pS sent to offline CPU\n", csd->func); break; case CSD_TYPE_TTWU: pr_warn("IPI task-wakeup sent to offline CPU\n"); break; default: pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", CSD_TYPE(csd)); break; } } } /* * First; run all SYNC callbacks, people are waiting for us. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { /* Do we wait until *after* callback? */ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { smp_call_func_t func = csd->func; void *info = csd->info; if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } csd_lock_record(csd); csd_do_func(func, info, csd); csd_unlock(csd); csd_lock_record(NULL); } else { prev = &csd->node.llist; } } if (!entry) return; /* * Second; run all !SYNC callbacks. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { int type = CSD_TYPE(csd); if (type != CSD_TYPE_TTWU) { if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } if (type == CSD_TYPE_ASYNC) { smp_call_func_t func = csd->func; void *info = csd->info; csd_lock_record(csd); csd_unlock(csd); csd_do_func(func, info, csd); csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); } } else { prev = &csd->node.llist; } } /* * Third; only CSD_TYPE_TTWU is left, issue those. */ if (entry) { csd = llist_entry(entry, typeof(*csd), node.llist); csd_do_func(sched_ttwu_pending, entry, csd); } } /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * from task context (idle, migration thread) * * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to * handle queued SMP function calls before scheduling. * * The migration thread has to ensure that an eventually pending wakeup has * been handled before it migrates a task. */ void flush_smp_call_function_queue(void) { unsigned int was_pending; unsigned long flags; if (llist_empty(this_cpu_ptr(&call_single_queue))) return; local_irq_save(flags); /* Get the already pending soft interrupts for RT enabled kernels */ was_pending = local_softirq_pending(); __flush_smp_call_function_queue(true); if (local_softirq_pending()) do_softirq_post_smp_call_flush(was_pending); local_irq_restore(flags); } /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. */ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { call_single_data_t *csd; call_single_data_t csd_stack = { .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }, }; int this_cpu; int err; /* * prevent preemption and reschedule on another processor, * as well as CPU removal */ this_cpu = get_cpu(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); csd = &csd_stack; if (!wait) { csd = this_cpu_ptr(&csd_data); csd_lock(csd); } csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif err = generic_exec_single(cpu, csd); if (wait) csd_lock_wait(csd); put_cpu(); return err; } EXPORT_SYMBOL(smp_call_function_single); /** * smp_call_function_single_async() - Run an asynchronous function on a * specific CPU. * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure * * Like smp_call_function_single(), but the call is asynchonous and * can thus be done from contexts with disabled interrupts. * * The caller passes his own pre-allocated data structure * (ie: embedded in an object) and is responsible for synchronizing it * such that the IPIs performed on the @csd are strictly serialized. * * If the function is called with one csd which has not yet been * processed by previous call to smp_call_function_single_async(), the * function will return immediately with -EBUSY showing that the csd * object is still in progress. * * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. * * Return: %0 on success or negative errno value on error */ int smp_call_function_single_async(int cpu, call_single_data_t *csd) { int err = 0; preempt_disable(); if (csd->node.u_flags & CSD_FLAG_LOCK) { err = -EBUSY; goto out; } csd->node.u_flags = CSD_FLAG_LOCK; smp_wmb(); err = generic_exec_single(cpu, csd); out: preempt_enable(); return err; } EXPORT_SYMBOL_GPL(smp_call_function_single_async); /* * smp_call_function_any - Run a function on any of the given cpus * @mask: The mask of cpus it can run on. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed. * * Returns 0 on success, else a negative status code (if no cpus were online). * * Selection preference: * 1) current cpu if in @mask * 2) any cpu of current node if in @mask * 3) any other online cpu in @mask */ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { unsigned int cpu; const struct cpumask *nodemask; int ret; /* Try for same CPU (cheapest) */ cpu = get_cpu(); if (cpumask_test_cpu(cpu, mask)) goto call; /* Try for same node. */ nodemask = cpumask_of_node(cpu_to_node(cpu)); for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; cpu = cpumask_next_and(cpu, nodemask, mask)) { if (cpu_online(cpu)) goto call; } /* Any online will do: smp_call_function_single handles nr_cpu_ids. */ cpu = cpumask_any_and(mask, cpu_online_mask); call: ret = smp_call_function_single(cpu, func, info, wait); put_cpu(); return ret; } EXPORT_SYMBOL_GPL(smp_call_function_any); /* * Flags to be used as scf_flags argument of smp_call_function_many_cond(). * * %SCF_WAIT: Wait until function execution is completed * %SCF_RUN_LOCAL: Run also locally if local cpu is set in cpumask */ #define SCF_WAIT (1U << 0) #define SCF_RUN_LOCAL (1U << 1) static void smp_call_function_many_cond(const struct cpumask *mask, smp_call_func_t func, void *info, unsigned int scf_flags, smp_cond_func_t cond_func) { int cpu, last_cpu, this_cpu = smp_processor_id(); struct call_function_data *cfd; bool wait = scf_flags & SCF_WAIT; int nr_cpus = 0; bool run_remote = false; bool run_local = false; lockdep_assert_preemption_disabled(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ if (cpu_online(this_cpu) && !oops_in_progress && !early_boot_irqs_disabled) lockdep_assert_irqs_enabled(); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); /* Check if we need local execution. */ if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && (!cond_func || cond_func(this_cpu, info))) run_local = true; /* Check if we need remote execution, i.e., any CPU excluding this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); if (cpu == this_cpu) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); if (cpu < nr_cpu_ids) run_remote = true; if (run_remote) { cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); __cpumask_clear_cpu(this_cpu, cfd->cpumask); cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); if (cond_func && !cond_func(cpu, info)) { __cpumask_clear_cpu(cpu, cfd->cpumask); continue; } csd_lock(csd); if (wait) csd->node.u_flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; last_cpu = cpu; } } /* * Choose the most efficient way to send an IPI. Note that the * number of CPUs might be zero due to concurrent changes to the * provided mask. */ if (nr_cpus == 1) send_call_function_single_ipi(last_cpu); else if (likely(nr_cpus > 1)) send_call_function_ipi_mask(cfd->cpumask_ipi); } if (run_local) { unsigned long flags; local_irq_save(flags); csd_do_func(func, info, NULL); local_irq_restore(flags); } if (run_remote && wait) { for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd; csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); } } } /** * smp_call_function_many(): Run a function on a set of CPUs. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait * (atomically) until function has completed on other CPUs. If * %SCF_RUN_LOCAL is set, the function will also be run locally * if the local CPU is set in the @cpumask. * * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. */ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL); } EXPORT_SYMBOL(smp_call_function_many); /** * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ void smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); } EXPORT_SYMBOL(smp_call_function); /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; EXPORT_SYMBOL(setup_max_cpus); /* * Setup routine for controlling SMP activation * * Command-line option of "nosmp" or "maxcpus=0" will disable SMP * activation entirely (the MPS table probe still happens, though). * * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer * greater than 0, limits the maximum number of CPUs activated in * SMP mode to <NUM>. */ void __weak __init arch_disable_smp_support(void) { } static int __init nosmp(char *str) { setup_max_cpus = 0; arch_disable_smp_support(); return 0; } early_param("nosmp", nosmp); /* this is hard limit */ static int __init nrcpus(char *str) { int nr_cpus; if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) set_nr_cpu_ids(nr_cpus); return 0; } early_param("nr_cpus", nrcpus); static int __init maxcpus(char *str) { get_option(&str, &setup_max_cpus); if (setup_max_cpus == 0) arch_disable_smp_support(); return 0; } early_param("maxcpus", maxcpus); #if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS) /* Setup number of possible processor ids */ unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); #endif /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ void __init setup_nr_cpu_ids(void) { set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1); } /* Called by boot processor to activate the rest. */ void __init smp_init(void) { int num_nodes, num_cpus; idle_threads_init(); cpuhp_threads_init(); pr_info("Bringing up secondary CPUs ...\n"); bringup_nonboot_cpus(setup_max_cpus); num_nodes = num_online_nodes(); num_cpus = num_online_cpus(); pr_info("Brought up %d node%s, %d CPU%s\n", num_nodes, str_plural(num_nodes), num_cpus, str_plural(num_cpus)); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } /* * on_each_cpu_cond(): Call a function on each processor for which * the supplied function cond_func returns true, optionally waiting * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and * the info parameter. The function is called * with preemption disabled. The function should * return a blooean value indicating whether to IPI * the specified CPU. * @func: The function to run on all applicable CPUs. * This must be fast and non-blocking. * @info: An arbitrary pointer to pass to both functions. * @wait: If true, wait (atomically) until function has * completed on other CPUs. * * Preemption is disabled to protect against CPUs going offline but not online. * CPUs going online during the call will not be seen or sent an IPI. * * You must not call this function with disabled interrupts or * from a hardware interrupt handler or from a bottom half handler. */ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask) { unsigned int scf_flags = SCF_RUN_LOCAL; if (wait) scf_flags |= SCF_WAIT; preempt_disable(); smp_call_function_many_cond(mask, func, info, scf_flags, cond_func); preempt_enable(); } EXPORT_SYMBOL(on_each_cpu_cond_mask); static void do_nothing(void *unused) { } /** * kick_all_cpus_sync - Force all cpus out of idle * * Used to synchronize the update of pm_idle function pointer. It's * called after the pointer is updated and returns after the dummy * callback function has been executed on all cpus. The execution of * the function can only happen on the remote cpus after they have * left the idle function which had been called via pm_idle function * pointer. So it's guaranteed that nothing uses the previous pointer * anymore. */ void kick_all_cpus_sync(void) { /* Make sure the change is visible before we kick the cpus */ smp_mb(); smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); /** * wake_up_all_idle_cpus - break all cpus out of idle * wake_up_all_idle_cpus try to break all cpus which is in idle state even * including idle polling cpus, for non-idle cpus, we will do nothing * for them. */ void wake_up_all_idle_cpus(void) { int cpu; for_each_possible_cpu(cpu) { preempt_disable(); if (cpu != smp_processor_id() && cpu_online(cpu)) wake_up_if_idle(cpu); preempt_enable(); } } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); /** * struct smp_call_on_cpu_struct - Call a function on a specific CPU * @work: &work_struct * @done: &completion to signal * @func: function to call * @data: function's data argument * @ret: return value from @func * @cpu: target CPU (%-1 for any CPU) * * Used to call a function on a specific cpu and wait for it to return. * Optionally make sure the call is done on a specified physical cpu via vcpu * pinning in order to support virtualized environments. */ struct smp_call_on_cpu_struct { struct work_struct work; struct completion done; int (*func)(void *); void *data; int ret; int cpu; }; static void smp_call_on_cpu_callback(struct work_struct *work) { struct smp_call_on_cpu_struct *sscs; sscs = container_of(work, struct smp_call_on_cpu_struct, work); if (sscs->cpu >= 0) hypervisor_pin_vcpu(sscs->cpu); sscs->ret = sscs->func(sscs->data); if (sscs->cpu >= 0) hypervisor_pin_vcpu(-1); complete(&sscs->done); } int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) { struct smp_call_on_cpu_struct sscs = { .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), .func = func, .data = par, .cpu = phys ? cpu : -1, }; INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; queue_work_on(cpu, system_wq, &sscs.work); wait_for_completion(&sscs.done); destroy_work_on_stack(&sscs.work); return sscs.ret; } EXPORT_SYMBOL_GPL(smp_call_on_cpu); |
| 196 196 187 7 2 186 187 194 187 187 187 184 176 263 400 402 4 1 4 4 402 255 400 42 402 400 41 402 401 152 175 176 1 187 186 187 186 187 187 187 187 187 187 183 184 11 5 4 4 184 184 184 184 4 184 183 4 4 170 168 11 163 163 168 168 1 165 165 165 165 7 164 165 7 28 7 7 7 7 7 7 7 163 165 165 14 168 7 7 4 4 263 2 175 10 168 7 266 266 265 1 3 184 171 171 4 167 168 11 7 1 1 8 165 3 7 991 989 866 15 187 13 165 1 184 31 166 184 181 3 185 165 5 165 187 187 8 187 187 165 32 32 32 1 1 3 3 32 7 1 8 8 4 8 6 4 2 263 263 263 263 263 263 263 238 238 2 238 238 238 20 171 152 1 152 23 171 170 4 4 4 20 5 23 24 5 3 20 21 2 3 20 20 23 21 2 2 19 19 21 150 1 29 160 1 154 10 1 177 162 4 3 5 17 2 22 21 3 24 2 7 19 22 21 3 2 22 2 7 19 163 162 2 19 162 19 161 147 29 28 172 19 153 4 171 167 4 171 169 4 2 2 6 3 4 186 2 6 15 177 173 29 173 187 187 187 8 187 187 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * Some corrections by tytso. */ /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname * lookup logic. */ /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. */ #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/wordpart.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/audit.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/fcntl.h> #include <linux/device_cgroup.h> #include <linux/fs_struct.h> #include <linux/posix_acl.h> #include <linux/hash.h> #include <linux/bitops.h> #include <linux/init_task.h> #include <linux/uaccess.h> #include "internal.h" #include "mount.h" /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) * were necessary because of omirr. The reason is that omirr needs * to know the _real_ pathname, not the user-supplied one, in case * of symlinks (and also when transname replacements occur). * * The new code replaces the old recursive symlink resolution with * an iterative one (in case of non-nested symlink chains). It does * this with calls to <fs>_follow_link(). * As a side effect, dir_namei(), _namei() and follow_link() are now * replaced with a single function lookup_dentry() that can handle all * the special cases of the former code. * * With the new dcache, the pathname is stored at each inode, at least as * long as the refcount of the inode is positive. As a side effect, the * size of the dcache depends on the inode cache and thus is dynamic. * * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink * resolution to correspond with current state of the code. * * Note that the symlink resolution is not *completely* iterative. * There is still a significant amount of tail- and mid- recursion in * the algorithm. Also, note that <fs>_readlink() is not used in * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink() * may return different results than <fs>_follow_link(). Many virtual * filesystems (including /proc) exhibit this behavior. */ /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL * and the name already exists in form of a symlink, try to create the new * name indicated by the symlink. The old code always complained that the * name already exists, due to not following the symlink even if its target * is nonexistent. The new semantics affects also mknod() and link() when * the name is a symlink pointing to a non-existent name. * * I don't know which semantics is the right one, since I have no access * to standards. But I found by trial that HP-UX 9.0 has the full "new" * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the * "old" one. Personally, I think the new semantics is much more logical. * Note that "ln old new" where "new" is a symlink pointing to a non-existing * file does succeed in both HP-UX and SunOs, but not in Solaris * and in the old Linux semantics. */ /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink * semantics. See the comments in "open_namei" and "do_link" below. * * [10-Sep-98 Alan Modra] Another symlink change. */ /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: * inside the path - always follow. * in the last component in creation/removal/renaming - never follow. * if LOOKUP_FOLLOW passed - follow. * if the pathname has trailing slashes - follow. * otherwise - don't follow. * (applied in that order). * * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT * restored for 2.4. This is the last surviving part of old 4.2BSD bug. * During the 2.4 we need to fix the userland stuff depending on it - * hopefully we will be able to get rid of that wart in 2.5. So far only * XEmacs seems to be relying on it... */ /* * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives * any extra contention... */ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. * * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) struct filename * getname_flags(const char __user *filename, int flags) { struct filename *result; char *kname; int len; result = audit_reusename(filename); if (result) return result; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); /* * First, try to embed the struct filename inside the names_cache * allocation */ kname = (char *)result->iname; result->name = kname; len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); /* * Handle both empty path and copy failure in one go. */ if (unlikely(len <= 0)) { if (unlikely(len < 0)) { __putname(result); return ERR_PTR(len); } /* The empty path is special. */ if (!(flags & LOOKUP_EMPTY)) { __putname(result); return ERR_PTR(-ENOENT); } } /* * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a * separate struct filename so we can dedicate the entire * names_cache allocation for the pathname, and re-do the copy from * userland. */ if (unlikely(len == EMBEDDED_NAME_MAX)) { const size_t size = offsetof(struct filename, iname[1]); kname = (char *)result; /* * size is chosen that way we to guarantee that * result->iname[0] is within the same object and that * kname can't be equal to result->iname, no matter what. */ result = kzalloc(size, GFP_KERNEL); if (unlikely(!result)) { __putname(kname); return ERR_PTR(-ENOMEM); } result->name = kname; len = strncpy_from_user(kname, filename, PATH_MAX); if (unlikely(len < 0)) { __putname(kname); kfree(result); return ERR_PTR(len); } /* The empty path is special. */ if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) { __putname(kname); kfree(result); return ERR_PTR(-ENOENT); } if (unlikely(len == PATH_MAX)) { __putname(kname); kfree(result); return ERR_PTR(-ENAMETOOLONG); } } atomic_set(&result->refcnt, 1); result->uptr = filename; result->aname = NULL; audit_getname(result); return result; } struct filename *getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; return getname_flags(filename, flags); } struct filename *getname(const char __user * filename) { return getname_flags(filename, 0); } struct filename *__getname_maybe_null(const char __user *pathname) { struct filename *name; char c; /* try to save on allocations; loss on um, though */ if (get_user(c, pathname)) return ERR_PTR(-EFAULT); if (!c) return NULL; name = getname_flags(pathname, LOOKUP_EMPTY); if (!IS_ERR(name) && !(name->name[0])) { putname(name); name = NULL; } return name; } struct filename *getname_kernel(const char * filename) { struct filename *result; int len = strlen(filename) + 1; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); if (len <= EMBEDDED_NAME_MAX) { result->name = (char *)result->iname; } else if (len <= PATH_MAX) { const size_t size = offsetof(struct filename, iname[1]); struct filename *tmp; tmp = kmalloc(size, GFP_KERNEL); if (unlikely(!tmp)) { __putname(result); return ERR_PTR(-ENOMEM); } tmp->name = (char *)result; result = tmp; } else { __putname(result); return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; atomic_set(&result->refcnt, 1); audit_getname(result); return result; } EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { if (IS_ERR_OR_NULL(name)) return; if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) return; if (!atomic_dec_and_test(&name->refcnt)) return; if (name->name != name->iname) { __putname(name->name); kfree(name); } else __putname(name); } EXPORT_SYMBOL(putname); /** * check_acl - perform ACL permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the ACL permission checking. Since this function * retrieve POSIX acls it needs to know whether it is called from a blocking or * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; if (mask & MAY_NOT_BLOCK) { acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(idmap, inode, acl, mask); } acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } #endif return -EAGAIN; } /* * Very quick optimistic "we know we have no ACL's" check. * * Note that this is purely for ACL_TYPE_ACCESS, and purely * for the "we have cached that there are no ACLs" case. * * If this returns true, we know there are no ACLs. But if * it returns false, we might still not have ACLs (it could * be the is_uncached_acl() case). */ static inline bool no_acl_inode(struct inode *inode) { #ifdef CONFIG_FS_POSIX_ACL return likely(!READ_ONCE(inode->i_acl)); #else return true; #endif } /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the basic UNIX permission checking. Since this * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; vfsuid_t vfsuid; /* * Common cheap case: everybody has the requested * rights, and there are no ACLs to check. No need * to do any owner/group checks in that case. * * - 'mask&7' is the requested permission bit set * - multiplying by 0111 spreads them out to all of ugo * - '& ~mode' looks for missing inode permission bits * - the '!' is for "no missing permissions" * * After that, we just need to check that there are no * ACL's on the inode - do the 'IS_POSIXACL()' check last * because it will dereference the ->i_sb pointer and we * want to avoid that if at all possible. */ if (!((mask & 7) * 0111 & ~mode)) { if (no_acl_inode(inode)) return 0; if (!IS_POSIXACL(inode)) return 0; } /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; } /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error; } /* Only RWX matters for group/other mode bits */ mask &= 7; /* * Are the group permissions different from * the other permissions in the bits we care * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } /* Bits in 'mode' clear that we require? */ return (mask & ~mode) ? -EACCES : 0; } /** * generic_permission - check for access rights on a Posix-like filesystem * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. * * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; /* * Do the basic permission checks. */ ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } /* * Searching includes executable on directories, else just read. */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; /* * Read/write DACs are always overridable. * Executable DACs are overridable when there is * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } EXPORT_SYMBOL(generic_permission); /** * do_inode_permission - UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * We _really_ want to just do "generic_permission()" without * even looking at the inode->i_op values. So we keep a cache * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ static inline int do_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } return generic_permission(idmap, inode, mask); } /** * sb_permission - Check superblock-level permissions * @sb: Superblock of inode to check permission on * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; } return 0; } /** * inode_permission - Check for access rights to a given inode * @idmap: idmap of the mount the inode was found from * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Check for read/write/execute permissions on an inode. We use fs[ug]id for * this, letting us set arbitrary permissions for filesystem access without * changing the "normal" UIDs which are used for other things. * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); if (retval) return retval; if (unlikely(mask & MAY_WRITE)) { /* * Nobody gets write access to an immutable file. */ if (IS_IMMUTABLE(inode)) return -EPERM; /* * Updating mtime will likely cause i_uid and i_gid to be * written back improperly if their true value is unknown * to the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) return -EACCES; } retval = do_inode_permission(idmap, inode, mask); if (retval) return retval; retval = devcgroup_inode_permission(inode, mask); if (retval) return retval; return security_inode_permission(inode, mask); } EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path * @path: path to get the reference to * * Given a path increment the reference count to the dentry and the vfsmount. */ void path_get(const struct path *path) { mntget(path->mnt); dget(path->dentry); } EXPORT_SYMBOL(path_get); /** * path_put - put a reference to a path * @path: path to put the reference to * * Given a path decrement the reference count to the dentry and the vfsmount. */ void path_put(const struct path *path) { dput(path->dentry); mntput(path->mnt); } EXPORT_SYMBOL(path_put); #define EMBEDDED_LEVELS 2 struct nameidata { struct path path; struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; struct saved { struct path link; struct delayed_call done; const char *name; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; const char *pathname; struct nameidata *saved; unsigned root_seq; int dfd; vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; #define ND_ROOT_PRESET 1 #define ND_ROOT_GRABBED 2 #define ND_JUMPED 4 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; p->stack = p->internal; p->depth = 0; p->dfd = dfd; p->name = name; p->pathname = likely(name) ? name->name : ""; p->path.mnt = NULL; p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; current->nameidata = p; } static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name, const struct path *root) { __set_nameidata(p, dfd, name); p->state = 0; if (unlikely(root)) { p->state = ND_ROOT_PRESET; p->root = *root; } } static void restore_nameidata(void) { struct nameidata *now = current->nameidata, *old = now->saved; current->nameidata = old; if (old) old->total_link_count = now->total_link_count; if (now->stack != now->internal) kfree(now->stack); } static bool nd_alloc_stack(struct nameidata *nd) { struct saved *p; p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); if (unlikely(!p)) return false; memcpy(p, nd->internal, sizeof(nd->internal)); nd->stack = p; return true; } /** * path_connected - Verify that a dentry is below mnt.mnt_root * @mnt: The mountpoint to check. * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. */ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { struct super_block *sb = mnt->mnt_sb; /* Bind mounts can have disconnected paths */ if (mnt->mnt_root == sb->s_root) return true; return is_subdir(dentry, mnt->mnt_root); } static void drop_links(struct nameidata *nd) { int i = nd->depth; while (i--) { struct saved *last = nd->stack + i; do_delayed_call(&last->done); clear_delayed_call(&last->done); } } static void leave_rcu(struct nameidata *nd) { nd->flags &= ~LOOKUP_RCU; nd->seq = nd->next_seq = 0; rcu_read_unlock(); } static void terminate_walk(struct nameidata *nd) { drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); nd->state &= ~ND_ROOT_GRABBED; } } else { leave_rcu(nd); } nd->depth = 0; nd->path.mnt = NULL; nd->path.dentry = NULL; } /* path_put is needed afterwards regardless of success or failure */ static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq) { int res = __legitimize_mnt(path->mnt, mseq); if (unlikely(res)) { if (res > 0) path->mnt = NULL; path->dentry = NULL; return false; } if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { path->dentry = NULL; return false; } return !read_seqcount_retry(&path->dentry->d_seq, seq); } static inline bool legitimize_path(struct nameidata *nd, struct path *path, unsigned seq) { return __legitimize_path(path, seq, nd->m_seq); } static bool legitimize_links(struct nameidata *nd) { int i; if (unlikely(nd->flags & LOOKUP_CACHED)) { drop_links(nd); nd->depth = 0; return false; } for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { drop_links(nd); nd->depth = i + 1; return false; } } return true; } static bool legitimize_root(struct nameidata *nd) { /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller * to restart the path walk from the beginning in ref-walk mode. */ /** * try_to_unlazy - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * Returns: true on success, false on failure * * try_to_unlazy attempts to legitimize the current nd->path and nd->root * for ref-walk mode. * Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy() failure and * terminate_walk(). */ static bool try_to_unlazy(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out; leave_rcu(nd); BUG_ON(nd->inode != parent->d_inode); return true; out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: leave_rcu(nd); return false; } /** * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already * picked by rcu-walk and want to legitimize that in addition to the current * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { if (res > 0) goto out2; goto out1; } if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; /* * We need to move both the parent and the dentry from the RCU domain * to be properly refcounted. And the sequence number in the dentry * validates *both* dentry counters, since we checked the sequence * number of the parent after we got the child sequence number. So we * know the parent must still be valid if the child sequence number is */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. */ if (unlikely(!legitimize_root(nd))) goto out_dput; leave_rcu(nd); return true; out2: nd->path.mnt = NULL; out1: nd->path.dentry = NULL; out: leave_rcu(nd); return false; out_dput: leave_rcu(nd); dput(dentry); return false; } static inline int d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) return dentry->d_op->d_revalidate(dir, name, dentry, flags); else return 1; } /** * complete_walk - successful completion of path walk * @nd: pointer nameidata * * If we had been in RCU mode, drop out of it and legitimize nd->path. * Revalidate the final result, unless we'd already done that during * the path walk or the filesystem doesn't ask for it. Return 0 on * success, -error on failure. In case of failure caller does not * need to drop nd->path. */ static int complete_walk(struct nameidata *nd) { struct dentry *dentry = nd->path.dentry; int status; if (nd->flags & LOOKUP_RCU) { /* * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ if (!(nd->state & ND_ROOT_PRESET)) if (!(nd->flags & LOOKUP_IS_SCOPED)) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) return -ECHILD; } if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't * ever step outside the root during lookup" and should already * be guaranteed by the rest of namei, we want to avoid a namei * BUG resulting in userspace being given a path that was not * scoped within the root at some point during the lookup. * * So, do a final sanity-check to make sure that in the * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED) * we won't silently return an fd completely outside of the * requested root to userspace. * * Userspace could move the path outside the root after this * check, but as discussed elsewhere this is not a concern (the * resolved file was inside the root at some point). */ if (!path_is_under(&nd->path, &nd->root)) return -EXDEV; } if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) return 0; status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); if (status > 0) return 0; if (!status) status = -ESTALE; return status; } static int set_root(struct nameidata *nd) { struct fs_struct *fs = current->fs; /* * Jumping to the real root in a scoped-lookup is a BUG in namei, but we * still have to ensure it doesn't happen because it will cause a breakout * from the dirfd. */ if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED)) return -ENOTRECOVERABLE; if (nd->flags & LOOKUP_RCU) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); nd->state |= ND_ROOT_GRABBED; } return 0; } static int nd_jump_root(struct nameidata *nd) { if (unlikely(nd->flags & LOOKUP_BENEATH)) return -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { /* Absolute path arguments to path_init() are allowed. */ if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt) return -EXDEV; } if (!nd->root.mnt) { int error = set_root(nd); if (error) return error; } if (nd->flags & LOOKUP_RCU) { struct dentry *d; nd->path = nd->root; d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD; } else { path_put(&nd->path); nd->path = nd->root; path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } nd->state |= ND_JUMPED; return 0; } /* * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ int nd_jump_link(const struct path *path) { int error = -ELOOP; struct nameidata *nd = current->nameidata; if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS)) goto err; error = -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { if (nd->path.mnt != path->mnt) goto err; } /* Not currently safe for scoped-lookups. */ if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) goto err; path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->state |= ND_JUMPED; return 0; err: path_put(path); return error; } static inline void put_link(struct nameidata *nd) { struct saved *last = nd->stack + --nd->depth; do_delayed_call(&last->done); if (!(nd->flags & LOOKUP_RCU)) path_put(&last->link); } static int sysctl_protected_symlinks __read_mostly; static int sysctl_protected_hardlinks __read_mostly; static int sysctl_protected_fifos __read_mostly; static int sysctl_protected_regular __read_mostly; #ifdef CONFIG_SYSCTL static const struct ctl_table namei_sysctls[] = { { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_namei_sysctls(void) { register_sysctl_init("fs", namei_sysctls); return 0; } fs_initcall(init_fs_namei_sysctls); #endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is * in a sticky world-writable directory. This is to protect privileged * processes from failing races against path names that may change out * from under them by way of other users creating malicious symlinks. * It will permit symlinks to be followed only when outside a sticky * world-writable directory, or when the uid of the symlink and follower * match, or when the directory owner matches the symlink's owner. * * Returns 0 if following the symlink is allowed, -ve on error. */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { struct mnt_idmap *idmap; vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; idmap = mnt_idmap(nd->path.mnt); vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) return -ECHILD; audit_inode(nd->name, nd->stack[0].link.dentry, 0); audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link"); return -EACCES; } /** * safe_hardlink_source - Check for safe hardlink conditions * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: * - inode is not a regular file * - inode is setuid * - inode is setgid and group-exec * - access failure for read and write * * Otherwise returns true. */ static bool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; /* Special files should not get pinned to the filesystem. */ if (!S_ISREG(mode)) return false; /* Setuid files should not get pinned to the filesystem. */ if (mode & S_ISUID) return false; /* Executable setgid files should not get pinned to the filesystem. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; } /** * may_linkat - Check permissions for creating a hardlink * @idmap: idmap of the mount the inode was found from * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ int may_linkat(struct mnt_idmap *idmap, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) return 0; /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ if (safe_hardlink_source(idmap, inode) || inode_owner_or_capable(idmap, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); return -EPERM; } /** * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. * @idmap: idmap of the mount the inode was found from * @nd: nameidata pathwalk data * @inode: the inode of the file to open * * Block an O_CREAT open of a FIFO (or a regular file) when: * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled * - the file already exists * - we are in a sticky directory * - we don't own the file * - the owner of the directory doesn't own the file * - the directory is world writable * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 * the directory doesn't have to be world writable: being group writable will * be enough. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if the open is allowed, -ve on error. */ static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid; if (likely(!(dir_mode & S_ISVTX))) return 0; if (S_ISREG(inode->i_mode) && !sysctl_protected_regular) return 0; if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) return 0; i_vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq(i_vfsuid, dir_vfsuid)) return 0; if (vfsuid_eq_kuid(i_vfsuid, current_fsuid())) return 0; if (likely(dir_mode & 0002)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create"); return -EACCES; } if (dir_mode & 0020) { if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_fifo"); return -EACCES; } if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_regular"); return -EACCES; } } return 0; } /* * follow_up - Find the mountpoint of path's vfsmount * * Given a path, find the mountpoint of its source file system. * Replace @path with the path of the mountpoint in the parent mount. * Up is towards /. * * Return 1 if we went up a level and 0 if we were already at the * root. */ int follow_up(struct path *path) { struct mount *mnt = real_mount(path->mnt); struct mount *parent; struct dentry *mountpoint; read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } EXPORT_SYMBOL(follow_up); static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, struct path *path, unsigned *seqp) { while (mnt_has_parent(m)) { struct dentry *mountpoint = m->mnt_mountpoint; m = m->mnt_parent; if (unlikely(root->dentry == mountpoint && root->mnt == &m->mnt)) break; if (mountpoint != m->mnt.mnt_root) { path->mnt = &m->mnt; path->dentry = mountpoint; *seqp = read_seqcount_begin(&mountpoint->d_seq); return true; } } return false; } static bool choose_mountpoint(struct mount *m, const struct path *root, struct path *path) { bool found; rcu_read_lock(); while (1) { unsigned seq, mseq = read_seqbegin(&mount_lock); found = choose_mountpoint_rcu(m, root, path, &seq); if (unlikely(!found)) { if (!read_seqretry(&mount_lock, mseq)) break; } else { if (likely(__legitimize_path(path, seq, mseq))) break; rcu_read_unlock(); path_put(path); rcu_read_lock(); } } rcu_read_unlock(); return found; } /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ static int follow_automount(struct path *path, int *count, unsigned lookup_flags) { struct dentry *dentry = path->dentry; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to * the name. * * We do, however, want to mount if someone wants to open or * create a file of any type under the mountpoint, wants to * traverse through the mountpoint or wants to open the * mounted directory. Also, autofs may mark negative dentries * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && dentry->d_inode) return -EISDIR; if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; return finish_automount(dentry->d_op->d_automount(path), path); } /* * mount traversal - out-of-line part. One note on ->d_flags accesses - * dentries are pinned but not locked here, so negative dentry can go * positive right under us. Use of smp_load_acquire() provides a barrier * sufficient for ->d_inode and ->d_flags consistency. */ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, int *count, unsigned lookup_flags) { struct vfsmount *mnt = path->mnt; bool need_mntput = false; int ret = 0; while (flags & DCACHE_MANAGED_DENTRY) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path); if (mounted) { // ... in our namespace dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; continue; } } if (!(flags & DCACHE_NEED_AUTOMOUNT)) break; // uncovered automount point ret = follow_automount(path, count, lookup_flags); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (ret == -EISDIR) ret = 0; // possible if you race with several mount --move if (need_mntput && path->mnt == mnt) mntput(path->mnt); if (!ret && unlikely(d_flags_negative(flags))) ret = -ENOENT; *jumped = need_mntput; return ret; } static inline int traverse_mounts(struct path *path, bool *jumped, int *count, unsigned lookup_flags) { unsigned flags = smp_load_acquire(&path->dentry->d_flags); /* fastpath */ if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { *jumped = false; if (unlikely(d_flags_negative(flags))) return -ENOENT; return 0; } return __traverse_mounts(path, flags, jumped, count, lookup_flags); } int follow_down_one(struct path *path) { struct vfsmount *mounted; mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); return 1; } return 0; } EXPORT_SYMBOL(follow_down_one); /* * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. */ int follow_down(struct path *path, unsigned int flags) { struct vfsmount *mnt = path->mnt; bool jumped; int ret = traverse_mounts(path, &jumped, NULL, flags); if (path->mnt != mnt) mntput(mnt); return ret; } EXPORT_SYMBOL(follow_down); /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; if (likely(!(flags & DCACHE_MANAGED_DENTRY))) return true; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return false; for (;;) { /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { int res = dentry->d_op->d_manage(path, true); if (res) return res == -EISDIR; flags = dentry->d_flags; } if (flags & DCACHE_MOUNTED) { struct mount *mounted = __lookup_mnt(path->mnt, dentry); if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; nd->next_seq = read_seqcount_begin(&dentry->d_seq); flags = dentry->d_flags; // makes sure that non-RCU pathwalk could reach // this state. if (read_seqretry(&mount_lock, nd->m_seq)) return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) return false; } return !(flags & DCACHE_NEED_AUTOMOUNT); } } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, struct path *path) { bool jumped; int ret; path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; nd->next_seq = seq; if (!try_to_unlazy_next(nd, dentry)) return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { if (unlikely(nd->flags & LOOKUP_NO_XDEV)) ret = -EXDEV; else nd->state |= ND_JUMPED; } if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); } return ret; } /* * This looks up the name in dcache and possibly revalidates the found dentry. * NULL is returned if the dentry does not exist in the cache. */ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry = d_lookup(dir, name); if (dentry) { int error = d_revalidate(dir->d_inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); dput(dentry); return ERR_PTR(error); } } return dentry; } /* * Parent directory has inode locked exclusive. This is one * and only case when ->lookup() gets called on non in-lookup * dentries - as the matter of fact, this only gets called * when directory is guaranteed to have no in-lookup children * at all. */ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags) { struct dentry *dentry = lookup_dcache(name, base, flags); struct dentry *old; struct inode *dir = base->d_inode; if (dentry) return dentry; /* Don't create child dentry for a dead directory. */ if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); dentry = d_alloc(base, name); if (unlikely(!dentry)) return ERR_PTR(-ENOMEM); old = dir->i_op->lookup(dir, dentry, flags); if (unlikely(old)) { dput(dentry); dentry = old; } return dentry; } EXPORT_SYMBOL(lookup_one_qstr_excl); /** * lookup_fast - do fast lockless (but racy) lookup of a dentry * @nd: current nameidata * * Do a fast, but racy lookup in the dcache for the given dentry, and * revalidate it. Returns a valid dentry pointer or NULL if one wasn't * found. On error, an ERR_PTR will be returned. * * If this function returns a valid dentry and the walk is no longer * lazy, the dentry will carry a reference that must later be put. If * RCU mode is still in force, then this is not the case and the dentry * must be legitimized before use. If this returns NULL, then the walk * will no longer be in RCU mode. */ static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, the caller is * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); return NULL; } /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. */ if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); if (likely(status > 0)) return dentry; if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return NULL; status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry); return ERR_PTR(status); } return dentry; } /* Fast lookup failed, do it the slow way */ static struct dentry *__lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) return ERR_PTR(-ENOENT); again: dentry = d_alloc_parallel(dir, name, &wq); if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { int error = d_revalidate(inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); dput(dentry); goto again; } dput(dentry); dentry = ERR_PTR(error); } } else { old = inode->i_op->lookup(inode, dentry, flags); d_lookup_done(dentry); if (unlikely(old)) { dput(dentry); dentry = old; } } return dentry; } static struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct inode *inode = dir->d_inode; struct dentry *res; inode_lock_shared(inode); res = __lookup_slow(name, dir, flags); inode_unlock_shared(inode); return res; } static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *restrict nd) { int err, mask; mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); if (likely(!err)) return 0; // If we failed, and we weren't in LOOKUP_RCU, it's final if (!(nd->flags & LOOKUP_RCU)) return err; // Drop out of RCU mode to make sure it wasn't transient if (!try_to_unlazy(nd)) return -ECHILD; // redo it all non-lazy if (err != -ECHILD) // hard error return err; return inode_permission(idmap, nd->inode, MAY_EXEC); } static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; if (likely(nd->depth != EMBEDDED_LEVELS)) return 0; if (likely(nd->stack != nd->internal)) return 0; if (likely(nd_alloc_stack(nd))) return 0; if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. And we can't skip // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, nd->next_seq); if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) return 0; } return -ENOMEM; } enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, int flags) { struct saved *last; const char *res; int error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); return ERR_PTR(error); } last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); if (unlikely(error)) return ERR_PTR(error); } if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); } else if (atime_needs_update(&last->link, inode)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); touch_atime(&last->link); } error = security_inode_follow_link(link->dentry, inode, nd->flags & LOOKUP_RCU); if (unlikely(error)) return ERR_PTR(error); res = READ_ONCE(inode->i_link); if (!res) { const char * (*get)(struct dentry *, struct inode *, struct delayed_call *); get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd)) res = get(link->dentry, inode, &last->done); } else { res = get(link->dentry, inode, &last->done); } if (!res) goto all_done; if (IS_ERR(res)) return res; } if (*res == '/') { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); while (unlikely(*++res == '/')) ; } if (*res) return res; all_done: // pure jump put_link(nd); return NULL; } /* * Do we need to follow links? We _really_ want to be able * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. * * NOTE: dentry must be what nd->next_seq had been sampled from. */ static const char *step_into(struct nameidata *nd, int flags, struct dentry *dentry) { struct path path; struct inode *inode; int err = handle_mounts(nd, dentry, &path); if (err < 0) return ERR_PTR(err); inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ if (nd->flags & LOOKUP_RCU) { if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); if (unlikely(!inode)) return ERR_PTR(-ENOENT); } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt); } nd->path = path; nd->inode = inode; nd->seq = nd->next_seq; return NULL; } if (nd->flags & LOOKUP_RCU) { /* make sure that d_is_symlink above matches inode */ if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { if (path.mnt == nd->path.mnt) mntget(path.mnt); } return pick_link(nd, &path, inode, flags); } static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; unsigned seq; if (!choose_mountpoint_rcu(real_mount(nd->path.mnt), &nd->root, &path, &seq)) goto in_root; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-ECHILD); nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; // makes sure that non-RCU pathwalk could reach this state if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; nd->next_seq = read_seqcount_begin(&parent->d_seq); // makes sure that non-RCU pathwalk could reach this state if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent; in_root: if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); nd->next_seq = nd->seq; return nd->path.dentry; } static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; if (!choose_mountpoint(real_mount(nd->path.mnt), &nd->root, &path)) goto in_root; path_put(&nd->path); nd->path = path; nd->inode = path.dentry->d_inode; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-EXDEV); } /* rare case of legitimate dget_parent()... */ parent = dget_parent(nd->path.dentry); if (unlikely(!path_connected(nd->path.mnt, parent))) { dput(parent); return ERR_PTR(-ENOENT); } return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); if (error) return error; } if (nd->flags & LOOKUP_RCU) parent = follow_dotdot_rcu(nd); else parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * If there was a racing rename or mount along our * path, then we can't be sure that ".." hasn't jumped * above nd->root (and so userspace should retry or use * some fallback). */ smp_rmb(); if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)) return ERR_PTR(-EAGAIN); if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)) return ERR_PTR(-EAGAIN); } } return NULL; } static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); if (IS_ERR(dentry)) return ERR_CAST(dentry); } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return step_into(nd, flags, dentry); } /* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * * - Architectures with fast unaligned word accesses. We could * do a "get_unaligned()" if this helps and is sufficiently * fast. * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. * * - Furthermore, we need an efficient 64-bit compile for the * 64-bit case in order to generate the "number of bytes in * the final mask". Again, that could be replaced with a * efficient population count instruction or similar. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> #ifdef HASH_MIX /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */ #elif defined(CONFIG_64BIT) /* * Register pressure in the mixing function is an issue, particularly * on 32-bit x86, but almost any function requires one state value and * one temporary. Instead, use a function designed for two state values * and no temporaries. * * This function cannot create a collision in only two iterations, so * we have two iterations to achieve avalanche. In those two iterations, * we have six layers of mixing, which is enough to spread one bit's * influence out to 2^6 = 64 state bits. * * Rotate constants are scored by considering either 64 one-bit input * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the * probability of that delta causing a change to each of the 128 output * bits, using a sample of random initial states. * * The Shannon entropy of the computed probabilities is then summed * to produce a score. Ideally, any input change has a 50% chance of * toggling any given output bit. * * Mixing scores (in bits) for (12,45): * Input delta: 1-bit 2-bit * 1 round: 713.3 42542.6 * 2 rounds: 2753.7 140389.8 * 3 rounds: 5954.1 233458.2 * 4 rounds: 7862.6 256672.2 * Perfect: 8192 258048 * (64*128) (64*63/2 * 128) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol64(x,12),\ x += y, y = rol64(y,45),\ y *= 9 ) /* * Fold two longs into one 32-bit hash value. This must be fast, but * latency isn't quite as critical, as there is a fair bit of additional * work done before the hash value is used. */ static inline unsigned int fold_hash(unsigned long x, unsigned long y) { y ^= x * GOLDEN_RATIO_64; y *= GOLDEN_RATIO_64; return y >> 32; } #else /* 32-bit case */ /* * Mixing scores (in bits) for (7,20): * Input delta: 1-bit 2-bit * 1 round: 330.3 9201.6 * 2 rounds: 1246.4 25475.4 * 3 rounds: 1907.1 31295.1 * 4 rounds: 2042.3 31718.6 * Perfect: 2048 31744 * (32*64) (32*31/2 * 64) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol32(x, 7),\ x += y, y = rol32(y,20),\ y *= 9 ) static inline unsigned int fold_hash(unsigned long x, unsigned long y) { /* Use arch-optimized multiply if one exists */ return __hash_32(y ^ __hash_32(x)); } #endif /* * Return the hash of a string of known length. This is carfully * designed to match hash_name(), which is the more critical function. * In particular, we must end by hashing a final word containing 0..7 * payload bytes, to match the way that hash_name() iterates until it * finds the delimiter after the name. */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long a, x = 0, y = (unsigned long)salt; for (;;) { if (!len) goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; HASH_MIX(x, y, a); name += sizeof(unsigned long); len -= sizeof(unsigned long); } x ^= a & bytemask_from_count(len); done: return fold_hash(x, y); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long a = 0, x = 0, y = (unsigned long)salt; unsigned long adata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; len = 0; goto inside; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); inside: a = load_unaligned_zeropad(name+len); } while (!has_zero(a, &adata, &constants)); adata = prep_zero_mask(a, adata, &constants); mask = create_zero_mask(adata); x ^= a & zero_bytemask(mask); return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } EXPORT_SYMBOL(hashlen_string); /* * Calculate the length and hash of the path component, and * return the length as the result. */ static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long a, b, x, y = (unsigned long)nd->path.dentry; unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; /* * The first iteration is special, because it can result in * '.' and '..' and has no mixing other than the final fold. */ a = load_unaligned_zeropad(name); b = a ^ REPEAT_BYTE('/'); if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) { adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); a &= zero_bytemask(mask); *lastword = a; len = find_zero(mask); nd->last.hash = fold_hash(a, y); nd->last.len = len; return name + len; } len = 0; x = 0; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); a &= zero_bytemask(mask); x ^= a; len += find_zero(mask); *lastword = 0; // Multi-word components cannot be DOT or DOTDOT nd->last.hash = fold_hash(x, y); nd->last.len = len; return name + len; } /* * Note that the 'last' word is always zero-masked, but * was loaded as a possibly big-endian word. */ #ifdef __BIG_ENDIAN #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8)) #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16)) #endif #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long hash = init_name_hash(salt); while (len--) hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; while (c) { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } return hashlen_create(end_name_hash(hash), len); } EXPORT_SYMBOL(hashlen_string); /* * We know there's a real path component here of at least * one character. */ static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long hash = init_name_hash(nd->path.dentry); unsigned long len = 0, c, last = 0; c = (unsigned char)*name; do { last = (last << 8) + c; len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); // This is reliable for DOT or DOTDOT, since the component // cannot contain NUL characters - top bits being zero means // we cannot have had any other pathnames. *lastword = last; nd->last.hash = end_name_hash(hash); nd->last.len = len; return name + len; } #endif #ifndef LAST_WORD_IS_DOT #define LAST_WORD_IS_DOT 0x2e #define LAST_WORD_IS_DOTDOT 0x2e2e #endif /* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. * * Returns 0 and nd will have valid dentry and mnt on success. * Returns error and drops reference to input namei data on failure. */ static int link_path_walk(const char *name, struct nameidata *nd) { int depth = 0; // depth <= nd->depth int err; nd->last_type = LAST_ROOT; nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); while (*name=='/') name++; if (!*name) { nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0; } /* At this point we know we have a real path component. */ for(;;) { struct mnt_idmap *idmap; const char *link; unsigned long lastword; idmap = mnt_idmap(nd->path.mnt); err = may_lookup(idmap, nd); if (err) return err; nd->last.name = name; name = hash_name(nd, name, &lastword); switch(lastword) { case LAST_WORD_IS_DOTDOT: nd->last_type = LAST_DOTDOT; nd->state |= ND_JUMPED; break; case LAST_WORD_IS_DOT: nd->last_type = LAST_DOT; break; default: nd->last_type = LAST_NORM; nd->state &= ~ND_JUMPED; struct dentry *parent = nd->path.dentry; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { err = parent->d_op->d_hash(parent, &nd->last); if (err < 0) return err; } } if (!*name) goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { name++; } while (unlikely(*name == '/')); if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ if (!depth) { nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; } /* last component of nested symlink */ name = nd->stack[--depth].name; link = walk_component(nd, 0); } else { /* not the last component */ link = walk_component(nd, WALK_MORE); } if (unlikely(link)) { if (IS_ERR(link)) return PTR_ERR(link); /* a symlink to follow */ nd->stack[depth++].name = name; name = link; continue; } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return -ECHILD; } return -ENOTDIR; } } } /* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { int error; const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) return ERR_PTR(-EAGAIN); if (!*s) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); else nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); if (nd->state & ND_ROOT_PRESET) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq; } else { path_get(&nd->path); } return s; } nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); return s; } /* Relative pathname -- get the starting-point it is relative to. */ if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; } } else { /* Caller must check execute permissions on the starting path component */ CLASS(fd_raw, f)(nd->dfd); struct dentry *dentry; if (fd_empty(f)) return ERR_PTR(-EBADF); if (flags & LOOKUP_LINKAT_EMPTY) { if (fd_file(f)->f_cred != current_cred() && !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) return ERR_PTR(-ENOENT); } dentry = fd_file(f)->f_path.dentry; if (*s && unlikely(!d_can_lookup(dentry))) return ERR_PTR(-ENOTDIR); nd->path = fd_file(f)->f_path; if (flags & LOOKUP_RCU) { nd->inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } } /* For scoped-lookups we need to set the root to the dirfd as well. */ if (flags & LOOKUP_IS_SCOPED) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; } else { path_get(&nd->root); nd->state |= ND_ROOT_GRABBED; } } return s; } static inline const char *lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; return walk_component(nd, WALK_TRAILING); } static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); nd->next_seq = nd->seq; return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); int err; if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) { err = handle_lookup_down(nd); if (unlikely(err < 0)) s = ERR_PTR(err); } while (!(err = link_path_walk(s, nd)) && (s = lookup_last(nd)) != NULL) ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; if (!err) { *path = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) audit_inode(name, path->dentry, flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); return retval; } /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { const char *s = path_init(nd, flags); int err = link_path_walk(s, nd); if (!err) err = complete_walk(nd); if (!err) { *parent = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } /* Note: this does not consume "name" */ static int __filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); if (unlikely(retval == -ESTALE)) retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); if (likely(!retval)) { *last = nd.last; *type = nd.last_type; audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); } restore_nameidata(); return retval; } static int filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type) { return __filename_parentat(dfd, name, flags, parent, last, type, NULL); } /* does lookup, returns the object with parent locked */ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) { struct dentry *d; struct qstr last; int type, error; error = filename_parentat(dfd, name, 0, path, &last, &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) { path_put(path); return ERR_PTR(-EINVAL); } inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, path->dentry, 0); if (IS_ERR(d)) { inode_unlock(path->dentry->d_inode); path_put(path); } return d; } struct dentry *kern_path_locked(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); putname(filename); return res; } struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) { struct filename *filename = getname(name); struct dentry *res = __kern_path_locked(dfd, filename, path); putname(filename); return res; } EXPORT_SYMBOL(user_path_locked_at); int kern_path(const char *name, unsigned int flags, struct path *path) { struct filename *filename = getname_kernel(name); int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(kern_path); /** * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair * @filename: filename structure * @flags: lookup flags * @parent: pointer to struct path to fill * @last: last component * @type: type of the last component * @root: pointer to struct path of the base directory */ int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { return __filename_parentat(AT_FDCWD, filename, flags, parent, last, type, root); } EXPORT_SYMBOL(vfs_path_parent_lookup); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory * @mnt: pointer to vfs mount of the base directory * @name: pointer to file name * @flags: lookup flags * @path: pointer to struct path to fill */ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct path *path) { struct filename *filename; struct path root = {.mnt = mnt, .dentry = dentry}; int ret; filename = getname_kernel(name); /* the first argument of filename_lookup() is ignored with root */ ret = filename_lookup(AT_FDCWD, filename, flags, path, &root); putname(filename); return ret; } EXPORT_SYMBOL(vfs_path_lookup); static int lookup_one_common(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len, struct qstr *this) { this->name = name; this->len = len; this->hash = full_name_hash(base, name, len); if (!len) return -EACCES; if (is_dot_dotdot(name, len)) return -EACCES; while (len--) { unsigned int c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return -EACCES; } /* * See if the low-level filesystem might want * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { int err = base->d_op->d_hash(base, this); if (err < 0) return err; } return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** * try_lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Look up a dentry by name in the dcache, returning NULL if it does not * currently exist. The function does not try to create a dentry. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); return lookup_dcache(&this, base, 0); } EXPORT_SYMBOL(try_lookup_one_len); /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one_len); /** * lookup_one - filesystem helper to lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one); /** * lookup_one_unlocked - filesystem helper to lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct qstr this; int err; struct dentry *ret; err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); ret = lookup_dcache(&this, base, 0); if (!ret) ret = lookup_slow(&this, base, 0); return ret; } EXPORT_SYMBOL(lookup_one_unlocked); /** * lookup_one_positive_unlocked - filesystem helper to lookup single * pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns * known positive or ERR_PTR(). This is what most of the users want. * * Note that pinned negative with unlocked parent _can_ become positive at any * time, so callers of lookup_one_unlocked() need to be very careful; pinned * positives have >d_inode stable, so this one avoids such problems. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The helper should be called without i_mutex held. */ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *ret = lookup_one_unlocked(idmap, name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } EXPORT_SYMBOL(lookup_one_positive_unlocked); /** * lookup_one_len_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { return lookup_one_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_one_len_unlocked); /* * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT) * on negatives. Returns known positive or ERR_PTR(); that's what * most of the users want. Note that pinned negative with unlocked parent * _can_ become positive at any time, so callers of lookup_one_len_unlocked() * need to be very careful; pinned positives have ->d_inode stable, so * this one avoids such problems. */ struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_positive_unlocked); #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { /* Find something mounted on "pts" in the same directory as * the input path. */ struct dentry *parent = dget_parent(path->dentry); struct dentry *child; struct qstr this = QSTR_INIT("pts", 3); if (unlikely(!path_connected(path->mnt, parent))) { dput(parent); return -ENOENT; } dput(path->dentry); path->dentry = parent; child = d_hash_and_lookup(parent, &this); if (IS_ERR_OR_NULL(child)) return -ENOENT; path->dentry = child; dput(parent); follow_down(path, 0); return 0; } #endif int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { struct filename *filename = getname_flags(name, flags); int ret = filename_lookup(dfd, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(user_path_at); int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) return 0; if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) return 0; return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. * 1. We can't do it if dir is read-only (done in permission()) * 2. We should have write and exec permissions on dir * 3. We can't remove anything from append-only dir * 4. We can't do anything with immutable dir (done in permission()) * 5. If the sticky bit on dir is set we should either * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do antyhing with * links pointing to it. * 7. If the victim has an unknown uid or gid we can't change the inode. * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. * 10. We can't remove a root or mountpoint. * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); int error; if (d_is_negative(victim)) return -ENOENT; BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; if (victim->d_flags & DCACHE_NFSFS_RENAMED) return -EBUSY; return 0; } /* Check whether we can create an object with dentry child in directory * dir. * 1. We can't do it if child already exists (open has special treatment for * this case, but since we are inlined it's OK) * 2. We can't do it if dir is read-only (done in permission()) * 3. We can't do it if the fs can't represent the fsuid or fsgid. * 4. We should have write and exec permissions on dir * 5. We can't do it if dir is immutable (done in permission()) */ static inline int may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } // p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) { struct dentry *p = p1, *q = p2, *r; while ((r = p->d_parent) != p2 && r != p) p = r; if (r == p2) { // p is a child of p2 and an ancestor of p1 or p1 itself inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2); return p; } // p is the root of connected component that contains p1 // p2 does not occur on the path from p to p1 while ((r = q->d_parent) != p1 && r != p && r != q) q = r; if (r == p1) { // q is a child of p1 and an ancestor of p2 or p2 itself inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return q; } else if (likely(r == p)) { // both p2 and p1 are descendents of p inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } else { // no common ancestor at the time we'd been called mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); return ERR_PTR(-EXDEV); } } /* * p1 and p2 should be directories on the same fs. */ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) { if (p1 == p2) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } mutex_lock(&p1->d_sb->s_vfs_rename_mutex); return lock_two_directories(p1, p2); } EXPORT_SYMBOL(lock_rename); /* * c1 and p2 should be on the same fs. */ struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) { if (READ_ONCE(c1->d_parent) == p2) { /* * hopefully won't need to touch ->s_vfs_rename_mutex at all. */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); /* * now that p2 is locked, nobody can move in or out of it, * so the test below is safe. */ if (likely(c1->d_parent == p2)) return NULL; /* * c1 got moved out of p2 while we'd been taking locks; * unlock and fall back to slow case. */ inode_unlock(p2->d_inode); } mutex_lock(&c1->d_sb->s_vfs_rename_mutex); /* * nobody can move out of any directories on this fs. */ if (likely(c1->d_parent != p2)) return lock_two_directories(c1->d_parent, p2); /* * c1 got moved into p2 while we were taking locks; * we need p2 locked and ->s_vfs_rename_mutex unlocked, * for consistency with lock_rename(). */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); return NULL; } EXPORT_SYMBOL(lock_rename_child); void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); if (p1 != p2) { inode_unlock(p2->d_inode); mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); } } EXPORT_SYMBOL(unlock_rename); /** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode * @mode: mode of the new inode * @mask_perms: allowed permission by the vfs * @type: type of file to be created * * This helper consolidates and enforces vfs restrictions on the @mode of a new * object to be created. * * Umask stripping depends on whether the filesystem supports POSIX ACLs (see * the kernel documentation for mode_strip_umask()). Moving umask stripping * after setgid stripping allows the same ordering for both non-POSIX ACL and * POSIX ACL supporting filesystems. * * Note that it's currently valid for @type to be 0 if a directory is created. * Filesystems raise that flag individually and we need to check whether each * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a * non-zero type. * * Returns: mode to be passed to the filesystem */ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode, umode_t mask_perms, umode_t type) { mode = mode_strip_sgid(idmap, dir, mode); mode = mode_strip_umask(dir, mode); /* * Apply the vfs mandated allowed permission mask and set the type of * file to be created before we call into the filesystem. */ mode &= (mask_perms & ~S_IFMT); mode |= (type & S_IFMT); return mode; } /** * vfs_create - create new file * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child file * @mode: mode of the child file * @want_excl: whether the file must not yet exist * * Create a new file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_create); int vfs_mkobj(struct dentry *dentry, umode_t mode, int (*f)(struct dentry *, umode_t, void *), void *arg) { struct inode *dir = dentry->d_parent->d_inode; int error = may_create(&nop_mnt_idmap, dir, dentry); if (error) return error; mode &= S_IALLUGO; mode |= S_IFREG; error = security_inode_create(dir, dentry, mode); if (error) return error; error = f(dentry, mode, arg); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkobj); bool may_open_dev(const struct path *path) { return !(path->mnt->mnt_flags & MNT_NODEV) && !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } static int may_open(struct mnt_idmap *idmap, const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; if (!inode) return -ENOENT; switch (inode->i_mode & S_IFMT) { case S_IFLNK: return -ELOOP; case S_IFDIR: if (acc_mode & MAY_WRITE) return -EISDIR; if (acc_mode & MAY_EXEC) return -EACCES; break; case S_IFBLK: case S_IFCHR: if (!may_open_dev(path)) return -EACCES; fallthrough; case S_IFIFO: case S_IFSOCK: if (acc_mode & MAY_EXEC) return -EACCES; flag &= ~O_TRUNC; break; case S_IFREG: if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) return -EPERM; if (flag & O_TRUNC) return -EPERM; } /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) return error; error = security_file_truncate(filp); if (!error) { error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } put_write_access(inode); return error; } static inline int open_to_namei_flags(int flag) { if ((flag & O_ACCMODE) == 3) flag--; return flag; } static int may_o_create(struct mnt_idmap *idmap, const struct path *dir, struct dentry *dentry, umode_t mode) { int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) return -EOVERFLOW; error = inode_permission(idmap, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; return security_inode_create(dir->dentry->d_inode, dentry, mode); } /* * Attempt to atomically look up, create and open a file from a negative * dentry. * * Returns 0 if successful. The file will have been created and attached to * @file by the filesystem calling finish_open(). * * If the file was looked up only or didn't need creating, FMODE_OPENED won't * be set. The caller will need to perform the open themselves. @path will * have been updated to point to the new dentry. This may be negative. * * Returns an error code otherwise. */ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, struct file *file, int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; int error; if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; file->f_path.dentry = DENTRY_NOT_SET; file->f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { if (unlikely(dentry != file->f_path.dentry)) { dput(dentry); dentry = dget(file->f_path.dentry); } } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { if (file->f_path.dentry) { dput(dentry); dentry = file->f_path.dentry; } if (unlikely(d_is_negative(dentry))) error = -ENOENT; } } if (error) { dput(dentry); dentry = ERR_PTR(error); } return dentry; } /* * Look up and maybe create and open the last component. * * Must be called with parent locked (exclusive in O_CREAT case). * * Returns 0 on success, that is, if * the file was successfully atomically created (if necessary) and opened, or * the file was not completely opened at this time, though lookups and * creations were performed. * These case are distinguished by presence of FMODE_OPENED on file->f_mode. * In the latter case dentry returned in @path might be negative if O_CREAT * hadn't been specified. * * An error code is returned on failure. */ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, bool got_write) { struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); for (;;) { if (!dentry) { dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry)) return dentry; } if (d_in_lookup(dentry)) break; error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags); if (likely(error > 0)) break; if (error) goto out_dput; d_invalidate(dentry); dput(dentry); dentry = NULL; } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ return dentry; } if (open_flag & O_CREAT) audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* * Checking write permission is tricky, bacuse we don't know if we are * going to actually need it: O_CREAT opens should work as long as the * file exists. But checking existence breaks atomicity. The trick is * to check access and if not granted clear O_CREAT from the flags. * * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); if (likely(got_write)) create_error = may_o_create(idmap, &nd->path, dentry, mode); else create_error = -EROFS; } if (create_error) open_flag &= ~O_CREAT; if (dir_inode->i_op->atomic_open) { dentry = atomic_open(nd, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) dentry = ERR_PTR(create_error); return dentry; } if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags); d_lookup_done(dentry); if (unlikely(res)) { if (IS_ERR(res)) { error = PTR_ERR(res); goto out_dput; } dput(dentry); dentry = res; } } /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { error = -EACCES; goto out_dput; } error = dir_inode->i_op->create(idmap, dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; goto out_dput; } return dentry; out_dput: dput(dentry); return ERR_PTR(error); } static inline bool trailing_slashes(struct nameidata *nd) { return (bool)nd->last.name[nd->last.len]; } static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) { struct dentry *dentry; if (open_flag & O_CREAT) { if (trailing_slashes(nd)) return ERR_PTR(-EISDIR); /* Don't bother on an O_EXCL create */ if (open_flag & O_EXCL) return NULL; } if (trailing_slashes(nd)) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; dentry = lookup_fast(nd); if (IS_ERR_OR_NULL(dentry)) return dentry; if (open_flag & O_CREAT) { /* Discard negative dentries. Need inode_lock to do the create */ if (!dentry->d_inode) { if (!(nd->flags & LOOKUP_RCU)) dput(dentry); dentry = NULL; } } return dentry; } static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; struct dentry *dentry; const char *res; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { if (nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } /* We _can_ be in RCU mode here */ dentry = lookup_fast_for_open(nd, open_flag); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) goto finish_lookup; if (!(open_flag & O_CREAT)) { if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) return ERR_PTR(-ECHILD); } else { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { got_write = !mnt_want_write(nd->path.mnt); /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. */ } if (open_flag & O_CREAT) inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); dentry = lookup_open(nd, file, op, got_write); if (!IS_ERR(dentry)) { if (file->f_mode & FMODE_CREATED) fsnotify_create(dir->d_inode, dentry); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); } if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); if (got_write) mnt_drop_write(nd->path.mnt); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); nd->path.dentry = dentry; return NULL; } finish_lookup: if (nd->depth) put_link(nd); res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; } /* * Handle the last step of open() */ static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; int acc_mode; int error; if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { error = complete_walk(nd); if (error) return error; } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; } if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) return -ENOTDIR; do_truncate = false; acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; acc_mode = 0; } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) return error; do_truncate = true; } error = may_open(idmap, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = security_file_post_open(file, op->acc_mode); if (!error && do_truncate) error = handle_truncate(idmap, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } if (do_truncate) mnt_drop_write(nd->path.mnt); return error; } /** * vfs_tmpfile - create tmpfile * @idmap: idmap of the mount the inode was found from * @parentpath: pointer to the path of the base directory * @file: file descriptor of the new tmpfile * @mode: mode of the new tmpfile * * Create a temporary file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode) { struct dentry *child; struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; int open_flag = file->f_flags; /* we want directory to be writable */ error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (!dir->i_op->tmpfile) return -EOPNOTSUPP; child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; file->f_path.mnt = parentpath->mnt; file->f_path.dentry = child; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); if (error) return error; /* Don't check for other permissions, the inode was just created */ error = may_open(idmap, &file->f_path, 0, file->f_flags); if (error) return error; inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } security_inode_post_create_tmpfile(idmap, inode); return 0; } /** * kernel_tmpfile_open - open a tmpfile for kernel internal use * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile * @open_flag: flags * @cred: credentials for open * * Create and open a temporary file. The file is not accounted in nr_files, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred) { struct file *file; int error; file = alloc_empty_file_noaccount(open_flag, cred); if (IS_ERR(file)) return file; error = vfs_tmpfile(idmap, parentpath, file, mode); if (error) { fput(file); file = ERR_PTR(error); } return file; } EXPORT_SYMBOL(kernel_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); if (error) goto out2; audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: path_put(&path); return error; } static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) { struct path path; int error = path_lookupat(nd, flags, &path); if (!error) { audit_inode(nd->name, path.dentry, 0); error = vfs_open(&path, file); path_put(&path); } return error; } static struct file *path_openat(struct nameidata *nd, const struct open_flags *op, unsigned flags) { struct file *file; int error; file = alloc_empty_file(op->open_flag, current_cred()); if (IS_ERR(file)) return file; if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(nd, flags, op, file); } else if (unlikely(file->f_flags & O_PATH)) { error = do_o_path(nd, flags, file); } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && (s = open_last_lookups(nd, file, op)) != NULL) ; if (!error) error = do_open(nd, file, op); terminate_walk(nd); } if (likely(!error)) { if (likely(file->f_mode & FMODE_OPENED)) return file; WARN_ON(1); error = -EINVAL; } fput(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; else error = -ESTALE; } return ERR_PTR(error); } struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op) { struct nameidata nd; int flags = op->lookup_flags; struct file *filp; set_nameidata(&nd, dfd, pathname, NULL); filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); return filp; } struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; struct filename *filename; int flags = op->lookup_flags; if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); filename = getname_kernel(name); if (IS_ERR(filename)) return ERR_CAST(filename); set_nameidata(&nd, -1, filename, root); file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) file = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); putname(filename); return file; } static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; bool want_dir = lookup_flags & LOOKUP_DIRECTORY; unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; int err2; int error; error = filename_parentat(dfd, name, reval_flag, path, &last, &type); if (error) return ERR_PTR(error); /* * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ if (unlikely(type != LAST_NORM)) goto out; /* don't fail immediately if it's r/o, at least try to report other errors */ err2 = mnt_want_write(path->mnt); /* * Do the final lookup. Suppress 'create' if there is a trailing * '/', and a directory wasn't requested. */ if (last.name[last.len] && !want_dir) create_flags = 0; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; error = -EEXIST; if (d_is_positive(dentry)) goto fail; /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ if (unlikely(!create_flags)) { error = -ENOENT; goto fail; } if (unlikely(err2)) { error = err2; goto fail; } return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: path_put(path); return dentry; } struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); putname(filename); return res; } EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } EXPORT_SYMBOL(done_path_create); inline struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); putname(filename); return res; } EXPORT_SYMBOL(user_path_create); /** * vfs_mknod - create device node or file * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child device node * @mode: mode of the child device node * @dev: device number of device to create * * Create a device node or file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(idmap, dir, dentry); if (error) return error; if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; error = security_inode_mknod(dir, dentry, mode, dev); if (error) return error; error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: case 0: /* zero mode translates to S_IFREG */ return 0; case S_IFDIR: return -EPERM; default: return -EINVAL; } } static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { struct mnt_idmap *idmap; struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = 0; error = may_mknod(mode); if (error) goto out1; retry: dentry = filename_create(dfd, name, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out1; error = security_path_mknod(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode), dev); if (error) goto out2; idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); if (!error) security_path_post_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, 0); break; } out2: done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out1: putname(name); return error; } SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, dev) { return do_mknodat(dfd, getname(filename), mode, dev); } SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { return do_mknodat(AT_FDCWD, getname(filename), mode, dev); } /** * vfs_mkdir - create directory * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory * @mode: mode of the child directory * * Create a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; unsigned max_links = dir->i_sb->s_max_links; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->mkdir) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) return error; if (max_links && dir->i_nlink >= max_links) return -EMLINK; error = dir->i_op->mkdir(idmap, dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkdir); int do_mkdirat(int dfd, struct filename *name, umode_t mode) { struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; retry: dentry = filename_create(dfd, name, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putname; error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putname: putname(name); return error; } SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { return do_mkdirat(dfd, getname(pathname), mode); } SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { return do_mkdirat(AT_FDCWD, getname(pathname), mode); } /** * vfs_rmdir - remove directory * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory * * Remove a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { int error = may_delete(idmap, dir, dentry, 1); if (error) return error; if (!dir->i_op->rmdir) return -EPERM; dget(dentry); inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry) || (dentry->d_inode->i_flags & S_KERNEL_FILE)) goto out; error = security_inode_rmdir(dir, dentry); if (error) goto out; error = dir->i_op->rmdir(dir, dentry); if (error) goto out; shrink_dcache_parent(dentry); dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); out: inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete_notify(dir, dentry); return error; } EXPORT_SYMBOL(vfs_rmdir); int do_rmdir(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) goto exit1; switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; goto exit2; case LAST_DOT: error = -EINVAL; goto exit2; case LAST_ROOT: error = -EBUSY; goto exit2; } error = mnt_want_write(path.mnt); if (error) goto exit2; inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; if (!dentry->d_inode) { error = -ENOENT; goto exit4; } error = security_path_rmdir(&path, dentry); if (error) goto exit4; error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); exit4: dput(dentry); exit3: inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } exit1: putname(name); return error; } SYSCALL_DEFINE1(rmdir, const char __user *, pathname) { return do_rmdir(AT_FDCWD, getname(pathname)); } /** * vfs_unlink - unlink a filesystem object * @idmap: idmap of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. * * The caller must hold dir->i_mutex. * * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and * return a reference to the inode in delegated_inode. The caller * should then break the delegation on that inode and retry. Because * breaking a delegation may take a long time, the caller should drop * dir->i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete(idmap, dir, dentry, 0); if (error) return error; if (!dir->i_op->unlink) return -EPERM; inode_lock(target); if (IS_SWAPFILE(target)) error = -EPERM; else if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); if (!error) { error = try_break_deleg(target, delegated_inode); if (error) goto out; error = dir->i_op->unlink(dir, dentry); if (!error) { dont_mount(dentry); detach_mounts(dentry); } } } out: inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { fsnotify_unlink(dir, dentry); } else if (!error) { fsnotify_link_count(target); d_delete_notify(dir, dentry); } return error; } EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its * directory's i_mutex. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ int do_unlinkat(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; struct inode *inode = NULL; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) goto exit1; error = -EISDIR; if (type != LAST_NORM) goto exit2; error = mnt_want_write(path.mnt); if (error) goto exit2; retry_deleg: inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ if (last.name[last.len] || d_is_negative(dentry)) goto slashes; inode = dentry->d_inode; ihold(inode); error = security_path_unlink(&path, dentry); if (error) goto exit3; error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, &delegated_inode); exit3: dput(dentry); } inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path.mnt); exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; inode = NULL; goto retry; } exit1: putname(name); return error; slashes: if (d_is_negative(dentry)) error = -ENOENT; else if (d_is_dir(dentry)) error = -EISDIR; else error = -ENOTDIR; goto exit3; } SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) { if ((flag & ~AT_REMOVEDIR) != 0) return -EINVAL; if (flag & AT_REMOVEDIR) return do_rmdir(dfd, getname(pathname)); return do_unlinkat(dfd, getname(pathname)); } SYSCALL_DEFINE1(unlink, const char __user *, pathname) { return do_unlinkat(AT_FDCWD, getname(pathname)); } /** * vfs_symlink - create symlink * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child symlink file * @oldname: name of the file to link to * * Create a symlink. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->symlink) return -EPERM; error = security_inode_symlink(dir, dentry, oldname); if (error) return error; error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_symlink); int do_symlinkat(struct filename *from, int newdfd, struct filename *to) { int error; struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; if (IS_ERR(from)) { error = PTR_ERR(from); goto out_putnames; } retry: dentry = filename_create(newdfd, to, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putnames; error = security_path_symlink(&path, dentry, from->name); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putnames: putname(to); putname(from); return error; } SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_symlinkat(getname(oldname), newdfd, getname(newname)); } SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) { return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname)); } /** * vfs_link - create a new link * @old_dentry: object to be linked * @idmap: idmap of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break * * The caller must hold dir->i_mutex * * If vfs_link discovers a delegation on the to-be-linked file in need * of breaking, it will return -EWOULDBLOCK and return a reference to the * inode in delegated_inode. The caller should then break the delegation * and retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; int error; if (!inode) return -ENOENT; error = may_create(idmap, dir, new_dentry); if (error) return error; if (dir->i_sb != inode->i_sb) return -EXDEV; /* * A link to an append-only or immutable file cannot be created. */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to * be writen back improperly if their true value is unknown to * the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) return -EPERM; error = security_inode_link(old_dentry, dir, new_dentry); if (error) return error; inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; else { error = try_break_deleg(inode, delegated_inode); if (!error) error = dir->i_op->link(old_dentry, dir, new_dentry); } if (!error && (inode->i_state & I_LINKABLE)) { spin_lock(&inode->i_lock); inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; } EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid * security-related surprises by not following symlinks on the * newname. --KAB * * We don't follow them on the oldname either to be compatible * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags) { struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; struct inode *delegated_inode = NULL; int how = 0; int error; if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) { error = -EINVAL; goto out_putnames; } /* * To use null names we require CAP_DAC_READ_SEARCH or * that the open-time creds of the dfd matches current. * This ensures that not everyone will be able to create * a hardlink using the passed file descriptor. */ if (flags & AT_EMPTY_PATH) how |= LOOKUP_LINKAT_EMPTY; if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; retry: error = filename_lookup(olddfd, old, how, &old_path, NULL); if (error) goto out_putnames; new_dentry = filename_create(newdfd, new, &new_path, (how & LOOKUP_REVAL)); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out_putpath; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; idmap = mnt_idmap(new_path.mnt); error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { path_put(&old_path); goto retry; } } if (retry_estale(error, how)) { path_put(&old_path); how |= LOOKUP_REVAL; goto retry; } out_putpath: path_put(&old_path); out_putnames: putname(old); putname(new); return error; } SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, int, flags) { return do_linkat(olddfd, getname_uflags(oldname, flags), newdfd, getname(newname), flags); } SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) { return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } /** * vfs_rename - rename a filesystem object * @rd: pointer to &struct renamedata info * * The caller must hold multiple mutexes--see lock_rename()). * * If vfs_rename discovers a delegation in need of breaking at either * the source or destination, it will return -EWOULDBLOCK and return a * reference to the inode in delegated_inode. The caller should then * break the delegation and retry. Because breaking a delegation may * take a long time, the caller should drop all locks before doing * so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4BSD screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. * c) we may have to lock up to _four_ objects - parents and victim (if it exists), * and source (if it's a non-directory or a subdirectory that moves to * different parent). * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we * move will be locked. Thus we can rank directories by the tree * (ancestors first) and rank all non-directories after them. * That works since everybody except rename does "lock parent, lookup, * lock child" and rename is under ->s_vfs_rename_mutex. * HOWEVER, it relies on the assumption that any object with ->lookup() * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. * d) conversion from fhandle to dentry may come in the wrong moment - when * we are removing the target. Solution: we will have to grab ->i_mutex * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ int vfs_rename(struct renamedata *rd) { int error; struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; struct inode **delegated_inode = rd->delegated_inode; unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; struct name_snapshot old_name; bool lock_old_subdir, lock_new_subdir; if (source == target) return 0; error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, is_dir); else error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) return error; if (!old_dir->i_op->rename) return -EPERM; /* * If we are going to change the parent - check write permissions, * we'll need to flip '..'. */ if (new_dir != old_dir) { if (is_dir) { error = inode_permission(rd->old_mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { error = inode_permission(rd->new_mnt_idmap, target, MAY_WRITE); if (error) return error; } } error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (error) return error; take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); /* * Lock children. * The source subdirectory needs to be locked on cross-directory * rename or cross-directory exchange since its parent changes. * The target subdirectory needs to be locked on cross-directory * exchange due to parent change and on any rename due to becoming * a victim. * Non-directories need locking in all cases (for NFS reasons); * they get locked after any subdirectories (in inode address order). * * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE. * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex. */ lock_old_subdir = new_dir != old_dir; lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE); if (is_dir) { if (lock_old_subdir) inode_lock_nested(source, I_MUTEX_CHILD); if (target && (!new_is_dir || lock_new_subdir)) inode_lock(target); } else if (new_is_dir) { if (lock_new_subdir) inode_lock_nested(target, I_MUTEX_CHILD); inode_lock(source); } else { lock_two_nondirectories(source, target); } error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) goto out; error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; if (max_links && new_dir != old_dir) { error = -EMLINK; if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) goto out; if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && old_dir->i_nlink >= max_links) goto out; } if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) goto out; } if (target && !new_is_dir) { error = try_break_deleg(target, delegated_inode); if (error) goto out; } error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; if (!(flags & RENAME_EXCHANGE) && target) { if (is_dir) { shrink_dcache_parent(new_dentry); target->i_flags |= S_DEAD; } dont_mount(new_dentry); detach_mounts(new_dentry); } if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { if (!(flags & RENAME_EXCHANGE)) d_move(old_dentry, new_dentry); else d_exchange(old_dentry, new_dentry); } out: if (!is_dir || lock_old_subdir) inode_unlock(source); if (target && (!new_is_dir || lock_new_subdir)) inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, &old_name.name, is_dir, !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) { fsnotify_move(new_dir, old_dir, &old_dentry->d_name, new_is_dir, NULL, new_dentry); } } release_dentry_name_snapshot(&old_name); return error; } EXPORT_SYMBOL(vfs_rename); int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { struct renamedata rd; struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; bool should_retry = false; int error = -EINVAL; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) goto put_names; if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && (flags & RENAME_EXCHANGE)) goto put_names; if (flags & RENAME_EXCHANGE) target_flags = 0; retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, &old_last, &old_type); if (error) goto put_names; error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, &new_type); if (error) goto exit1; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto exit2; error = -EBUSY; if (old_type != LAST_NORM) goto exit2; if (flags & RENAME_NOREPLACE) error = -EEXIST; if (new_type != LAST_NORM) goto exit2; error = mnt_want_write(old_path.mnt); if (error) goto exit2; retry_deleg: trap = lock_rename(new_path.dentry, old_path.dentry); if (IS_ERR(trap)) { error = PTR_ERR(trap); goto exit_lock_rename; } old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, lookup_flags); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; /* source must exist */ error = -ENOENT; if (d_is_negative(old_dentry)) goto exit4; new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; error = -EEXIST; if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) goto exit5; if (flags & RENAME_EXCHANGE) { error = -ENOENT; if (d_is_negative(new_dentry)) goto exit5; if (!d_is_dir(new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) goto exit5; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ if (!d_is_dir(old_dentry)) { error = -ENOTDIR; if (old_last.name[old_last.len]) goto exit5; if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) goto exit5; } /* source should not be ancestor of target */ error = -EINVAL; if (old_dentry == trap) goto exit5; /* target should not be an ancestor of source */ if (!(flags & RENAME_EXCHANGE)) error = -ENOTEMPTY; if (new_dentry == trap) goto exit5; error = security_path_rename(&old_path, old_dentry, &new_path, new_dentry, flags); if (error) goto exit5; rd.old_dir = old_path.dentry->d_inode; rd.old_dentry = old_dentry; rd.old_mnt_idmap = mnt_idmap(old_path.mnt); rd.new_dir = new_path.dentry->d_inode; rd.new_dentry = new_dentry; rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_path.dentry, old_path.dentry); exit_lock_rename: if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(old_path.mnt); exit2: if (retry_estale(error, lookup_flags)) should_retry = true; path_put(&new_path); exit1: path_put(&old_path); if (should_retry) { should_retry = false; lookup_flags |= LOOKUP_REVAL; goto retry; } put_names: putname(from); putname(to); return error; } SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), flags); } SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen) { int copylen; copylen = linklen; if (unlikely(copylen > (unsigned) buflen)) copylen = buflen; if (copy_to_user(buffer, link, copylen)) copylen = -EFAULT; return copylen; } /** * vfs_readlink - copy symlink body into userspace buffer * @dentry: dentry on which to get symbolic link * @buffer: user memory pointer * @buflen: size of buffer * * Does not touch atime. That's up to the caller if necessary * * Does not call security hook. */ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = d_inode(dentry); DEFINE_DELAYED_CALL(done); const char *link; int res; if (inode->i_opflags & IOP_CACHED_LINK) return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen); if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); if (!d_is_symlink(dentry)) return -EINVAL; spin_lock(&inode->i_lock); inode->i_opflags |= IOP_DEFAULT_READLINK; spin_unlock(&inode->i_lock); } link = READ_ONCE(inode->i_link); if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(vfs_readlink); /** * vfs_get_link - get symlink body * @dentry: dentry on which to get symbolic link * @done: caller needs to free returned data with this * * Calls security hook and i_op->get_link() on the supplied inode. * * It does not touch atime. That's up to the caller if necessary. * * Does not work on "special" symlinks like /proc/$$/fd/N */ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *res = ERR_PTR(-EINVAL); struct inode *inode = d_inode(dentry); if (d_is_symlink(dentry)) { res = ERR_PTR(security_inode_readlink(dentry)); if (!res) res = inode->i_op->get_link(dentry, inode, done); } return res; } EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { char *kaddr; struct page *page; struct address_space *mapping = inode->i_mapping; if (!dentry) { page = find_get_page(mapping, 0); if (!page) return ERR_PTR(-ECHILD); if (!PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } } else { page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) return (char*)page; } set_delayed_call(callback, page_put_link, page); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); kaddr = page_address(page); nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } EXPORT_SYMBOL(page_get_link); void page_put_link(void *arg) { put_page(arg); } EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { const char *link; int res; DEFINE_DELAYED_CALL(done); link = page_get_link(dentry, d_inode(dentry), &done); res = PTR_ERR(link); if (!IS_ERR(link)) res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(page_readlink); int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct folio *folio; void *fsdata = NULL; int err; unsigned int flags; retry: if (nofs) flags = memalloc_nofs_save(); err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); if (nofs) memalloc_nofs_restore(flags); if (err) goto fail; memcpy(folio_address(folio), symname, len - 1); err = aops->write_end(NULL, mapping, 0, len - 1, len - 1, folio, fsdata); if (err < 0) goto fail; if (err < len-1) goto retry; mark_inode_dirty(inode); return 0; fail: return err; } EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .get_link = page_get_link, }; EXPORT_SYMBOL(page_symlink_inode_operations); |
| 30 20 18 18 2 2 2 2 2 2 2 3 3 2 3 1 1 2 2 2 2 13 87 87 33 85 85 85 11 11 11 11 3 3 4 4 4 91 2 2 2 2 2 3 3 2 3 2 3 2 3 3 3 3 92 3 2 48 48 39 38 2 92 95 95 95 2 2 3 3 2 3 3 1 2 2 2 3 3 3 3 3 2 2 2 2 2 2 2 2 2 17 3 3 3 3 2 97 93 2 2 3 1 1 1 1 1 1 17 17 17 17 1 1 1 1 23 23 4 1 1 1 3 1 1 12 12 12 1 1 2 8 6 4 2 3 4 1 1 1 1 2 2 82 92 3 91 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2015 Linaro Ltd. * Author: Shannon Zhao <shannon.zhao@linaro.org> */ #include <linux/cpu.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/list.h> #include <linux/perf_event.h> #include <linux/perf/arm_pmu.h> #include <linux/uaccess.h> #include <asm/kvm_emulate.h> #include <kvm/arm_pmu.h> #include <kvm/arm_vgic.h> #define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0) DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); static LIST_HEAD(arm_pmus); static DEFINE_MUTEX(arm_pmus_lock); static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc); static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc); static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc); static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc) { return container_of(pmc, struct kvm_vcpu, arch.pmu.pmc[pmc->idx]); } static struct kvm_pmc *kvm_vcpu_idx_to_pmc(struct kvm_vcpu *vcpu, int cnt_idx) { return &vcpu->arch.pmu.pmc[cnt_idx]; } static u32 __kvm_pmu_event_mask(unsigned int pmuver) { switch (pmuver) { case ID_AA64DFR0_EL1_PMUVer_IMP: return GENMASK(9, 0); case ID_AA64DFR0_EL1_PMUVer_V3P1: case ID_AA64DFR0_EL1_PMUVer_V3P4: case ID_AA64DFR0_EL1_PMUVer_V3P5: case ID_AA64DFR0_EL1_PMUVer_V3P7: return GENMASK(15, 0); default: /* Shouldn't be here, just for sanity */ WARN_ONCE(1, "Unknown PMU version %d\n", pmuver); return 0; } } static u32 kvm_pmu_event_mask(struct kvm *kvm) { u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1); u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, dfr0); return __kvm_pmu_event_mask(pmuver); } u64 kvm_pmu_evtyper_mask(struct kvm *kvm) { u64 mask = ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMU_EXCLUDE_EL0 | kvm_pmu_event_mask(kvm); if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL2, IMP)) mask |= ARMV8_PMU_INCLUDE_EL2; if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP)) mask |= ARMV8_PMU_EXCLUDE_NS_EL0 | ARMV8_PMU_EXCLUDE_NS_EL1 | ARMV8_PMU_EXCLUDE_EL3; return mask; } /** * kvm_pmc_is_64bit - determine if counter is 64bit * @pmc: counter context */ static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); return (pmc->idx == ARMV8_PMU_CYCLE_IDX || kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)); } static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); u64 val = kvm_vcpu_read_pmcr(vcpu); if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) return __vcpu_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HLP; return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); } static bool kvm_pmu_counter_can_chain(struct kvm_pmc *pmc) { return (!(pmc->idx & 1) && (pmc->idx + 1) < ARMV8_PMU_CYCLE_IDX && !kvm_pmc_has_64bit_overflow(pmc)); } static u32 counter_index_to_reg(u64 idx) { return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + idx; } static u32 counter_index_to_evtreg(u64 idx) { return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx; } static u64 kvm_pmc_read_evtreg(const struct kvm_pmc *pmc) { return __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), counter_index_to_evtreg(pmc->idx)); } static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); u64 counter, reg, enabled, running; reg = counter_index_to_reg(pmc->idx); counter = __vcpu_sys_reg(vcpu, reg); /* * The real counter value is equal to the value of counter register plus * the value perf event counts. */ if (pmc->perf_event) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); if (!kvm_pmc_is_64bit(pmc)) counter = lower_32_bits(counter); return counter; } /** * kvm_pmu_get_counter_value - get PMU counter value * @vcpu: The vcpu pointer * @select_idx: The counter index */ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) { if (!kvm_vcpu_has_pmu(vcpu)) return 0; return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); } static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); u64 reg; kvm_pmu_release_perf_event(pmc); reg = counter_index_to_reg(pmc->idx); if (vcpu_mode_is_32bit(vcpu) && pmc->idx != ARMV8_PMU_CYCLE_IDX && !force) { /* * Even with PMUv3p5, AArch32 cannot write to the top * 32bit of the counters. The only possible course of * action is to use PMCR.P, which will reset them to * 0 (the only use of the 'force' parameter). */ val = __vcpu_sys_reg(vcpu, reg) & GENMASK(63, 32); val |= lower_32_bits(val); } __vcpu_sys_reg(vcpu, reg) = val; /* Recreate the perf event to reflect the updated sample_period */ kvm_pmu_create_perf_event(pmc); } /** * kvm_pmu_set_counter_value - set PMU counter value * @vcpu: The vcpu pointer * @select_idx: The counter index * @val: The counter value */ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) { if (!kvm_vcpu_has_pmu(vcpu)) return; kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false); } /** * kvm_pmu_release_perf_event - remove the perf event * @pmc: The PMU counter pointer */ static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) { if (pmc->perf_event) { perf_event_disable(pmc->perf_event); perf_event_release_kernel(pmc->perf_event); pmc->perf_event = NULL; } } /** * kvm_pmu_stop_counter - stop PMU counter * @pmc: The PMU counter pointer * * If this counter has been configured to monitor some event, release it here. */ static void kvm_pmu_stop_counter(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); u64 reg, val; if (!pmc->perf_event) return; val = kvm_pmu_get_pmc_value(pmc); reg = counter_index_to_reg(pmc->idx); __vcpu_sys_reg(vcpu, reg) = val; kvm_pmu_release_perf_event(pmc); } /** * kvm_pmu_vcpu_init - assign pmu counter idx for cpu * @vcpu: The vcpu pointer * */ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) { int i; struct kvm_pmu *pmu = &vcpu->arch.pmu; for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) pmu->pmc[i].idx = i; } /** * kvm_pmu_vcpu_reset - reset pmu state for cpu * @vcpu: The vcpu pointer * */ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) { unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu); int i; for_each_set_bit(i, &mask, 32) kvm_pmu_stop_counter(kvm_vcpu_idx_to_pmc(vcpu, i)); } /** * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu * @vcpu: The vcpu pointer * */ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) { int i; for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i)); irq_work_sync(&vcpu->arch.pmu.overflow_work); } static u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu) { unsigned int hpmn, n; if (!vcpu_has_nv(vcpu)) return 0; hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); n = vcpu->kvm->arch.pmcr_n; /* * Programming HPMN to a value greater than PMCR_EL0.N is * CONSTRAINED UNPREDICTABLE. Make the implementation choice that an * UNKNOWN number of counters (in our case, zero) are reserved for EL2. */ if (hpmn >= n) return 0; /* * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't * implemented. Since KVM's ability to emulate HPMN=0 does not directly * depend on hardware (all PMU registers are trapped), make the * implementation choice that all counters are included in the second * range reserved for EL2/EL3. */ return GENMASK(n - 1, hpmn); } bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) { return kvm_pmu_hyp_counter_mask(vcpu) & BIT(idx); } u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) { u64 mask = kvm_pmu_implemented_counter_mask(vcpu); if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu)) return mask; return mask & ~kvm_pmu_hyp_counter_mask(vcpu); } u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) { u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); if (val == 0) return BIT(ARMV8_PMU_CYCLE_IDX); else return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); } static void kvm_pmc_enable_perf_event(struct kvm_pmc *pmc) { if (!pmc->perf_event) { kvm_pmu_create_perf_event(pmc); return; } perf_event_enable(pmc->perf_event); if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) kvm_debug("fail to enable perf event\n"); } static void kvm_pmc_disable_perf_event(struct kvm_pmc *pmc) { if (pmc->perf_event) perf_event_disable(pmc->perf_event); } void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) { int i; if (!kvm_vcpu_has_pmu(vcpu) || !val) return; for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); if (!(val & BIT(i))) continue; if (kvm_pmu_counter_is_enabled(pmc)) kvm_pmc_enable_perf_event(pmc); else kvm_pmc_disable_perf_event(pmc); } kvm_vcpu_pmu_restore_guest(vcpu); } /* * Returns the PMU overflow state, which is true if there exists an event * counter where the values of the global enable control, PMOVSSET_EL0[n], and * PMINTENSET_EL1[n] are all 1. */ static bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) { u64 reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); /* * PMCR_EL0.E is the global enable control for event counters available * to EL0 and EL1. */ if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) reg &= kvm_pmu_hyp_counter_mask(vcpu); /* * Otherwise, MDCR_EL2.HPME is the global enable control for event * counters reserved for EL2. */ if (!(vcpu_read_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HPME)) reg &= ~kvm_pmu_hyp_counter_mask(vcpu); return reg; } static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = &vcpu->arch.pmu; bool overflow; if (!kvm_vcpu_has_pmu(vcpu)) return; overflow = kvm_pmu_overflow_status(vcpu); if (pmu->irq_level == overflow) return; pmu->irq_level = overflow; if (likely(irqchip_in_kernel(vcpu->kvm))) { int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, pmu->irq_num, overflow, pmu); WARN_ON(ret); } } bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_sync_regs *sregs = &vcpu->run->s.regs; bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU; if (likely(irqchip_in_kernel(vcpu->kvm))) return false; return pmu->irq_level != run_level; } /* * Reflect the PMU overflow interrupt output level into the kvm_run structure */ void kvm_pmu_update_run(struct kvm_vcpu *vcpu) { struct kvm_sync_regs *regs = &vcpu->run->s.regs; /* Populate the timer bitmap for user space */ regs->device_irq_level &= ~KVM_ARM_DEV_PMU; if (vcpu->arch.pmu.irq_level) regs->device_irq_level |= KVM_ARM_DEV_PMU; } /** * kvm_pmu_flush_hwstate - flush pmu state to cpu * @vcpu: The vcpu pointer * * Check if the PMU has overflowed while we were running in the host, and inject * an interrupt if that was the case. */ void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) { kvm_pmu_update_state(vcpu); } /** * kvm_pmu_sync_hwstate - sync pmu state from cpu * @vcpu: The vcpu pointer * * Check if the PMU has overflowed while we were running in the guest, and * inject an interrupt if that was the case. */ void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) { kvm_pmu_update_state(vcpu); } /* * When perf interrupt is an NMI, we cannot safely notify the vcpu corresponding * to the event. * This is why we need a callback to do it once outside of the NMI context. */ static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work) { struct kvm_vcpu *vcpu; vcpu = container_of(work, struct kvm_vcpu, arch.pmu.overflow_work); kvm_vcpu_kick(vcpu); } /* * Perform an increment on any of the counters described in @mask, * generating the overflow if required, and propagate it as a chained * event if possible. */ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, unsigned long mask, u32 event) { int i; if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) return; /* Weed out disabled counters */ mask &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); for_each_set_bit(i, &mask, ARMV8_PMU_CYCLE_IDX) { struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); u64 type, reg; /* Filter on event type */ type = __vcpu_sys_reg(vcpu, counter_index_to_evtreg(i)); type &= kvm_pmu_event_mask(vcpu->kvm); if (type != event) continue; /* Increment this counter */ reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1; if (!kvm_pmc_is_64bit(pmc)) reg = lower_32_bits(reg); __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) = reg; /* No overflow? move on */ if (kvm_pmc_has_64bit_overflow(pmc) ? reg : lower_32_bits(reg)) continue; /* Mark overflow */ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); if (kvm_pmu_counter_can_chain(pmc)) kvm_pmu_counter_increment(vcpu, BIT(i + 1), ARMV8_PMUV3_PERFCTR_CHAIN); } } /* Compute the sample period for a given counter value */ static u64 compute_period(struct kvm_pmc *pmc, u64 counter) { u64 val; if (kvm_pmc_is_64bit(pmc) && kvm_pmc_has_64bit_overflow(pmc)) val = (-counter) & GENMASK(63, 0); else val = (-counter) & GENMASK(31, 0); return val; } /* * When the perf event overflows, set the overflow status and inform the vcpu. */ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, struct perf_sample_data *data, struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu); struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); int idx = pmc->idx; u64 period; cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE); /* * Reset the sample period to the architectural limit, * i.e. the point where the counter overflows. */ period = compute_period(pmc, local64_read(&perf_event->count)); local64_set(&perf_event->hw.period_left, 0); perf_event->attr.sample_period = period; perf_event->hw.sample_period = period; __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); if (kvm_pmu_counter_can_chain(pmc)) kvm_pmu_counter_increment(vcpu, BIT(idx + 1), ARMV8_PMUV3_PERFCTR_CHAIN); if (kvm_pmu_overflow_status(vcpu)) { kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); if (!in_nmi()) kvm_vcpu_kick(vcpu); else irq_work_queue(&vcpu->arch.pmu.overflow_work); } cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD); } /** * kvm_pmu_software_increment - do software increment * @vcpu: The vcpu pointer * @val: the value guest writes to PMSWINC register */ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) { kvm_pmu_counter_increment(vcpu, val, ARMV8_PMUV3_PERFCTR_SW_INCR); } /** * kvm_pmu_handle_pmcr - handle PMCR register * @vcpu: The vcpu pointer * @val: the value guest writes to PMCR register */ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) { int i; if (!kvm_vcpu_has_pmu(vcpu)) return; /* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */ if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) val &= ~ARMV8_PMU_PMCR_LP; /* Request a reload of the PMU to enable/disable affected counters */ if ((__vcpu_sys_reg(vcpu, PMCR_EL0) ^ val) & ARMV8_PMU_PMCR_E) kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); /* The reset bits don't indicate any state, and shouldn't be saved. */ __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P); if (val & ARMV8_PMU_PMCR_C) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { /* * Unlike other PMU sysregs, the controls in PMCR_EL0 always apply * to the 'guest' range of counters and never the 'hyp' range. */ unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu) & ~kvm_pmu_hyp_counter_mask(vcpu) & ~BIT(ARMV8_PMU_CYCLE_IDX); for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); } } static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); unsigned int mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); if (!(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx))) return false; if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) return mdcr & MDCR_EL2_HPME; return kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E; } static bool kvm_pmc_counts_at_el0(struct kvm_pmc *pmc) { u64 evtreg = kvm_pmc_read_evtreg(pmc); bool nsu = evtreg & ARMV8_PMU_EXCLUDE_NS_EL0; bool u = evtreg & ARMV8_PMU_EXCLUDE_EL0; return u == nsu; } static bool kvm_pmc_counts_at_el1(struct kvm_pmc *pmc) { u64 evtreg = kvm_pmc_read_evtreg(pmc); bool nsk = evtreg & ARMV8_PMU_EXCLUDE_NS_EL1; bool p = evtreg & ARMV8_PMU_EXCLUDE_EL1; return p == nsk; } static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); u64 mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); if (!kvm_pmu_counter_is_hyp(vcpu, pmc->idx) && (mdcr & MDCR_EL2_HPMD)) return false; return kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2; } /** * kvm_pmu_create_perf_event - create a perf event for a counter * @pmc: Counter context */ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; struct perf_event *event; struct perf_event_attr attr; u64 eventsel, evtreg; evtreg = kvm_pmc_read_evtreg(pmc); kvm_pmu_stop_counter(pmc); if (pmc->idx == ARMV8_PMU_CYCLE_IDX) eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; else eventsel = evtreg & kvm_pmu_event_mask(vcpu->kvm); /* * Neither SW increment nor chained events need to be backed * by a perf event. */ if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR || eventsel == ARMV8_PMUV3_PERFCTR_CHAIN) return; /* * If we have a filter in place and that the event isn't allowed, do * not install a perf event either. */ if (vcpu->kvm->arch.pmu_filter && !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) return; memset(&attr, 0, sizeof(struct perf_event_attr)); attr.type = arm_pmu->pmu.type; attr.size = sizeof(attr); attr.pinned = 1; attr.disabled = !kvm_pmu_counter_is_enabled(pmc); attr.exclude_user = !kvm_pmc_counts_at_el0(pmc); attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ attr.config = eventsel; /* * Filter events at EL1 (i.e. vEL2) when in a hyp context based on the * guest's EL2 filter. */ if (unlikely(is_hyp_ctxt(vcpu))) attr.exclude_kernel = !kvm_pmc_counts_at_el2(pmc); else attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc); /* * If counting with a 64bit counter, advertise it to the perf * code, carefully dealing with the initial sample period * which also depends on the overflow. */ if (kvm_pmc_is_64bit(pmc)) attr.config1 |= PERF_ATTR_CFG1_COUNTER_64BIT; attr.sample_period = compute_period(pmc, kvm_pmu_get_pmc_value(pmc)); event = perf_event_create_kernel_counter(&attr, -1, current, kvm_pmu_perf_overflow, pmc); if (IS_ERR(event)) { pr_err_once("kvm: pmu event creation failed %ld\n", PTR_ERR(event)); return; } pmc->perf_event = event; } /** * kvm_pmu_set_counter_event_type - set selected counter to monitor some event * @vcpu: The vcpu pointer * @data: The data guest writes to PMXEVTYPER_EL0 * @select_idx: The number of selected counter * * When OS accesses PMXEVTYPER_EL0, that means it wants to set a PMC to count an * event with given hardware event number. Here we call perf_event API to * emulate this action and create a kernel perf event for it. */ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx) { struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx); u64 reg; if (!kvm_vcpu_has_pmu(vcpu)) return; reg = counter_index_to_evtreg(pmc->idx); __vcpu_sys_reg(vcpu, reg) = data & kvm_pmu_evtyper_mask(vcpu->kvm); kvm_pmu_create_perf_event(pmc); } void kvm_host_pmu_init(struct arm_pmu *pmu) { struct arm_pmu_entry *entry; /* * Check the sanitised PMU version for the system, as KVM does not * support implementations where PMUv3 exists on a subset of CPUs. */ if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit())) return; mutex_lock(&arm_pmus_lock); entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) goto out_unlock; entry->arm_pmu = pmu; list_add_tail(&entry->entry, &arm_pmus); if (list_is_singular(&arm_pmus)) static_branch_enable(&kvm_arm_pmu_available); out_unlock: mutex_unlock(&arm_pmus_lock); } static struct arm_pmu *kvm_pmu_probe_armpmu(void) { struct arm_pmu *tmp, *pmu = NULL; struct arm_pmu_entry *entry; int cpu; mutex_lock(&arm_pmus_lock); /* * It is safe to use a stale cpu to iterate the list of PMUs so long as * the same value is used for the entirety of the loop. Given this, and * the fact that no percpu data is used for the lookup there is no need * to disable preemption. * * It is still necessary to get a valid cpu, though, to probe for the * default PMU instance as userspace is not required to specify a PMU * type. In order to uphold the preexisting behavior KVM selects the * PMU instance for the core during vcpu init. A dependent use * case would be a user with disdain of all things big.LITTLE that * affines the VMM to a particular cluster of cores. * * In any case, userspace should just do the sane thing and use the UAPI * to select a PMU type directly. But, be wary of the baggage being * carried here. */ cpu = raw_smp_processor_id(); list_for_each_entry(entry, &arm_pmus, entry) { tmp = entry->arm_pmu; if (cpumask_test_cpu(cpu, &tmp->supported_cpus)) { pmu = tmp; break; } } mutex_unlock(&arm_pmus_lock); return pmu; } u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) { unsigned long *bmap = vcpu->kvm->arch.pmu_filter; u64 val, mask = 0; int base, i, nr_events; if (!kvm_vcpu_has_pmu(vcpu)) return 0; if (!pmceid1) { val = read_sysreg(pmceid0_el0); /* always support CHAIN */ val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); base = 0; } else { val = read_sysreg(pmceid1_el0); /* * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled * as RAZ */ val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) | BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) | BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32)); base = 32; } if (!bmap) return val; nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1; for (i = 0; i < 32; i += 8) { u64 byte; byte = bitmap_get_value8(bmap, base + i); mask |= byte << i; if (nr_events >= (0x4000 + base + 32)) { byte = bitmap_get_value8(bmap, 0x4000 + base + i); mask |= byte << (32 + i); } } return val & mask; } void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) { u64 mask = kvm_pmu_implemented_counter_mask(vcpu); __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= mask; __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= mask; __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= mask; kvm_pmu_reprogram_counter_mask(vcpu, mask); } int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) { if (!kvm_vcpu_has_pmu(vcpu)) return 0; if (!vcpu->arch.pmu.created) return -EINVAL; /* * A valid interrupt configuration for the PMU is either to have a * properly configured interrupt number and using an in-kernel * irqchip, or to not have an in-kernel GIC and not set an IRQ. */ if (irqchip_in_kernel(vcpu->kvm)) { int irq = vcpu->arch.pmu.irq_num; /* * If we are using an in-kernel vgic, at this point we know * the vgic will be initialized, so we can check the PMU irq * number against the dimensions of the vgic and make sure * it's valid. */ if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq)) return -EINVAL; } else if (kvm_arm_pmu_irq_initialized(vcpu)) { return -EINVAL; } /* One-off reload of the PMU on first run */ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); return 0; } static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) { if (irqchip_in_kernel(vcpu->kvm)) { int ret; /* * If using the PMU with an in-kernel virtual GIC * implementation, we require the GIC to be already * initialized when initializing the PMU. */ if (!vgic_initialized(vcpu->kvm)) return -ENODEV; if (!kvm_arm_pmu_irq_initialized(vcpu)) return -ENXIO; ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num, &vcpu->arch.pmu); if (ret) return ret; } init_irq_work(&vcpu->arch.pmu.overflow_work, kvm_pmu_perf_overflow_notify_vcpu); vcpu->arch.pmu.created = true; return 0; } /* * For one VM the interrupt type must be same for each vcpu. * As a PPI, the interrupt number is the same for all vcpus, * while as an SPI it must be a separate number per vcpu. */ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) { unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { if (!kvm_arm_pmu_irq_initialized(vcpu)) continue; if (irq_is_ppi(irq)) { if (vcpu->arch.pmu.irq_num != irq) return false; } else { if (vcpu->arch.pmu.irq_num == irq) return false; } } return true; } /** * kvm_arm_pmu_get_max_counters - Return the max number of PMU counters. * @kvm: The kvm pointer */ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) { struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; /* * The arm_pmu->cntr_mask considers the fixed counter(s) as well. * Ignore those and return only the general-purpose counters. */ return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); } static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) { lockdep_assert_held(&kvm->arch.config_lock); kvm->arch.arm_pmu = arm_pmu; kvm->arch.pmcr_n = kvm_arm_pmu_get_max_counters(kvm); } /** * kvm_arm_set_default_pmu - No PMU set, get the default one. * @kvm: The kvm pointer * * The observant among you will notice that the supported_cpus * mask does not get updated for the default PMU even though it * is quite possible the selected instance supports only a * subset of cores in the system. This is intentional, and * upholds the preexisting behavior on heterogeneous systems * where vCPUs can be scheduled on any core but the guest * counters could stop working. */ int kvm_arm_set_default_pmu(struct kvm *kvm) { struct arm_pmu *arm_pmu = kvm_pmu_probe_armpmu(); if (!arm_pmu) return -ENODEV; kvm_arm_set_pmu(kvm, arm_pmu); return 0; } static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) { struct kvm *kvm = vcpu->kvm; struct arm_pmu_entry *entry; struct arm_pmu *arm_pmu; int ret = -ENXIO; lockdep_assert_held(&kvm->arch.config_lock); mutex_lock(&arm_pmus_lock); list_for_each_entry(entry, &arm_pmus, entry) { arm_pmu = entry->arm_pmu; if (arm_pmu->pmu.type == pmu_id) { if (kvm_vm_has_ran_once(kvm) || (kvm->arch.pmu_filter && kvm->arch.arm_pmu != arm_pmu)) { ret = -EBUSY; break; } kvm_arm_set_pmu(kvm, arm_pmu); cpumask_copy(kvm->arch.supported_cpus, &arm_pmu->supported_cpus); ret = 0; break; } } mutex_unlock(&arm_pmus_lock); return ret; } int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { struct kvm *kvm = vcpu->kvm; lockdep_assert_held(&kvm->arch.config_lock); if (!kvm_vcpu_has_pmu(vcpu)) return -ENODEV; if (vcpu->arch.pmu.created) return -EBUSY; switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { int __user *uaddr = (int __user *)(long)attr->addr; int irq; if (!irqchip_in_kernel(kvm)) return -EINVAL; if (get_user(irq, uaddr)) return -EFAULT; /* The PMU overflow interrupt can be a PPI or a valid SPI. */ if (!(irq_is_ppi(irq) || irq_is_spi(irq))) return -EINVAL; if (!pmu_irq_is_valid(kvm, irq)) return -EINVAL; if (kvm_arm_pmu_irq_initialized(vcpu)) return -EBUSY; kvm_debug("Set kvm ARM PMU irq: %d\n", irq); vcpu->arch.pmu.irq_num = irq; return 0; } case KVM_ARM_VCPU_PMU_V3_FILTER: { u8 pmuver = kvm_arm_pmu_get_pmuver_limit(); struct kvm_pmu_event_filter __user *uaddr; struct kvm_pmu_event_filter filter; int nr_events; /* * Allow userspace to specify an event filter for the entire * event range supported by PMUVer of the hardware, rather * than the guest's PMUVer for KVM backward compatibility. */ nr_events = __kvm_pmu_event_mask(pmuver) + 1; uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr; if (copy_from_user(&filter, uaddr, sizeof(filter))) return -EFAULT; if (((u32)filter.base_event + filter.nevents) > nr_events || (filter.action != KVM_PMU_EVENT_ALLOW && filter.action != KVM_PMU_EVENT_DENY)) return -EINVAL; if (kvm_vm_has_ran_once(kvm)) return -EBUSY; if (!kvm->arch.pmu_filter) { kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT); if (!kvm->arch.pmu_filter) return -ENOMEM; /* * The default depends on the first applied filter. * If it allows events, the default is to deny. * Conversely, if the first filter denies a set of * events, the default is to allow. */ if (filter.action == KVM_PMU_EVENT_ALLOW) bitmap_zero(kvm->arch.pmu_filter, nr_events); else bitmap_fill(kvm->arch.pmu_filter, nr_events); } if (filter.action == KVM_PMU_EVENT_ALLOW) bitmap_set(kvm->arch.pmu_filter, filter.base_event, filter.nevents); else bitmap_clear(kvm->arch.pmu_filter, filter.base_event, filter.nevents); return 0; } case KVM_ARM_VCPU_PMU_V3_SET_PMU: { int __user *uaddr = (int __user *)(long)attr->addr; int pmu_id; if (get_user(pmu_id, uaddr)) return -EFAULT; return kvm_arm_pmu_v3_set_pmu(vcpu, pmu_id); } case KVM_ARM_VCPU_PMU_V3_INIT: return kvm_arm_pmu_v3_init(vcpu); } return -ENXIO; } int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { int __user *uaddr = (int __user *)(long)attr->addr; int irq; if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; if (!kvm_vcpu_has_pmu(vcpu)) return -ENODEV; if (!kvm_arm_pmu_irq_initialized(vcpu)) return -ENXIO; irq = vcpu->arch.pmu.irq_num; return put_user(irq, uaddr); } } return -ENXIO; } int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: case KVM_ARM_VCPU_PMU_V3_INIT: case KVM_ARM_VCPU_PMU_V3_FILTER: case KVM_ARM_VCPU_PMU_V3_SET_PMU: if (kvm_vcpu_has_pmu(vcpu)) return 0; } return -ENXIO; } u8 kvm_arm_pmu_get_pmuver_limit(void) { u64 tmp; tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); tmp = cpuid_feature_cap_perfmon_field(tmp, ID_AA64DFR0_EL1_PMUVer_SHIFT, ID_AA64DFR0_EL1_PMUVer_V3P5); return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp); } /** * kvm_vcpu_read_pmcr - Read PMCR_EL0 register for the vCPU * @vcpu: The vcpu pointer */ u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) { u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); return u64_replace_bits(pmcr, vcpu->kvm->arch.pmcr_n, ARMV8_PMU_PMCR_N); } void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) { bool reprogrammed = false; unsigned long mask; int i; if (!kvm_vcpu_has_pmu(vcpu)) return; mask = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); for_each_set_bit(i, &mask, 32) { struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); /* * We only need to reconfigure events where the filter is * different at EL1 vs. EL2, as we're multiplexing the true EL1 * event filter bit for nested. */ if (kvm_pmc_counts_at_el1(pmc) == kvm_pmc_counts_at_el2(pmc)) continue; kvm_pmu_create_perf_event(pmc); reprogrammed = true; } if (reprogrammed) kvm_vcpu_pmu_restore_guest(vcpu); } |
| 2 2 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | // SPDX-License-Identifier: GPL-2.0-only /* * ratelimit.c - Do something with rate limit. * * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com> * * 2008-05-01 rewrite the function and use a ratelimit_state data struct as * parameter. Now every user can use their own standalone ratelimit_state. */ #include <linux/ratelimit.h> #include <linux/jiffies.h> #include <linux/export.h> /* * __ratelimit - rate limiting * @rs: ratelimit_state data * @func: name of calling function * * This enforces a rate limit: not more than @rs->burst callbacks * in every @rs->interval * * RETURNS: * 0 means callbacks will be suppressed. * 1 means go ahead and do it. */ int ___ratelimit(struct ratelimit_state *rs, const char *func) { /* Paired with WRITE_ONCE() in .proc_handler(). * Changing two values seperately could be inconsistent * and some message could be lost. (See: net_ratelimit_state). */ int interval = READ_ONCE(rs->interval); int burst = READ_ONCE(rs->burst); unsigned long flags; int ret; if (!interval) return 1; /* * If we contend on this state's lock then almost * by definition we are too busy to print a message, * in addition to the one that will be printed by * the entity that is holding the lock already: */ if (!raw_spin_trylock_irqsave(&rs->lock, flags)) return 0; if (!rs->begin) rs->begin = jiffies; if (time_is_before_jiffies(rs->begin + interval)) { if (rs->missed) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { printk_deferred(KERN_WARNING "%s: %d callbacks suppressed\n", func, rs->missed); rs->missed = 0; } } rs->begin = jiffies; rs->printed = 0; } if (burst && burst > rs->printed) { rs->printed++; ret = 1; } else { rs->missed++; ret = 0; } raw_spin_unlock_irqrestore(&rs->lock, flags); return ret; } EXPORT_SYMBOL(___ratelimit); |
| 17 17 17 17 17 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_H #define _LINUX_HIGHMEM_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/bug.h> #include <linux/cacheflush.h> #include <linux/kmsan.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include "highmem-internal.h" /** * kmap - Map a page for long term usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can only be invoked from preemptible task context because on 32bit * systems with CONFIG_HIGHMEM enabled this function might sleep. * * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area * this returns the virtual address of the direct kernel mapping. * * The returned virtual address is globally visible and valid up to the * point where it is unmapped via kunmap(). The pointer can be handed to * other contexts. * * For highmem pages on 32bit systems this can be slow as the mapping space * is limited and protected by a global lock. In case that there is no * mapping slot available the function blocks until a slot is released via * kunmap(). */ static inline void *kmap(struct page *page); /** * kunmap - Unmap the virtual address mapped by kmap() * @page: Pointer to the page which was mapped by kmap() * * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. */ static inline void kunmap(struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address * @addr: The address to look up * * Returns: The page which is mapped to @addr. */ static inline struct page *kmap_to_page(void *addr); /** * kmap_flush_unused - Flush all unused kmap mappings in order to * remove stray mappings */ static inline void kmap_flush_unused(void); /** * kmap_local_page - Map a page for temporary usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can be invoked from any context, including interrupts. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation: * * addr1 = kmap_local_page(page1); * addr2 = kmap_local_page(page2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While kmap_local_page() is significantly faster than kmap() for the highmem * case it comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ static inline void *kmap_local_page(struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage * @folio: The folio containing the page. * @offset: The byte offset within the folio which identifies the page. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation:: * * addr1 = kmap_local_folio(folio1, offset1); * addr2 = kmap_local_folio(folio2, offset2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While it is significantly faster than kmap() for the highmem case it * comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_folio() can rely on this side effect. * * Context: Can be invoked from any context. * Return: The virtual address of @offset. */ static inline void *kmap_local_folio(struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * In fact a wrapper around kmap_local_page() which also disables pagefaults * and, depending on PREEMPT_RT configuration, also CPU migration and * preemption. Therefore users should not count on the latter two side effects. * * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. * * It is used in atomic context when code wants to access the contents of a * page that might be allocated from high memory (see __GFP_HIGHMEM), for * example a page in the pagecache. The API has two functions, and they * can be used in a manner similar to the following:: * * // Find the page of interest. * struct page *page = find_get_page(mapping, offset); * * // Gain access to the contents of that page. * void *vaddr = kmap_atomic(page); * * // Do something to the contents of that page. * memset(vaddr, 0, PAGE_SIZE); * * // Unmap that page. * kunmap_atomic(vaddr); * * Note that the kunmap_atomic() call takes the result of the kmap_atomic() * call, not the argument. * * If you need to map two pages because you want to copy from one page to * another you need to keep the kmap_atomic calls strictly nested, like: * * vaddr1 = kmap_atomic(page1); * vaddr2 = kmap_atomic(page2); * * memcpy(vaddr1, vaddr2, PAGE_SIZE); * * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ static inline void *kmap_atomic(struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); #ifndef ARCH_HAS_FLUSH_ANON_PAGE static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { } #endif #ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { } #endif /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); kunmap_local(addr); } #endif #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. * @vma: The VMA the page is to be allocated for. * @vaddr: The virtual address the page will be inserted into. * * This function will allocate a page suitable for inserting into this * VMA at this virtual address. It may be allocated from highmem or * the movable zone. An architecture may provide its own implementation. * * Return: A folio containing one allocated and zeroed page or NULL if * we are out of memory. */ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { struct folio *folio; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio && user_alloc_needs_zeroing()) clear_user_highpage(&folio->page, vaddr); return folio; } #endif static inline void clear_highpage(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kaddr); kunmap_local(kaddr); } static inline void clear_highpage_kasan_tagged(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kasan_reset_tag(kaddr)); kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE static inline void tag_clear_highpage(struct page *page) { } #endif /* * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. */ #ifdef CONFIG_HIGHMEM void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2); #else static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } #endif static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) { zero_user_segments(page, start, end, 0, 0); } static inline void zero_user(struct page *page, unsigned start, unsigned size) { zero_user_segments(page, start, start + size, 0, 0); } #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifdef copy_mc_to_kernel /* * If architecture supports machine check exception handling, define the * #MC versions of copy_user_highpage and copy_highpage. They copy a memory * page with #MC in source page (@from) handled, and return the number * of bytes not copied if there was a #MC, otherwise 0 for success. */ static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } static inline int copy_mc_highpage(struct page *to, struct page *from) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } #else static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { copy_user_highpage(to, from, vaddr, vma); return 0; } static inline int copy_mc_highpage(struct page *to, struct page *from) { copy_highpage(to, from); return 0; } #endif static inline void memcpy_page(struct page *dst_page, size_t dst_off, struct page *src_page, size_t src_off, size_t len) { char *dst = kmap_local_page(dst_page); char *src = kmap_local_page(src_page); VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE); memcpy(dst + dst_off, src + src_off, len); kunmap_local(src); kunmap_local(dst); } static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, val, len); kunmap_local(addr); } static inline void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) { char *from = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to, from + offset, len); kunmap_local(from); } static inline void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len) { char *to = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to + offset, from, len); flush_dcache_page(page); kunmap_local(to); } static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, 0, len); flush_dcache_page(page); kunmap_local(addr); } /** * memcpy_from_folio - Copy a range of bytes from a folio. * @to: The memory to copy to. * @folio: The folio to read from. * @offset: The first byte in the folio to read. * @len: The number of bytes to copy. */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { const char *from = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_highmem(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(from); to += chunk; offset += chunk; len -= chunk; } while (len > 0); } /** * memcpy_to_folio - Copy a range of bytes to a folio. * @folio: The folio to write to. * @offset: The first byte in the folio to store to. * @from: The memory to copy from. * @len: The number of bytes to copy. */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { char *to = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_highmem(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(to); from += chunk; offset += chunk; len -= chunk; } while (len > 0); flush_dcache_folio(folio); } /** * folio_zero_tail - Zero the tail of a folio. * @folio: The folio to zero. * @offset: The byte offset in the folio to start zeroing at. * @kaddr: The address the folio is currently mapped to. * * If you have already used kmap_local_folio() to map a folio, written * some data to it and now need to zero the end of the folio (and flush * the dcache), you can use this function. If you do not have the * folio kmapped (eg the folio has been partially populated by DMA), * use folio_zero_range() or folio_zero_segment() instead. * * Return: An address which can be passed to kunmap_local(). */ static inline __must_check void *folio_zero_tail(struct folio *folio, size_t offset, void *kaddr) { size_t len = folio_size(folio) - offset; if (folio_test_highmem(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memset(kaddr, 0, max); kunmap_local(kaddr); len -= max; offset += max; max = PAGE_SIZE; kaddr = kmap_local_folio(folio, offset); } } memset(kaddr, 0, len); flush_dcache_folio(folio); return kaddr; } /** * folio_fill_tail - Copy some data to a folio and pad with zeroes. * @folio: The destination folio. * @offset: The offset into @folio at which to start copying. * @from: The data to copy. * @len: How many bytes of data to copy. * * This function is most useful for filesystems which support inline data. * When they want to copy data from the inode into the page cache, this * function does everything for them. It supports large folios even on * HIGHMEM configurations. */ static inline void folio_fill_tail(struct folio *folio, size_t offset, const char *from, size_t len) { char *to = kmap_local_folio(folio, offset); VM_BUG_ON(offset + len > folio_size(folio)); if (folio_test_highmem(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memcpy(to, from, max); kunmap_local(to); len -= max; from += max; offset += max; max = PAGE_SIZE; to = kmap_local_folio(folio, offset); } } memcpy(to, from, len); to = folio_zero_tail(folio, offset + len, to + len); kunmap_local(to); } /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. * @folio: The folio to copy from. * @pos: The position in the file. * @len: The maximum number of bytes to copy. * * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE * if the folio comes from HIGHMEM, and by the size of the folio. * * Return: The number of bytes copied from the folio. */ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, loff_t pos, size_t len) { size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); if (folio_test_highmem(folio)) { offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); } else len = min(len, folio_size(folio) - offset); memcpy(to, from, len); kunmap_local(from); return len; } /** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. * @start1: The first byte to zero. * @xend1: One more than the last byte in the first range. * @start2: The first byte to zero in the second range. * @xend2: One more than the last byte in the second range. */ static inline void folio_zero_segments(struct folio *folio, size_t start1, size_t xend1, size_t start2, size_t xend2) { zero_user_segments(&folio->page, start1, xend1, start2, xend2); } /** * folio_zero_segment() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @xend: One more than the last byte to zero. */ static inline void folio_zero_segment(struct folio *folio, size_t start, size_t xend) { zero_user_segments(&folio->page, start, xend, 0, 0); } /** * folio_zero_range() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @length: The number of bytes to zero. */ static inline void folio_zero_range(struct folio *folio, size_t start, size_t length) { zero_user_segments(&folio->page, start, start + length, 0, 0); } /** * folio_release_kmap - Unmap a folio and drop a refcount. * @folio: The folio to release. * @addr: The address previously returned by a call to kmap_local_folio(). * * It is common, eg in directory handling to kmap a folio. This function * unmaps the folio and drops the refcount that was being held to keep the * folio alive while we accessed it. */ static inline void folio_release_kmap(struct folio *folio, void *addr) { kunmap_local(addr); folio_put(folio); } static inline void unmap_and_put_page(struct page *page, void *addr) { folio_release_kmap(page_folio(page), addr); } #endif /* _LINUX_HIGHMEM_H */ |
| 4 4 4 4 4 4 4 4 4 4 4 4 4 4 11 1 9 10 11 11 11 11 11 11 11 11 11 11 11 10 11 11 11 11 11 2 9 2 9 11 11 11 11 11 11 11 11 11 11 11 4 4 4 1 1 1 4 4 4 4 4 4 4 4 4 4 4 4 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 23 22 22 23 24 4 6 6 6 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3 Protocol independent device support routines. * * Derived from the non IP parts of dev.c 1.0.19 * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * * Additional Authors: * Florian la Roche <rzsfl@rz.uni-sb.de> * Alan Cox <gw4pts@gw4pts.ampr.org> * David Hinds <dahinds@users.sourceforge.net> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Adam Sulmicki <adam@cfar.umd.edu> * Pekka Riikonen <priikone@poesidon.pspt.fi> * * Changes: * D.J. Barrow : Fixed bug where dev->refcnt gets set * to 2 if register_netdev gets called * before net_dev_init & also removed a * few lines of code in the process. * Alan Cox : device private ioctl copies fields back. * Alan Cox : Transmit queue code does relevant * stunts to keep the queue safe. * Alan Cox : Fixed double lock. * Alan Cox : Fixed promisc NULL pointer trap * ???????? : Support the full private ioctl range * Alan Cox : Moved ioctl permission check into * drivers * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI * Alan Cox : 100 backlog just doesn't cut it when * you start doing multicast video 8) * Alan Cox : Rewrote net_bh and list manager. * Alan Cox : Fix ETH_P_ALL echoback lengths. * Alan Cox : Took out transmit every packet pass * Saved a few bytes in the ioctl handler * Alan Cox : Network driver sets packet type before * calling netif_rx. Saves a function * call a packet. * Alan Cox : Hashed net_bh() * Richard Kooijman: Timestamp fixes. * Alan Cox : Wrong field in SIOCGIFDSTADDR * Alan Cox : Device lock protection. * Alan Cox : Fixed nasty side effect of device close * changes. * Rudi Cilibrasi : Pass the right thing to * set_mac_address() * Dave Miller : 32bit quantity for the device lock to * make it work out on a Sparc. * Bjorn Ekwall : Added KERNELD hack. * Alan Cox : Cleaned up the backlog initialise. * Craig Metz : SIOCGIFCONF fix if space for under * 1 device. * Thomas Bogendoerfer : Return ENODEV for dev_open, if there * is no device open function. * Andi Kleen : Fix error reporting for SIOCGIFCONF * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF * Cyrus Durgin : Cleaned for KMOD * Adam Sulmicki : Bug Fix : Network Device Unload * A network device unload needs to purge * the backlog queue. * Paul Rusty Russell : SIOCSIFNAME * Pekka Riikonen : Netdev boot-time settings code * Andrew Morton : Make unregister_netdevice wait * indefinitely on dev->refcnt * J Hadi Salim : - Backlog queue sampling * - netif_rx() feedback */ #include <linux/uaccess.h> #include <linux/bitmap.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/isolation.h> #include <linux/sched/mm.h> #include <linux/smpboot.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/ethtool_netlink.h> #include <linux/skbuff.h> #include <linux/kthread.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> #include <net/dsa.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/gro.h> #include <net/netdev_queues.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> #include <net/tcx.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netpoll.h> #include <linux/rcupdate.h> #include <linux/delay.h> #include <net/iw_handler.h> #include <asm/current.h> #include <linux/audit.h> #include <linux/dmaengine.h> #include <linux/err.h> #include <linux/ctype.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> #include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> #include <trace/events/napi.h> #include <trace/events/net.h> #include <trace/events/skb.h> #include <trace/events/qdisc.h> #include <trace/events/xdp.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> #include <linux/hashtable.h> #include <linux/vmalloc.h> #include <linux/if_macvlan.h> #include <linux/errqueue.h> #include <linux/hrtimer.h> #include <linux/netfilter_netdev.h> #include <linux/crash_dump.h> #include <linux/sctp.h> #include <net/udp_tunnel.h> #include <linux/net_namespace.h> #include <linux/indirect_call_wrapper.h> #include <net/devlink.h> #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> #include <net/rps.h> #include <linux/phy_link_topology.h> #include "dev.h" #include "devmem.h" #include "net-sysfs.h" static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static inline void dev_base_seq_inc(struct net *net) { unsigned int val = net->dev_base_seq + 1; WRITE_ONCE(net->dev_base_seq, val ?: 1); } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) { return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } #ifndef CONFIG_PREEMPT_RT static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); static int __init setup_backlog_napi_threads(char *arg) { static_branch_enable(&use_backlog_threads_key); return 0; } early_param("thread_backlog_napi", setup_backlog_napi_threads); static bool use_backlog_threads(void) { return static_branch_unlikely(&use_backlog_threads_key); } #else static bool use_backlog_threads(void) { return true; } #endif static inline void backlog_lock_irq_save(struct softnet_data *sd, unsigned long *flags) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); else local_irq_save(*flags); } static inline void backlog_lock_irq_disable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irq(&sd->input_pkt_queue.lock); else local_irq_disable(); } static inline void backlog_unlock_irq_restore(struct softnet_data *sd, unsigned long *flags) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); else local_irq_restore(*flags); } static inline void backlog_unlock_irq_enable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irq(&sd->input_pkt_queue.lock); else local_irq_enable(); } static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); if (!name_node) return NULL; INIT_HLIST_NODE(&name_node->hlist); name_node->dev = dev; name_node->name = name; return name_node; } static struct netdev_name_node * netdev_name_node_head_alloc(struct net_device *dev) { struct netdev_name_node *name_node; name_node = netdev_name_node_alloc(dev, dev->name); if (!name_node) return NULL; INIT_LIST_HEAD(&name_node->list); return name_node; } static void netdev_name_node_free(struct netdev_name_node *name_node) { kfree(name_node); } static void netdev_name_node_add(struct net *net, struct netdev_name_node *name_node) { hlist_add_head_rcu(&name_node->hlist, dev_name_hash(net, name_node->name)); } static void netdev_name_node_del(struct netdev_name_node *name_node) { hlist_del_rcu(&name_node->hlist); } static struct netdev_name_node *netdev_name_node_lookup(struct net *net, const char *name) { struct hlist_head *head = dev_name_hash(net, name); struct netdev_name_node *name_node; hlist_for_each_entry(name_node, head, hlist) if (!strcmp(name_node->name, name)) return name_node; return NULL; } static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, const char *name) { struct hlist_head *head = dev_name_hash(net, name); struct netdev_name_node *name_node; hlist_for_each_entry_rcu(name_node, head, hlist) if (!strcmp(name_node->name, name)) return name_node; return NULL; } bool netdev_name_in_use(struct net *net, const char *name) { return netdev_name_node_lookup(net, name); } EXPORT_SYMBOL(netdev_name_in_use); int netdev_name_node_alt_create(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); name_node = netdev_name_node_lookup(net, name); if (name_node) return -EEXIST; name_node = netdev_name_node_alloc(dev, name); if (!name_node) return -ENOMEM; netdev_name_node_add(net, name_node); /* The node that holds dev->name acts as a head of per-device list. */ list_add_tail_rcu(&name_node->list, &dev->name_node->list); return 0; } static void netdev_name_node_alt_free(struct rcu_head *head) { struct netdev_name_node *name_node = container_of(head, struct netdev_name_node, rcu); kfree(name_node->name); netdev_name_node_free(name_node); } static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) { netdev_name_node_del(name_node); list_del(&name_node->list); call_rcu(&name_node->rcu, netdev_name_node_alt_free); } int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); name_node = netdev_name_node_lookup(net, name); if (!name_node) return -ENOENT; /* lookup might have found our primary name or a name belonging * to another device. */ if (name_node == dev->name_node || name_node->dev != dev) return -EINVAL; __netdev_name_node_alt_destroy(name_node); return 0; } static void netdev_name_node_alt_flush(struct net_device *dev) { struct netdev_name_node *name_node, *tmp; list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) { list_del(&name_node->list); netdev_name_node_alt_free(&name_node->rcu); } } /* Device list insertion */ static void list_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); /* We reserved the ifindex, this can't fail */ WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ static void unlist_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); xa_erase(&net->dev_by_index, dev->ifindex); netdev_for_each_altname(dev, name_node) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); dev_base_seq_inc(dev_net(dev)); } /* * Our notifier list */ static RAW_NOTIFIER_HEAD(netdev_chain); /* * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. */ DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = { .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock), }; EXPORT_PER_CPU_SYMBOL(softnet_data); /* Page_pool has a lockless array/stack to alloc/recycle pages. * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ DEFINE_PER_CPU(struct page_pool *, system_page_pool); #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ static const unsigned short netdev_lock_type[] = { ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; static const char *const netdev_lock_name[] = { "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; static inline unsigned short netdev_lock_pos(unsigned short dev_type) { int i; for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) if (netdev_lock_type[i] == dev_type) return i; /* the last key is used by default */ return ARRAY_SIZE(netdev_lock_type) - 1; } static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { int i; i = netdev_lock_pos(dev_type); lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], netdev_lock_name[i]); } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { int i; i = netdev_lock_pos(dev->type); lockdep_set_class_and_name(&dev->addr_list_lock, &netdev_addr_lock_key[i], netdev_lock_name[i]); } #else static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { } #endif /******************************************************************************* * * Protocol management and registration routines * *******************************************************************************/ /* * Add a protocol ID to the list. Now that the input handler is * smarter we can dispense with all the messy stuff that used to be * here. * * BEWARE!!! Protocol handlers, mangling input packets, * MUST BE last in hash buckets and checking protocol handlers * MUST start from promiscuous ptype_all chain in net_bh. * It is true now, do not change it. * Explanation follows: if protocol handler, mangling packet, will * be the first on list, it is not able to sense, that packet * is cloned and should be copied-on-write, so that it will * change it and subsequent readers will get broken packet. * --ANK (980803) */ static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; else return pt->dev ? &pt->dev->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** * dev_add_pack - add packet handler * @pt: packet type declaration * * Add a protocol handler to the networking stack. The passed &packet_type * is linked into kernel lists and may not be freed until it has been * removed from the kernel lists. * * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new packet type (until the next received packet). */ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); } EXPORT_SYMBOL(dev_add_pack); /** * __dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * The packet type might still be in use by receivers * and must not be freed until after all the CPU's have gone * through a quiescent state. */ void __dev_remove_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); struct packet_type *pt1; spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { if (pt == pt1) { list_del_rcu(&pt->list); goto out; } } pr_warn("dev_remove_pack: %p not found\n", pt); out: spin_unlock(&ptype_lock); } EXPORT_SYMBOL(__dev_remove_pack); /** * dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * This call sleeps to guarantee that no CPU is looking at the packet * type after return. */ void dev_remove_pack(struct packet_type *pt) { __dev_remove_pack(pt); synchronize_net(); } EXPORT_SYMBOL(dev_remove_pack); /******************************************************************************* * * Device Interface Subroutines * *******************************************************************************/ /** * dev_get_iflink - get 'iflink' value of a interface * @dev: targeted interface * * Indicates the ifindex the interface is linked to. * Physical interfaces have the same 'ifindex' and 'iflink' values. */ int dev_get_iflink(const struct net_device *dev) { if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev); return READ_ONCE(dev->ifindex); } EXPORT_SYMBOL(dev_get_iflink); /** * dev_fill_metadata_dst - Retrieve tunnel egress information. * @dev: targeted interface * @skb: The packet. * * For better visibility of tunnel traffic OVS needs to retrieve * egress tunnel information for a packet. Following API allows * user to get this info. */ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info; if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) return -EINVAL; info = skb_tunnel_info_unclone(skb); if (!info) return -ENOMEM; if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) return -EINVAL; return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); } EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) { int k = stack->num_paths++; if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) return NULL; return &stack->path[k]; } int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, struct net_device_path_stack *stack) { const struct net_device *last_dev; struct net_device_path_ctx ctx = { .dev = dev, }; struct net_device_path *path; int ret = 0; memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); stack->num_paths = 0; while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { last_dev = ctx.dev; path = dev_fwd_path(stack); if (!path) return -1; memset(path, 0, sizeof(struct net_device_path)); ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); if (ret < 0) return -1; if (WARN_ON_ONCE(last_dev == ctx.dev)) return -1; } if (!ctx.dev) return ret; path = dev_fwd_path(stack); if (!path) return -1; path->type = DEV_PATH_ETHERNET; path->dev = ctx.dev; return ret; } EXPORT_SYMBOL_GPL(dev_fill_forward_path); /* must be called under rcu_read_lock(), as we dont take a reference */ static struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) if (napi->napi_id == napi_id) return napi; return NULL; } /* must be called under rcu_read_lock(), as we dont take a reference */ static struct napi_struct * netdev_napi_by_id(struct net *net, unsigned int napi_id) { struct napi_struct *napi; napi = napi_by_id(napi_id); if (!napi) return NULL; if (WARN_ON_ONCE(!napi->dev)) return NULL; if (!net_eq(net, dev_net(napi->dev))) return NULL; return napi; } /** * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it * @net: the applicable net namespace * @napi_id: ID of a NAPI of a target device * * Find a NAPI instance with @napi_id. Lock its device. * The device must be in %NETREG_REGISTERED state for lookup to succeed. * netdev_unlock() must be called to release it. * * Return: pointer to NAPI, its device with lock held, NULL if not found. */ struct napi_struct * netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) { struct napi_struct *napi; struct net_device *dev; rcu_read_lock(); napi = netdev_napi_by_id(net, napi_id); if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) { rcu_read_unlock(); return NULL; } dev = napi->dev; dev_hold(dev); rcu_read_unlock(); dev = __netdev_put_lock(dev); if (!dev) return NULL; rcu_read_lock(); napi = netdev_napi_by_id(net, napi_id); if (napi && napi->dev != dev) napi = NULL; rcu_read_unlock(); if (!napi) netdev_unlock(dev); return napi; } /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. Must be called under RTNL semaphore. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ struct net_device *__dev_get_by_name(struct net *net, const char *name) { struct netdev_name_node *node_name; node_name = netdev_name_node_lookup(net, name); return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); /** * dev_get_by_name_rcu - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. */ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { struct netdev_name_node *node_name; node_name = netdev_name_node_lookup_rcu(net, name); return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); /* Deprecated for new users, call netdev_get_by_name() instead */ struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_name); /** * netdev_get_by_name() - find a device by its name * @net: the applicable net namespace * @name: name to find * @tracker: tracking object for the acquired reference * @gfp: allocation flags for the tracker * * Find an interface by name. This can be called from any * context and does its own locking. The returned handle has * the usage count incremented and the caller must use netdev_put() to * release it when it is no longer needed. %NULL is returned if no * matching device is found. */ struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; dev = dev_get_by_name(net, name); if (dev) netdev_tracker_alloc(dev, tracker, gfp); return dev; } EXPORT_SYMBOL(netdev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold the RTNL semaphore. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(__dev_get_by_index); /** * dev_get_by_index_rcu - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry_rcu(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(dev_get_by_index_rcu); /* Deprecated for new users, call netdev_get_by_index() instead */ struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_index); /** * netdev_get_by_index() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * @tracker: tracking object for the acquired reference * @gfp: allocation flags for the tracker * * Search for an interface by index. Returns NULL if the device * is not found or a pointer to the device. The device returned has * had a reference added and the pointer is safe until the user calls * netdev_put() to indicate they have finished with it. */ struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; dev = dev_get_by_index(net, ifindex); if (dev) netdev_tracker_alloc(dev, tracker, gfp); return dev; } EXPORT_SYMBOL(netdev_get_by_index); /** * dev_get_by_napi_id - find a device by napi_id * @napi_id: ID of the NAPI struct * * Search for an interface by NAPI ID. Returns %NULL if the device * is not found or a pointer to the device. The device has not had * its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ struct net_device *dev_get_by_napi_id(unsigned int napi_id) { struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); if (napi_id < MIN_NAPI_ID) return NULL; napi = napi_by_id(napi_id); return napi ? napi->dev : NULL; } /* Release the held reference on the net_device, and if the net_device * is still registered try to lock the instance lock. If device is being * unregistered NULL will be returned (but the reference has been released, * either way!) * * This helper is intended for locking net_device after it has been looked up * using a lockless lookup helper. Lock prevents the instance from going away. */ struct net_device *__netdev_put_lock(struct net_device *dev) { netdev_lock(dev); if (dev->reg_state > NETREG_REGISTERED) { netdev_unlock(dev); dev_put(dev); return NULL; } dev_put(dev); return dev; } /** * netdev_get_by_index_lock() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. If a valid device * with @ifindex is found it will be returned with netdev->lock held. * netdev_unlock() must be called to release it. * * Return: pointer to a device with lock held, NULL if not found. */ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) { struct net_device *dev; dev = dev_get_by_index(net, ifindex); if (!dev) return NULL; return __netdev_put_lock(dev); } struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index) { if (dev) netdev_unlock(dev); do { rcu_read_lock(); dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); if (!dev) { rcu_read_unlock(); return NULL; } dev_hold(dev); rcu_read_unlock(); dev = __netdev_put_lock(dev); if (dev) return dev; (*index)++; } while (true); } static DEFINE_SEQLOCK(netdev_rename_lock); void netdev_copy_name(struct net_device *dev, char *name) { unsigned int seq; do { seq = read_seqbegin(&netdev_rename_lock); strscpy(name, dev->name, IFNAMSIZ); } while (read_seqretry(&netdev_rename_lock, seq)); } /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @ifindex: the ifindex of the interface to get the name from. */ int netdev_get_name(struct net *net, char *name, int ifindex) { struct net_device *dev; int ret; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { ret = -ENODEV; goto out; } netdev_copy_name(dev, name); ret = 0; out: rcu_read_unlock(); return ret; } /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. * The caller must hold RCU or RTNL. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * */ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *ha) { struct net_device *dev; for_each_netdev_rcu(net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; rcu_read_lock(); for_each_netdev_rcu(net, dev) if (dev->type == type) { dev_hold(dev); ret = dev; break; } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(dev_getfirstbyhwtype); /** * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside * rtnl_lock(), and result refcount is unchanged. */ struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) { struct net_device *dev, *ret; ASSERT_RTNL(); ret = NULL; for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; } } return ret; } EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device * @name: name string * * Network device names need to be valid file names to * allow sysfs to work. We also disallow any kind of * whitespace. */ bool dev_valid_name(const char *name) { if (*name == '\0') return false; if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) return false; if (!strcmp(name, ".") || !strcmp(name, "..")) return false; while (*name) { if (*name == '/' || *name == ':' || isspace(*name)) return false; name++; } return true; } EXPORT_SYMBOL(dev_valid_name); /** * __dev_alloc_name - allocate a name for a device * @net: network namespace to allocate the device name in * @name: name format string * @res: result name string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. */ static int __dev_alloc_name(struct net *net, const char *name, char *res) { int i = 0; const char *p; const int max_netdevices = 8*PAGE_SIZE; unsigned long *inuse; struct net_device *d; char buf[IFNAMSIZ]; /* Verify the string as this thing may have come from the user. * There must be one "%d" and no other "%" characters. */ p = strchr(name, '%'); if (!p || p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; /* Use one page as a bit array of possible slots */ inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC); if (!inuse) return -ENOMEM; for_each_netdev(net, d) { struct netdev_name_node *name_node; netdev_for_each_altname(d, name_node) { if (!sscanf(name_node->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, name_node->name, IFNAMSIZ)) __set_bit(i, inuse); } if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ)) __set_bit(i, inuse); } i = find_first_zero_bit(inuse, max_netdevices); bitmap_free(inuse); if (i == max_netdevices) return -ENFILE; /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ strscpy(buf, name, IFNAMSIZ); snprintf(res, IFNAMSIZ, buf, i); return i; } /* Returns negative errno or allocated unit id (see __dev_alloc_name()) */ static int dev_prep_valid_name(struct net *net, struct net_device *dev, const char *want_name, char *out_name, int dup_errno) { if (!dev_valid_name(want_name)) return -EINVAL; if (strchr(want_name, '%')) return __dev_alloc_name(net, want_name, out_name); if (netdev_name_in_use(net, want_name)) return -dup_errno; if (out_name != want_name) strscpy(out_name, want_name, IFNAMSIZ); return 0; } /** * dev_alloc_name - allocate a name for a device * @dev: device * @name: name format string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. */ int dev_alloc_name(struct net_device *dev, const char *name) { return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE); } EXPORT_SYMBOL(dev_alloc_name); static int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { int ret; ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST); return ret < 0 ? ret : 0; } /** * dev_change_name - change name of a device * @dev: device * @newname: name (or format string) must be at least IFNAMSIZ * * Change name of a device, can pass format strings "eth%d". * for wildcarding. */ int dev_change_name(struct net_device *dev, const char *newname) { struct net *net = dev_net(dev); unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; ASSERT_RTNL_NET(net); if (!strncmp(newname, dev->name, IFNAMSIZ)) return 0; memcpy(oldname, dev->name, IFNAMSIZ); write_seqlock_bh(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); write_sequnlock_bh(&netdev_rename_lock); if (err < 0) return err; if (oldname[0] && !strchr(oldname, '%')) netdev_info(dev, "renamed from %s%s\n", oldname, dev->flags & IFF_UP ? " (while UP)" : ""); old_assign_type = dev->name_assign_type; WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); WRITE_ONCE(dev->name_assign_type, old_assign_type); return ret; } netdev_adjacent_rename_links(dev, oldname); netdev_name_node_del(dev->name_node); synchronize_net(); netdev_name_node_add(net, dev->name_node); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); if (ret) { /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; goto rollback; } else { netdev_err(dev, "name change rollback failed: %d\n", ret); } } return err; } /** * dev_set_alias - change ifalias of a device * @dev: device * @alias: name up to IFALIASZ * @len: limit of bytes to copy from info * * Set ifalias for a device, */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; if (len) { new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); if (!new_alias) return -ENOMEM; memcpy(new_alias->ifalias, alias, len); new_alias->ifalias[len] = 0; } mutex_lock(&ifalias_mutex); new_alias = rcu_replace_pointer(dev->ifalias, new_alias, mutex_is_locked(&ifalias_mutex)); mutex_unlock(&ifalias_mutex); if (new_alias) kfree_rcu(new_alias, rcuhead); return len; } EXPORT_SYMBOL(dev_set_alias); /** * dev_get_alias - get ifalias of a device * @dev: device * @name: buffer to store name of ifalias * @len: size of buffer * * get ifalias for a device. Caller must make sure dev cannot go * away, e.g. rcu read lock or own a reference count to device. */ int dev_get_alias(const struct net_device *dev, char *name, size_t len) { const struct dev_ifalias *alias; int ret = 0; rcu_read_lock(); alias = rcu_dereference(dev->ifalias); if (alias) ret = snprintf(name, len, "%s", alias->ifalias); rcu_read_unlock(); return ret; } /** * netdev_features_change - device changes features * @dev: device to cause notification * * Called to indicate a device has changed features. */ void netdev_features_change(struct net_device *dev) { call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); } EXPORT_SYMBOL(netdev_features_change); /** * netdev_state_change - device changes state * @dev: device to cause notification * * Called to indicate a device has changed state. This function calls * the notifier chains for netdev_chain and sends a NEWLINK message * to the routing socket. */ void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { struct netdev_notifier_change_info change_info = { .info.dev = dev, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); } } EXPORT_SYMBOL(netdev_state_change); /** * __netdev_notify_peers - notify network peers about existence of @dev, * to be called when rtnl lock is already held. * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. */ void __netdev_notify_peers(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); } EXPORT_SYMBOL(__netdev_notify_peers); /** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. */ void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); static int napi_threaded_poll(void *data); static int napi_kthread_create(struct napi_struct *n) { int err = 0; /* Create and wake up the kthread once to put it in * TASK_INTERRUPTIBLE mode to avoid the blocked task * warning and work with loadavg. */ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", n->dev->name, n->napi_id); if (IS_ERR(n->thread)) { err = PTR_ERR(n->thread); pr_err("kthread_run failed with err %d\n", err); n->thread = NULL; } return err; } static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; ASSERT_RTNL(); dev_addr_check(dev); if (!netif_device_present(dev)) { /* may be detached because parent is runtime-suspended */ if (dev->dev.parent) pm_runtime_resume(dev->dev.parent); if (!netif_device_present(dev)) return -ENODEV; } /* Block netpoll from trying to do any rx path servicing. * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ netpoll_poll_disable(dev); ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; set_bit(__LINK_STATE_START, &dev->state); if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); netpoll_poll_enable(dev); if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { netif_set_up(dev, true); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); } return ret; } /** * dev_open - prepare an interface for use. * @dev: device to open * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally * the device is moved into the up state and a %NETDEV_UP message is * sent to the netdev notifier chain. * * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. */ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; ret = __dev_open(dev, extack); if (ret < 0) return ret; rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; } EXPORT_SYMBOL(dev_open); static void __dev_close_many(struct list_head *head) { struct net_device *dev; ASSERT_RTNL(); might_sleep(); list_for_each_entry(dev, head, close_list) { /* Temporarily disable netpoll until the interface is down */ netpoll_poll_disable(dev); call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); /* Synchronize to scheduled poll. We cannot touch poll list, it * can be even on different cpu. So just clear netif_running(). * * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head); list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* * Call the device specific close. This cannot fail. * Only if device is UP * * We allow it to be called even after a DETACH hot-plug * event. */ if (ops->ndo_stop) ops->ndo_stop(dev); netif_set_up(dev, false); netpoll_poll_enable(dev); } } static void __dev_close(struct net_device *dev) { LIST_HEAD(single); list_add(&dev->close_list, &single); __dev_close_many(&single); list_del(&single); } void dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; /* Remove the devices that don't need to be closed */ list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) list_del_init(&dev->close_list); __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_DOWN, dev); if (unlink) list_del_init(&dev->close_list); } } EXPORT_SYMBOL(dev_close_many); /** * dev_close - shutdown an interface. * @dev: device to shutdown * * This function moves an active device into down state. A * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier * chain. */ void dev_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); list_add(&dev->close_list, &single); dev_close_many(&single, true); list_del(&single); } } EXPORT_SYMBOL(dev_close); /** * dev_disable_lro - disable Large Receive Offload on a device * @dev: device * * Disable Large Receive Offload (LRO) on a net device. Must be * called under RTNL. This is needed if received packets may be * forwarded to another interface. */ void dev_disable_lro(struct net_device *dev) { struct net_device *lower_dev; struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); netdev_for_each_lower_dev(dev, lower_dev, iter) dev_disable_lro(lower_dev); } EXPORT_SYMBOL(dev_disable_lro); /** * dev_disable_gro_hw - disable HW Generic Receive Offload on a device * @dev: device * * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be * called under RTNL. This is needed if Generic XDP is installed on * the device. */ static void dev_disable_gro_hw(struct net_device *dev) { dev->wanted_features &= ~NETIF_F_GRO_HW; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_GRO_HW)) netdev_WARN(dev, "failed to disable GRO_HW!\n"); } const char *netdev_cmd_to_name(enum netdev_cmd cmd) { #define N(val) \ case NETDEV_##val: \ return "NETDEV_" __stringify(val); switch (cmd) { N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) N(XDP_FEAT_CHANGE) } #undef N return "UNKNOWN_NETDEV_EVENT"; } EXPORT_SYMBOL_GPL(netdev_cmd_to_name); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { struct netdev_notifier_info info = { .dev = dev, }; return nb->notifier_call(nb, val, &info); } static int call_netdevice_register_notifiers(struct notifier_block *nb, struct net_device *dev) { int err; err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) return err; if (!(dev->flags & IFF_UP)) return 0; call_netdevice_notifier(nb, NETDEV_UP, dev); return 0; } static void call_netdevice_unregister_notifiers(struct notifier_block *nb, struct net_device *dev) { if (dev->flags & IFF_UP) { call_netdevice_notifier(nb, NETDEV_GOING_DOWN, dev); call_netdevice_notifier(nb, NETDEV_DOWN, dev); } call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } static int call_netdevice_register_net_notifiers(struct notifier_block *nb, struct net *net) { struct net_device *dev; int err; for_each_netdev(net, dev) { err = call_netdevice_register_notifiers(nb, dev); if (err) goto rollback; } return 0; rollback: for_each_netdev_continue_reverse(net, dev) call_netdevice_unregister_notifiers(nb, dev); return err; } static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, struct net *net) { struct net_device *dev; for_each_netdev(net, dev) call_netdevice_unregister_notifiers(nb, dev); } static int dev_boot_phase = 1; /** * register_netdevice_notifier - register a network notifier block * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier(struct notifier_block *nb) { struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); /* When RTNL is removed, we need protection for netdev_chain. */ rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { __rtnl_net_lock(net); err = call_netdevice_register_net_notifiers(nb, net); __rtnl_net_unlock(net); if (err) goto rollback; } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; rollback: for_each_net_continue_reverse(net) { __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); __rtnl_net_unlock(net); } raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } EXPORT_SYMBOL(register_netdevice_notifier); /** * unregister_netdevice_notifier - unregister a network notifier block * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier(). The notifier is unlinked into the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. */ int unregister_netdevice_notifier(struct notifier_block *nb) { struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) goto unlock; for_each_net(net) { __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); __rtnl_net_unlock(net); } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier); static int __register_netdevice_notifier_net(struct net *net, struct notifier_block *nb, bool ignore_call_fail) { int err; err = raw_notifier_chain_register(&net->netdev_chain, nb); if (err) return err; if (dev_boot_phase) return 0; err = call_netdevice_register_net_notifiers(nb, net); if (err && !ignore_call_fail) goto chain_unregister; return 0; chain_unregister: raw_notifier_chain_unregister(&net->netdev_chain, nb); return err; } static int __unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; err = raw_notifier_chain_unregister(&net->netdev_chain, nb); if (err) return err; call_netdevice_unregister_net_notifiers(nb, net); return 0; } /** * register_netdevice_notifier_net - register a per-netns network notifier block * @net: network namespace * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(register_netdevice_notifier_net); /** * unregister_netdevice_notifier_net - unregister a per-netns * network notifier block * @net: network namespace * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier_net(). The notifier is unlinked from the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. */ int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; rtnl_net_lock(net); err = __unregister_netdevice_notifier_net(net, nb); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_net); static void __move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, struct notifier_block *nb) { __unregister_netdevice_notifier_net(src_net, nb); __register_netdevice_notifier_net(dst_net, nb, true); } int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { struct net *net = dev_net(dev); int err; rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(register_netdevice_notifier_dev_net); int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { struct net *net = dev_net(dev); int err; rtnl_net_lock(net); list_del(&nn->list); err = __unregister_netdevice_notifier_net(net, nb); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); static void move_netdevice_notifiers_dev_net(struct net_device *dev, struct net *net) { struct netdev_net_notifier *nn; list_for_each_entry(nn, &dev->net_notifier_list, list) __move_netdevice_notifier_net(dev_net(dev), net, nn->nb); } /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); int ret; ASSERT_RTNL(); /* Run per-netns notifier block chain first, then run the global one. * Hopefully, one day, the global one is going to be removed after * all notifier block registrators get converted to be per-netns. */ ret = raw_notifier_call_chain(&net->netdev_chain, val, info); if (ret & NOTIFY_STOP_MASK) return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } /** * call_netdevice_notifiers_info_robust - call per-netns notifier blocks * for and rollback on error * @val_up: value passed unmodified to notifier function * @val_down: value passed unmodified to the notifier function when * recovering from an error on @val_up * @info: notifier information data * * Call all per-netns network notifier blocks, but not notifier blocks on * the global notifier chain. Parameters and return value are as for * raw_notifier_call_chain_robust(). */ static int call_netdevice_notifiers_info_robust(unsigned long val_up, unsigned long val_down, struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); ASSERT_RTNL(); return raw_notifier_call_chain_robust(&net->netdev_chain, val_up, val_down, info); } static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_notifier_info info = { .dev = dev, .extack = extack, }; return call_netdevice_notifiers_info(val, &info); } /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); /** * call_netdevice_notifiers_mtu - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * @arg: additional u32 argument passed to the notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ static int call_netdevice_notifiers_mtu(unsigned long val, struct net_device *dev, u32 arg) { struct netdev_notifier_info_ext info = { .info.dev = dev, .ext.mtu = arg, }; BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); return call_netdevice_notifiers_info(val, &info.info); } #ifdef CONFIG_NET_INGRESS static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); void net_inc_ingress_queue(void) { static_branch_inc(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_ingress_queue); void net_dec_ingress_queue(void) { static_branch_dec(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif #ifdef CONFIG_NET_EGRESS static DEFINE_STATIC_KEY_FALSE(egress_needed_key); void net_inc_egress_queue(void) { static_branch_inc(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_egress_queue); void net_dec_egress_queue(void) { static_branch_dec(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif #ifdef CONFIG_NET_CLS_ACT DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); EXPORT_SYMBOL(tcf_sw_enabled_key); #endif DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; static void netstamp_clear(struct work_struct *work) { int deferred = atomic_xchg(&netstamp_needed_deferred, 0); int wanted; wanted = atomic_add_return(deferred, &netstamp_wanted); if (wanted > 0) static_branch_enable(&netstamp_needed_key); else static_branch_disable(&netstamp_needed_key); } static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted = atomic_read(&netstamp_wanted); while (wanted > 0) { if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1)) return; } atomic_inc(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_inc(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted = atomic_read(&netstamp_wanted); while (wanted > 1) { if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1)) return; } atomic_dec(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_dec(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); } #define net_timestamp_check(COND, SKB) \ if (static_branch_unlikely(&netstamp_needed_key)) { \ if ((COND) && !(SKB)->tstamp) \ (SKB)->tstamp = ktime_get_real(); \ } \ bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { return __is_skb_forwardable(dev, skb, true); } EXPORT_SYMBOL_GPL(is_skb_forwardable); static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, bool check_mtu) { int ret = ____dev_forward_skb(dev, skb, check_mtu); if (likely(!ret)) { skb->protocol = eth_type_trans(skb, dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } return ret; } int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb2(dev, skb, true); } EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif * * @dev: destination network device * @skb: buffer to forward * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped, but freed) * * dev_forward_skb can be used for injecting an skb from the * start_xmit function of one device into the receive queue * of another device. * * The receiving device may be in another namespace, so * we have to clear all information in the skb that could * impact namespace isolation. */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); } static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } static inline void deliver_ptype_list_skb(struct sk_buff *skb, struct packet_type **pt, struct net_device *orig_dev, __be16 type, struct list_head *ptype_list) { struct packet_type *ptype, *pt_prev = *pt; list_for_each_entry_rcu(ptype, ptype_list, list) { if (ptype->type != type) continue; if (pt_prev) deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } *pt = pt_prev; } static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) return false; if (ptype->id_match) return ptype->id_match(ptype, skb->sk); else if ((struct sock *)ptype->af_packet_priv == skb->sk) return true; return false; } /** * dev_nit_active - return true if any network interface taps are in use * * @dev: network device to check for the presence of taps */ bool dev_nit_active(struct net_device *dev) { return !list_empty(&net_hotdata.ptype_all) || !list_empty(&dev->ptype_all); } EXPORT_SYMBOL_GPL(dev_nit_active); /* * Support routine. Sends outgoing frames to any network * taps currently in use. */ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct list_head *ptype_list = &net_hotdata.ptype_all; struct packet_type *ptype, *pt_prev = NULL; struct sk_buff *skb2 = NULL; rcu_read_lock(); again: list_for_each_entry_rcu(ptype, ptype_list, list) { if (READ_ONCE(ptype->ignore_outgoing)) continue; /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ if (skb_loop_sk(ptype, skb)) continue; if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; continue; } /* need to clone skb, done only once */ skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) goto out_unlock; net_timestamp_set(skb2); /* skb->nh should be correctly * set by sender, so that the second statement is * just protection against buggy protocols. */ skb_reset_mac_header(skb2); if (skb_network_header(skb2) < skb2->data || skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ntohs(skb2->protocol), dev->name); skb_reset_network_header(skb2); } skb2->transport_header = skb2->network_header; skb2->pkt_type = PACKET_OUTGOING; pt_prev = ptype; } if (ptype_list == &net_hotdata.ptype_all) { ptype_list = &dev->ptype_all; goto again; } out_unlock: if (pt_prev) { if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); else kfree_skb(skb2); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); /** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change * @dev: Network device * @txq: number of queues available * * If real_num_tx_queues is changed the tc mappings may no longer be * valid. To resolve this verify the tc mapping remains valid and if * not NULL the mapping. With no priorities mapping to this * offset/count pair it will no longer be used. In the worst case TC0 * is invalid nothing can be done so disable priority mappings. If is * expected that drivers will fix this mapping if they can before * calling netif_set_real_num_tx_queues. */ static void netif_setup_tc(struct net_device *dev, unsigned int txq) { int i; struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; /* If TC0 is invalidated disable TC mapping */ if (tc->offset + tc->count > txq) { netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); dev->num_tc = 0; return; } /* Invalidated prio to tc mappings set to TC0 */ for (i = 1; i < TC_BITMASK + 1; i++) { int q = netdev_get_prio_tc_map(dev, i); tc = &dev->tc_to_txq[q]; if (tc->offset + tc->count > txq) { netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", i, q); netdev_set_prio_tc_map(dev, i, 0); } } } int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) { if (dev->num_tc) { struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i; /* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i; } /* didn't find it, just return -1 to indicate no match */ return -1; } return 0; } EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS static struct static_key xps_needed __read_mostly; static struct static_key xps_rxqs_needed __read_mostly; static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_dev_maps *old_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; for (pos = map->len; pos--;) { if (map->queues[pos] != index) continue; if (map->len > 1) { map->queues[pos] = map->queues[--map->len]; break; } if (old_maps) RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; } return true; } static bool remove_xps_queue_cpu(struct net_device *dev, struct xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { int num_tc = dev_maps->num_tc; bool active = false; int tci; for (tci = cpu * num_tc; num_tc--; tci++) { int i, j; for (i = count, j = offset; i--; j++) { if (!remove_xps_queue(dev_maps, NULL, tci, j)) break; } active |= i < 0; } return active; } static void reset_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, enum xps_map_type type) { static_key_slow_dec_cpuslocked(&xps_needed); if (type == XPS_RXQS) static_key_slow_dec_cpuslocked(&xps_rxqs_needed); RCU_INIT_POINTER(dev->xps_maps[type], NULL); kfree_rcu(dev_maps, rcu); } static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, u16 offset, u16 count) { struct xps_dev_maps *dev_maps; bool active = false; int i, j; dev_maps = xmap_dereference(dev->xps_maps[type]); if (!dev_maps) return; for (j = 0; j < dev_maps->nr_ids; j++) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) reset_xps_maps(dev, dev_maps, type); if (type == XPS_CPUS) { for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } } static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { if (!static_key_false(&xps_needed)) return; cpus_read_lock(); mutex_lock(&xps_map_mutex); if (static_key_false(&xps_rxqs_needed)) clean_xps_maps(dev, XPS_RXQS, offset, count); clean_xps_maps(dev, XPS_CPUS, offset, count); mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) { netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); } static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; int i, pos; for (pos = 0; map && pos < map->len; pos++) { if (map->queues[pos] != index) continue; return map; } /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; alloc_len = map->alloc_len * 2; } /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's * map */ if (is_rxqs_map) new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); else new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, cpu_to_node(attr_index)); if (!new_map) return NULL; for (i = 0; i < pos; i++) new_map->queues[i] = map->queues[i]; new_map->alloc_len = alloc_len; new_map->len = pos; return new_map; } /* Copy xps maps at a given index */ static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, struct xps_dev_maps *new_dev_maps, int index, int tc, bool skip_tc) { int i, tci = index * dev_maps->num_tc; struct xps_map *map; /* copy maps belonging to foreign traffic classes */ for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc && skip_tc) continue; /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } } /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) { struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; const unsigned long *online_mask = NULL; bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; unsigned int nr_ids; WARN_ON_ONCE(index >= dev->num_tx_queues); if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; if (num_tc < 0) return -EINVAL; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; } mutex_lock(&xps_map_mutex); dev_maps = xmap_dereference(dev->xps_maps[type]); if (type == XPS_RXQS) { maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); nr_ids = nr_cpu_ids; } if (maps_sz < L1_CACHE_BYTES) maps_sz = L1_CACHE_BYTES; /* The old dev_maps could be larger or smaller than the one we're * setting up now, as dev->num_tc or nr_ids could have been updated in * between. We could try to be smart, but let's be safe instead and only * copy foreign traffic classes if the two map sizes match. */ if (dev_maps && dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) copy = true; /* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), j < nr_ids;) { if (!new_dev_maps) { new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { mutex_unlock(&xps_map_mutex); return -ENOMEM; } new_dev_maps->nr_ids = nr_ids; new_dev_maps->num_tc = num_tc; } tci = j * num_tc + tc; map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; if (!dev_maps) { /* Increment static keys at most once per type */ static_key_slow_inc_cpuslocked(&xps_needed); if (type == XPS_RXQS) static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } for (j = 0; j < nr_ids; j++) { bool skip_tc = false; tci = j * num_tc + tc; if (netif_attr_test_mask(j, mask, nr_ids) && netif_attr_test_online(j, online_mask, nr_ids)) { /* add tx-queue to CPU/rx-queue maps */ int pos = 0; skip_tc = true; map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA if (type == XPS_CPUS) { if (numa_node_id == -2) numa_node_id = cpu_to_node(j); else if (numa_node_id != cpu_to_node(j)) numa_node_id = -1; } #endif } if (copy) xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, skip_tc); } rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; for (j = 0; j < dev_maps->nr_ids; j++) { for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) continue; if (copy) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); if (map == new_map) continue; } RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); } } old_dev_maps = dev_maps; out_no_old_maps: dev_maps = new_dev_maps; active = true; out_no_new_maps: if (type == XPS_CPUS) /* update Tx queue numa node */ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); if (!dev_maps) goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ for (j = 0; j < dev_maps->nr_ids; j++) { tci = j * dev_maps->num_tc; for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc && netif_attr_test_mask(j, mask, dev_maps->nr_ids) && netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) continue; active |= remove_xps_queue(dev_maps, copy ? old_dev_maps : NULL, tci, index); } } if (old_dev_maps) kfree_rcu(old_dev_maps, rcu); /* free map if not active */ if (!active) reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); return 0; error: /* remove any maps that we added */ for (j = 0; j < nr_ids; j++) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) kfree(new_map); } } mutex_unlock(&xps_map_mutex); kfree(new_dev_maps); return -ENOMEM; } EXPORT_SYMBOL_GPL(__netif_set_xps_queue); int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { int ret; cpus_read_lock(); ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); cpus_read_unlock(); return ret; } EXPORT_SYMBOL(netif_set_xps_queue); #endif static void netdev_unbind_all_sb_channels(struct net_device *dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; /* Unbind any subordinate channels */ while (txq-- != &dev->_tx[0]) { if (txq->sb_dev) netdev_unbind_sb_channel(dev, txq->sb_dev); } } void netdev_reset_tc(struct net_device *dev) { #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); /* Reset TC configuration of device */ dev->num_tc = 0; memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); } EXPORT_SYMBOL(netdev_reset_tc); int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) { if (tc >= dev->num_tc) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues(dev, offset, count); #endif dev->tc_to_txq[tc].count = count; dev->tc_to_txq[tc].offset = offset; return 0; } EXPORT_SYMBOL(netdev_set_tc_queue); int netdev_set_num_tc(struct net_device *dev, u8 num_tc) { if (num_tc > TC_MAX_QUEUE) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); dev->num_tc = num_tc; return 0; } EXPORT_SYMBOL(netdev_set_num_tc); void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(sb_dev, 0); #endif memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); while (txq-- != &dev->_tx[0]) { if (txq->sb_dev == sb_dev) txq->sb_dev = NULL; } } EXPORT_SYMBOL(netdev_unbind_sb_channel); int netdev_bind_sb_channel_queue(struct net_device *dev, struct net_device *sb_dev, u8 tc, u16 count, u16 offset) { /* Make certain the sb_dev and dev are already configured */ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) return -EINVAL; /* We cannot hand out queues we don't have */ if ((offset + count) > dev->real_num_tx_queues) return -EINVAL; /* Record the mapping */ sb_dev->tc_to_txq[tc].count = count; sb_dev->tc_to_txq[tc].offset = offset; /* Provide a way for Tx queue to find the tc_to_txq map or * XPS map for itself. */ while (count--) netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; return 0; } EXPORT_SYMBOL(netdev_bind_sb_channel_queue); int netdev_set_sb_channel(struct net_device *dev, u16 channel) { /* Do not use a multiqueue device to represent a subordinate channel */ if (netif_is_multiqueue(dev)) return -ENODEV; /* We allow channels 1 - 32767 to be used for subordinate channels. * Channel 0 is meant to be "native" mode and used only to represent * the main root device. We allow writing 0 to reset the device back * to normal mode after being used as a subordinate channel. */ if (channel > S16_MAX) return -EINVAL; dev->num_tc = -channel; return 0; } EXPORT_SYMBOL(netdev_set_sb_channel); /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. */ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { bool disabling; int rc; disabling = txq < dev->real_num_tx_queues; if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); if (rc) return rc; if (dev->num_tc) netif_setup_tc(dev, txq); net_shaper_set_real_num_tx_queues(dev, txq); dev_qdisc_change_real_num_tx(dev, txq); dev->real_num_tx_queues = txq; if (disabling) { synchronize_net(); qdisc_reset_all_tx_gt(dev, txq); #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, txq); #endif } } else { dev->real_num_tx_queues = txq; } return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); #ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device * @rxq: Actual number of RX queues * * This must be called either with the rtnl_lock held or before * registration of the net device. Returns 0 on success, or a * negative error code. If called before registration, it always * succeeds. */ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { int rc; if (rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); if (rc) return rc; } dev->real_num_rx_queues = rxq; return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif /** * netif_set_real_num_queues - set actual number of RX and TX queues used * @dev: Network device * @txq: Actual number of TX queues * @rxq: Actual number of RX queues * * Set the real number of both TX and RX queues. * Does nothing if the number of queues is already correct. */ int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq) { unsigned int old_rxq = dev->real_num_rx_queues; int err; if (txq < 1 || txq > dev->num_tx_queues || rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; /* Start from increases, so the error path only does decreases - * decreases can't fail. */ if (rxq > dev->real_num_rx_queues) { err = netif_set_real_num_rx_queues(dev, rxq); if (err) return err; } if (txq > dev->real_num_tx_queues) { err = netif_set_real_num_tx_queues(dev, txq); if (err) goto undo_rx; } if (rxq < dev->real_num_rx_queues) WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); if (txq < dev->real_num_tx_queues) WARN_ON(netif_set_real_num_tx_queues(dev, txq)); return 0; undo_rx: WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); return err; } EXPORT_SYMBOL(netif_set_real_num_queues); /** * netif_set_tso_max_size() - set the max size of TSO frames supported * @dev: netdev to update * @size: max skb->len of a TSO frame * * Set the limit on the size of TSO super-frames the device can handle. * Unless explicitly set the stack will assume the value of * %GSO_LEGACY_MAX_SIZE. */ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) { dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); if (size < READ_ONCE(dev->gso_ipv4_max_size)) netif_set_gso_ipv4_max_size(dev, size); } EXPORT_SYMBOL(netif_set_tso_max_size); /** * netif_set_tso_max_segs() - set the max number of segs supported for TSO * @dev: netdev to update * @segs: max number of TCP segments * * Set the limit on the number of TCP segments the device can generate from * a single TSO super-frame. * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. */ void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) { dev->tso_max_segs = segs; if (segs < READ_ONCE(dev->gso_max_segs)) netif_set_gso_max_segs(dev, segs); } EXPORT_SYMBOL(netif_set_tso_max_segs); /** * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper * @to: netdev to update * @from: netdev from which to copy the limits */ void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) { netif_set_tso_max_size(to, from->tso_max_size); netif_set_tso_max_segs(to, from->tso_max_segs); } EXPORT_SYMBOL(netif_inherit_tso_max); /** * netif_get_num_default_rss_queues - default number of RSS queues * * Default value is the number of physical cores if there are only 1 or 2, or * divided by 2 if there are more. */ int netif_get_num_default_rss_queues(void) { cpumask_var_t cpus; int cpu, count = 0; if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) return 1; cpumask_copy(cpus, cpu_online_mask); for_each_cpu(cpu, cpus) { ++count; cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); } free_cpumask_var(cpus); return count > 2 ? DIV_ROUND_UP(count, 2) : count; } EXPORT_SYMBOL(netif_get_num_default_rss_queues); static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } void __netif_schedule(struct Qdisc *q) { if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } EXPORT_SYMBOL(__netif_schedule); struct dev_kfree_skb_cb { enum skb_drop_reason reason; }; static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) { return (struct dev_kfree_skb_cb *)skb->cb; } void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); if (!netif_xmit_stopped(txq)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); } rcu_read_unlock(); } EXPORT_SYMBOL(netif_schedule_queue); void netif_tx_wake_queue(struct netdev_queue *dev_queue) { if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { struct Qdisc *q; rcu_read_lock(); q = rcu_dereference(dev_queue->qdisc); __netif_schedule(q); rcu_read_unlock(); } } EXPORT_SYMBOL(netif_tx_wake_queue); void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason) { unsigned long flags; if (unlikely(!skb)) return; if (likely(refcount_read(&skb->users) == 1)) { smp_rmb(); refcount_set(&skb->users, 0); } else if (likely(!refcount_dec_and_test(&skb->users))) { return; } get_kfree_skb_cb(skb)->reason = reason; local_irq_save(flags); skb->next = __this_cpu_read(softnet_data.completion_queue); __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(dev_kfree_skb_irq_reason); void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason) { if (in_hardirq() || irqs_disabled()) dev_kfree_skb_irq_reason(skb, reason); else kfree_skb_reason(skb, reason); } EXPORT_SYMBOL(dev_kfree_skb_any_reason); /** * netif_device_detach - mark device as removed * @dev: network device * * Mark device as removed from system and therefore no longer available. */ void netif_device_detach(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_stop_all_queues(dev); } } EXPORT_SYMBOL(netif_device_detach); /** * netif_device_attach - mark device as attached * @dev: network device * * Mark device as attached from system and restart if needed. */ void netif_device_attach(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); /* * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. */ static u16 skb_tx_hash(const struct net_device *dev, const struct net_device *sb_dev, struct sk_buff *skb) { u32 hash; u16 qoffset = 0; u16 qcount = dev->real_num_tx_queues; if (dev->num_tc) { u8 tc = netdev_get_prio_tc_map(dev, skb->priority); qoffset = sb_dev->tc_to_txq[tc].offset; qcount = sb_dev->tc_to_txq[tc].count; if (unlikely(!qcount)) { net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", sb_dev->name, qoffset, tc); qoffset = 0; qcount = dev->real_num_tx_queues; } } if (skb_rx_queue_recorded(skb)) { DEBUG_NET_WARN_ON_ONCE(qcount == 0); hash = skb_get_rx_queue(skb); if (hash >= qoffset) hash -= qoffset; while (unlikely(hash >= qcount)) hash -= qcount; return hash + qoffset; } return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features; struct net_device *dev = skb->dev; const char *name = ""; if (!net_ratelimit()) return; if (dev) { if (dev->dev.parent) name = dev_driver_string(dev->dev.parent); else name = netdev_name(dev); } skb_dump(KERN_WARNING, skb, false); WARN(1, "%s: caps=(%pNF, %pNF)\n", name, dev ? &dev->features : &null_features, skb->sk ? &skb->sk->sk_route_caps : &null_features); } /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. */ int skb_checksum_help(struct sk_buff *skb) { __wsum csum; int ret = 0, offset; if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } if (!skb_frags_readable(skb)) { return -EFAULT; } /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. */ if (skb_has_shared_frag(skb)) { ret = __skb_linearize(skb); if (ret) goto out; } offset = skb_checksum_start_offset(skb); ret = -EINVAL; if (unlikely(offset >= skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n", offset, skb_headlen(skb)); goto out; } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n", offset + sizeof(__sum16), skb_headlen(skb)); goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); if (ret) goto out; *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: return ret; } EXPORT_SYMBOL(skb_checksum_help); int skb_crc32c_csum_help(struct sk_buff *skb) { __le32 crc32c_csum; int ret = 0, offset, start; if (skb->ip_summed != CHECKSUM_PARTIAL) goto out; if (unlikely(skb_is_gso(skb))) goto out; /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. */ if (unlikely(skb_has_shared_frag(skb))) { ret = __skb_linearize(skb); if (ret) goto out; } start = skb_checksum_start_offset(skb); offset = start + offsetof(struct sctphdr, checksum); if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ret = -EINVAL; goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__le32)); if (ret) goto out; crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, skb->len - start, ~(__u32)0, crc32c_csum_stub)); *(__le32 *)(skb->data + offset) = crc32c_csum; skb_reset_csum_not_inet(skb); out: return ret; } EXPORT_SYMBOL(skb_crc32c_csum_help); __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; /* Tunnel gso handlers can set protocol to ethernet. */ if (type == htons(ETH_P_TEB)) { struct ethhdr *eth; if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) return 0; eth = (struct ethhdr *)skb->data; type = eth->h_proto; } return vlan_get_protocol_and_depth(skb, type, depth); } /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { netdev_err(dev, "hw csum failure\n"); skb_dump(KERN_ERR, skb, true); dump_stack(); } void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); } EXPORT_SYMBOL(netdev_rx_csum_fault); #endif /* XXX: check that highmem exists at all on the given machine. */ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = skb_frag_page(frag); if (page && PageHighMem(page)) return 1; } } #endif return 0; } /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. */ #if IS_ENABLED(CONFIG_NET_MPLS_GSO) static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; } #else static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { return features; } #endif static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t features) { __be16 type; type = skb_network_protocol(skb, NULL); features = net_mpls_features(skb, features, type); if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } if (illegal_highdma(skb->dev, skb)) features &= ~NETIF_F_SG; return features; } netdev_features_t passthru_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return features; } EXPORT_SYMBOL(passthru_features_check); static netdev_features_t dflt_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return vlan_features_check(skb, features); } static netdev_features_t gso_features_check(const struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { u16 gso_segs = skb_shinfo(skb)->gso_segs; if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb))) return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { skb_warn_bad_offload(skb); return features & ~NETIF_F_GSO_MASK; } /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now * and we can pull them back in after we have partially * segmented the frame. */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; /* Make sure to clear the IPv4 ID mangling feature if the * IPv4 header has the potential to be fragmented. */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb); if (!(iph->frag_off & htons(IP_DF))) features &= ~NETIF_F_TSO_MANGLEID; } return features; } netdev_features_t netif_skb_features(struct sk_buff *skb) { struct net_device *dev = skb->dev; netdev_features_t features = dev->features; if (skb_is_gso(skb)) features = gso_features_check(skb, dev, features); /* If encapsulation offload request, verify we are testing * hardware encapsulation features instead of standard * features for the netdev */ if (skb->encapsulation) features &= dev->hw_enc_features; if (skb_vlan_tagged(skb)) features = netdev_intersect_features(features, dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (dev->netdev_ops->ndo_features_check) features &= dev->netdev_ops->ndo_features_check(skb, dev, features); else features &= dflt_features_check(skb, dev, features); return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); static int xmit_one(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { unsigned int len; int rc; if (dev_nit_active(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; trace_net_dev_start_xmit(skb, dev); rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); return rc; } struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, struct netdev_queue *txq, int *ret) { struct sk_buff *skb = first; int rc = NETDEV_TX_OK; while (skb) { struct sk_buff *next = skb->next; skb_mark_not_on_list(skb); rc = xmit_one(skb, dev, txq, next != NULL); if (unlikely(!dev_xmit_complete(rc))) { skb->next = next; goto out; } skb = next; if (netif_tx_queue_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } } out: *ret = rc; return skb; } static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) skb = __vlan_hwaccel_push_inside(skb); return skb; } int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features) { if (unlikely(skb_csum_is_sctp(skb))) return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb); if (features & NETIF_F_HW_CSUM) return 0; if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && skb_network_header_len(skb) != sizeof(struct ipv6hdr) && !ipv6_has_hopopt_jumbo(skb)) goto sw_checksum; switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): return 0; } } sw_checksum: return skb_checksum_help(skb); } EXPORT_SYMBOL(skb_csum_hwoffload_help); static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) goto out_null; skb = sk_validate_xmit_skb(skb, dev); if (unlikely(!skb)) goto out_null; if (netif_needs_gso(skb, features)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); if (IS_ERR(segs)) { goto out_kfree_skb; } else if (segs) { consume_skb(skb); skb = segs; } } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. */ if (skb->ip_summed == CHECKSUM_PARTIAL) { if (skb->encapsulation) skb_set_inner_transport_header(skb, skb_checksum_start_offset(skb)); else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); if (skb_csum_hwoffload_help(skb, features)) goto out_kfree_skb; } } skb = validate_xmit_xfrm(skb, features, again); return skb; out_kfree_skb: kfree_skb(skb); out_null: dev_core_stats_tx_dropped_inc(dev); return NULL; } struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) { struct sk_buff *next, *head = NULL, *tail; for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); /* in case skb won't be segmented, point to itself */ skb->prev = skb; skb = validate_xmit_skb(skb, dev, again); if (!skb) continue; if (!head) head = skb; else tail->next = skb; /* If skb was segmented, skb->prev points to * the last segment. If not, it still contains skb. */ tail = skb->prev; } return head; } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); static void qdisc_pkt_len_init(struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ if (shinfo->gso_size && skb_transport_header_was_set(skb)) { u16 gso_segs = shinfo->gso_segs; unsigned int hdr_len; /* mac layer + network layer */ hdr_len = skb_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { const struct tcphdr *th; struct tcphdr _tcphdr; th = skb_header_pointer(skb, hdr_len, sizeof(_tcphdr), &_tcphdr); if (likely(th)) hdr_len += __tcp_hdrlen(th); } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { struct udphdr _udphdr; if (skb_header_pointer(skb, hdr_len, sizeof(_udphdr), &_udphdr)) hdr_len += sizeof(struct udphdr); } if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) { int payload = skb->len - hdr_len; /* Malicious packet. */ if (payload <= 0) return; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } } static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, struct sk_buff **to_free, struct netdev_queue *txq) { int rc; rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; if (rc == NET_XMIT_SUCCESS) trace_qdisc_enqueue(q, txq, skb); return rc; } static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); struct sk_buff *to_free = NULL; bool contended; int rc; qdisc_calculate_pkt_len(skb, q); tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP); if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && qdisc_run_begin(q)) { /* Retest nolock_qdisc_is_empty() within the protection * of q->seqlock to protect from racing with requeuing. */ if (unlikely(!nolock_qdisc_is_empty(q))) { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); qdisc_run_end(q); goto no_lock_out; } qdisc_bstats_cpu_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && !nolock_qdisc_is_empty(q)) __qdisc_run(q); qdisc_run_end(q); return NET_XMIT_SUCCESS; } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); qdisc_run(q); no_lock_out: if (unlikely(to_free)) kfree_skb_list_reason(to_free, tcf_get_drop_reason(to_free)); return rc; } if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); return NET_XMIT_DROP; } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit * and then other tasks will only enqueue packets. The packets will be * sent after the qdisc owner is scheduled again. To prevent this * scenario the task always serialize on the lock. */ contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); if (unlikely(contended)) spin_lock(&q->busylock); spin_lock(root_lock); if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ qdisc_bstats_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; } __qdisc_run(q); } qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { WRITE_ONCE(q->owner, smp_processor_id()); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; } __qdisc_run(q); qdisc_run_end(q); } } spin_unlock(root_lock); if (unlikely(to_free)) kfree_skb_list_reason(to_free, tcf_get_drop_reason(to_free)); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; } #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { const struct netprio_map *map; const struct sock *sk; unsigned int prioidx; if (skb->priority) return; map = rcu_dereference_bh(skb->dev->priomap); if (!map) return; sk = skb_to_full_sk(skb); if (!sk) return; prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; } #else #define skb_update_prio(skb) #endif /** * dev_loopback_xmit - loop back @skb * @net: network namespace this loopback is happening in * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; if (skb->ip_summed == CHECKSUM_NONE) skb->ip_summed = CHECKSUM_UNNECESSARY; DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); skb_dst_force(skb); netif_rx(skb); return 0; } EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS static struct netdev_queue * netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) { int qm = skb_get_queue_mapping(skb); return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); } #ifndef CONFIG_PREEMPT_RT static bool netdev_xmit_txqueue_skipped(void) { return __this_cpu_read(softnet_data.xmit.skip_txqueue); } void netdev_xmit_skip_txqueue(bool skip) { __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #else static bool netdev_xmit_txqueue_skipped(void) { return current->net_xmit.skip_txqueue; } void netdev_xmit_skip_txqueue(bool skip) { current->net_xmit.skip_txqueue = skip; } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #endif #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_NET_XGRESS static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, enum skb_drop_reason *drop_reason) { int ret = TC_ACT_UNSPEC; #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); struct tcf_result res; if (!miniq) return ret; /* Global bypass */ if (!static_branch_likely(&tcf_sw_enabled_key)) return ret; /* Block-wise bypass */ if (tcf_block_bypass_sw(miniq->block)) return ret; tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); /* Only tcf related quirks below. */ switch (ret) { case TC_ACT_SHOT: *drop_reason = tcf_get_drop_reason(skb); mini_qdisc_qstats_cpu_drop(miniq); break; case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(res.classid); break; } #endif /* CONFIG_NET_CLS_ACT */ return ret; } static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); void tcx_inc(void) { static_branch_inc(&tcx_needed_key); } void tcx_dec(void) { static_branch_dec(&tcx_needed_key); } static __always_inline enum tcx_action_base tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, const bool needs_mac) { const struct bpf_mprog_fp *fp; const struct bpf_prog *prog; int ret = TCX_NEXT; if (needs_mac) __skb_push(skb, skb->mac_len); bpf_mprog_foreach_prog(entry, fp, prog) { bpf_compute_data_pointers(skb); ret = bpf_prog_run(prog, skb); if (ret != TCX_NEXT) break; } if (needs_mac) __skb_pull(skb, skb->mac_len); return tcx_action_code(skb, ret); } static __always_inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } qdisc_skb_cb(skb)->pkt_len = skb->len; tcx_set_ingress(skb, true); if (static_branch_unlikely(&tcx_needed_key)) { sch_ret = tcx_run(entry, skb, true); if (sch_ret != TC_ACT_UNSPEC) goto ingress_verdict; } sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); ingress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* skb_mac_header check was done by BPF, so we can safely * push the L2 header back before redirecting to another * netdev. */ __skb_push(skb, skb->mac_len); if (skb_do_redirect(skb) == -EAGAIN) { __skb_pull(skb, skb->mac_len); *another = true; break; } *ret = NET_RX_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_RX_DROP; bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); fallthrough; case TC_ACT_CONSUMED: *ret = NET_RX_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; } bpf_net_ctx_clear(bpf_net_ctx); return skb; } static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was * already set by the caller. */ if (static_branch_unlikely(&tcx_needed_key)) { sch_ret = tcx_run(entry, skb, false); if (sch_ret != TC_ACT_UNSPEC) goto egress_verdict; } sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); egress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_XMIT_DROP; bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); fallthrough; case TC_ACT_CONSUMED: *ret = NET_XMIT_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; } bpf_net_ctx_clear(bpf_net_ctx); return skb; } #else static __always_inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { return skb; } static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { return skb; } #endif /* CONFIG_NET_XGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) { int tc = netdev_get_prio_tc_map(dev, skb->priority); struct xps_map *map; int queue_index = -1; if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) return queue_index; tci *= dev_maps->num_tc; tci += tc; map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { if (map->len == 1) queue_index = map->queues[0]; else queue_index = map->queues[reciprocal_scale( skb_get_hash(skb), map->len)]; if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } return queue_index; } #endif static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; struct sock *sk = skb->sk; int queue_index = -1; if (!static_key_false(&xps_needed)) return -1; rcu_read_lock(); if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); if (dev_maps) { int tci = sk_rx_queue_get(sk); if (tci >= 0) queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } get_cpus_map: if (queue_index < 0) { dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } } rcu_read_unlock(); return queue_index; #else return -1; #endif } u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { return 0; } EXPORT_SYMBOL(dev_pick_tx_zero); u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); sb_dev = sb_dev ? : dev; if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { int new_index = get_xps_queue(dev, sb_dev, skb); if (new_index < 0) new_index = skb_tx_hash(dev, sb_dev, skb); if (queue_index != new_index && sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); queue_index = new_index; } return queue_index; } EXPORT_SYMBOL(netdev_pick_tx); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { int queue_index = 0; #ifdef CONFIG_XPS u32 sender_cpu = skb->sender_cpu - 1; if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) queue_index = ops->ndo_select_queue(dev, skb, sb_dev); else queue_index = netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); } /** * __dev_queue_xmit() - transmit a buffer * @skb: buffer to transmit * @sb_dev: suboordinate device used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling * this function. The function can be called from an interrupt. * * When calling this method, interrupts MUST be enabled. This is because * the BH enable code must have IRQs enabled so that it will not deadlock. * * Regardless of the return value, the skb is consumed, so it is currently * difficult to retry a send to this method. (You can bump the ref count * before sending to hold a reference for retry if you are careful.) * * Return: * * 0 - buffer successfully transmitted * * positive qdisc return code - NET_XMIT_DROP etc. * * negative errno - other errors */ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq = NULL; struct Qdisc *q; int rc = -ENOMEM; bool again = false; skb_reset_mac_header(skb); skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); skb_update_prio(skb); qdisc_pkt_len_init(skb); tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { skb = nf_hook_egress(skb, &rc, dev); if (!skb) goto out; } netdev_xmit_skip_txqueue(false); nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; nf_skip_egress(skb, false); if (netdev_xmit_txqueue_skipped()) txq = netdev_tx_queue_mapping(dev, skb); } #endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. */ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); else skb_dst_force(skb); if (!txq) txq = netdev_core_pick_tx(dev, skb, sb_dev); q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); goto out; } /* The device has no queue. Common case for software devices: * loopback, all the sorts of tunnels... * Really, it is unlikely that netif_tx_lock protection is necessary * here. (f.e. loopback and IP tunnels are clean ignoring statistics * counters.) * However, it is possible, that they rely on protection * made by us here. * Check this and shot the lock. It is not prone from deadlocks. *Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ /* Other cpus might concurrently change txq->xmit_lock_owner * to -1 or to their cpu id, but not to our id. */ if (READ_ONCE(txq->xmit_lock_owner) != cpu) { if (dev_xmit_recursion()) goto recursion_alert; skb = validate_xmit_skb(skb, dev, &again); if (!skb) goto out; HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { dev_xmit_recursion_inc(); skb = dev_hard_start_xmit(skb, dev, txq, &rc); dev_xmit_recursion_dec(); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; } } HARD_TX_UNLOCK(dev, txq); net_crit_ratelimited("Virtual device %s asks to queue packet!\n", dev->name); } else { /* Recursion is detected! It is possible, * unfortunately */ recursion_alert: net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", dev->name); } } rc = -ENETDOWN; rcu_read_unlock_bh(); dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); return rc; } EXPORT_SYMBOL(__dev_queue_xmit); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; bool again = false; if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) goto drop; skb = validate_xmit_skb_list(skb, dev, &again); if (skb != orig_skb) goto drop; skb_set_queue_mapping(skb, queue_id); txq = skb_get_tx_queue(dev, skb); local_bh_disable(); dev_xmit_recursion_inc(); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); dev_xmit_recursion_dec(); local_bh_enable(); return ret; drop: dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return NET_XMIT_DROP; } EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ static DEFINE_PER_CPU(struct task_struct *, backlog_napi); int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { struct task_struct *thread; lockdep_assert_irqs_disabled(); if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in * napi_enable()/dev_set_threaded(). * Use READ_ONCE() to guarantee a complete * read on napi->thread. Only call * wake_up_process() when it's not NULL. */ thread = READ_ONCE(napi->thread); if (thread) { if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) goto use_local_napi; set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } } use_local_napi: list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() * we have to raise NET_RX_SOFTIRQ. */ if (!sd->in_net_rx_action) __raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { if (next_cpu < nr_cpu_ids) { u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; u16 rxq_index; u32 flow_id; int rc; /* Should we steer this flow to a different hardware queue? */ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || !(dev->features & NETIF_F_NTUPLE)) goto out; rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); if (rxq_index == skb_get_rx_queue(skb)) goto out; rxqueue = dev->_rx + rxq_index; flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; flow_id = skb_get_hash(skb) & flow_table->mask; rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; old_rflow = rflow; rflow = &flow_table->flows[flow_id]; WRITE_ONCE(rflow->filter, rc); if (old_rflow->filter == rc) WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: #endif head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); rps_input_queue_tail_save(&rflow->last_qtail, head); } WRITE_ONCE(rflow->cpu, next_cpu); return rflow; } /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. * rcu_read_lock must be held on entry. */ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { const struct rps_sock_flow_table *sock_flow_table; struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; struct rps_map *map; int cpu = -1; u32 tcpu; u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); goto done; } rxqueue += index; } /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ flow_table = rcu_dereference(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); if (!flow_table && !map) goto done; skb_reset_network_header(skb); hash = skb_get_hash(skb); if (!hash) goto done; sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (flow_table && sock_flow_table) { struct rps_dev_flow *rflow; u32 next_cpu; u32 ident; /* First check into global flow table if there is a match. * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). */ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps; next_cpu = ident & net_hotdata.rps_cpu_mask; /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ rflow = &flow_table->flows[hash & flow_table->mask]; tcpu = rflow->cpu; /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. * This guarantees that all previous packets for the flow * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; } } try_rps: if (map) { tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; } } done: return cpu; } #ifdef CONFIG_RFS_ACCEL /** * rps_may_expire_flow - check whether an RFS hardware filter may be removed * @dev: Device on which the filter was set * @rxq_index: RX queue index * @flow_id: Flow ID passed to ndo_rx_flow_steer() * @filter_id: Filter ID returned by ndo_rx_flow_steer() * * Drivers that implement ndo_rx_flow_steer() should periodically call * this function for each installed filter and remove the filters for * which it returns %true. */ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id) { struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - READ_ONCE(rflow->last_qtail)) < (int)(10 * flow_table->mask))) expire = false; } rcu_read_unlock(); return expire; } EXPORT_SYMBOL(rps_may_expire_flow); #endif /* CONFIG_RFS_ACCEL */ /* Called from hardirq (IPI) context */ static void rps_trigger_softirq(void *data) { struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); sd->received_rps++; } #endif /* CONFIG_RPS */ /* Called from hardirq (IPI) context */ static void trigger_rx_softirq(void *data) { struct softnet_data *sd = data; __raise_softirq_irqoff(NET_RX_SOFTIRQ); smp_store_release(&sd->defer_ipi_scheduled, 0); } /* * After we queued a packet into sd->input_pkt_queue, * we need to make sure this queue is serviced soon. * * - If this is another cpu queue, link it to our rps_ipi_list, * and make sure we will process rps_ipi_list from net_rx_action(). * * - If this is our own queue, NAPI schedule our backlog. * Note that this also raises NET_RX_SOFTIRQ. */ static void napi_schedule_rps(struct softnet_data *sd) { struct softnet_data *mysd = this_cpu_ptr(&softnet_data); #ifdef CONFIG_RPS if (sd != mysd) { if (use_backlog_threads()) { __napi_schedule_irqoff(&sd->backlog); return; } sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; /* If not called from net_rx_action() or napi_threaded_poll() * we have to raise NET_RX_SOFTIRQ. */ if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll) __raise_softirq_irqoff(NET_RX_SOFTIRQ); return; } #endif /* CONFIG_RPS */ __napi_schedule_irqoff(&mysd->backlog); } void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) { unsigned long flags; if (use_backlog_threads()) { backlog_lock_irq_save(sd, &flags); if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) __napi_schedule_irqoff(&sd->backlog); backlog_unlock_irq_restore(sd, &flags); } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { smp_call_function_single_async(cpu, &sd->defer_csd); } } #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) { #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit *fl; struct softnet_data *sd; unsigned int old_flow, new_flow; if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; fl->history_head++; fl->history_head &= FLOW_LIMIT_HISTORY - 1; if (likely(fl->buckets[old_flow])) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { fl->count++; rcu_read_unlock(); return true; } } rcu_read_unlock(); #endif return false; } /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). */ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { enum skb_drop_reason reason; struct softnet_data *sd; unsigned long flags; unsigned int qlen; int max_backlog; u32 tail; reason = SKB_DROP_REASON_DEV_READY; if (!netif_running(skb->dev)) goto bad_dev; reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); qlen = skb_queue_len_lockless(&sd->input_pkt_queue); max_backlog = READ_ONCE(net_hotdata.max_backlog); if (unlikely(qlen > max_backlog)) goto cpu_backlog_drop; backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { if (!qlen) { /* Schedule NAPI for backlog device. We can use * non atomic operation as we own the queue lock. */ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) napi_schedule_rps(sd); } __skb_queue_tail(&sd->input_pkt_queue, skb); tail = rps_input_queue_tail_incr(sd); backlog_unlock_irq_restore(sd, &flags); /* save the tail outside of the critical section */ rps_input_queue_tail_save(qtail, tail); return NET_RX_SUCCESS; } backlog_unlock_irq_restore(sd, &flags); cpu_backlog_drop: atomic_inc(&sd->dropped); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; } static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct netdev_rx_queue *rxqueue; rxqueue = dev->_rx; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); return rxqueue; /* Return first rxqueue */ } rxqueue += index; } return rxqueue; } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; u32 metalen, act; int off; /* The XDP program wants to see the packet starting at the MAC * header. */ mac_len = skb->data - skb_mac_header(skb); hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ frame_sz = (void *)skb_end_pointer(skb) - hard_start; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); rxqueue = netif_get_rxqueue(skb); xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); if (skb_is_nonlinear(skb)) { skb_shinfo(skb)->xdp_frags_size = skb->data_len; xdp_buff_set_frags_flag(xdp); } else { xdp_buff_clear_frags_flag(xdp); } orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; act = bpf_prog_run_xdp(xdp_prog, xdp); /* check if bpf_xdp_adjust_head was used */ off = xdp->data - orig_data; if (off) { if (off > 0) __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); skb->mac_header += off; skb_reset_network_header(skb); } /* check if bpf_xdp_adjust_tail was used */ off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); skb->len += off; /* positive on grow, negative on shrink */ } /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. */ if (xdp_buff_has_frags(xdp)) skb->data_len = skb_shinfo(skb)->xdp_frags_size; else skb->data_len = 0; /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || (orig_host != ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); } /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull * before calling us again on redirect path. We do not call do_redirect * as we leave that up to the caller. * * Caller is responsible for managing lifetime of skb (i.e. calling * kfree_skb in response to actions it cannot handle/XDP_DROP). */ switch (act) { case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); break; case XDP_PASS: metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); break; } return act; } static int netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) { struct sk_buff *skb = *pskb; int err, hroom, troom; if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) return 0; /* In case we have to go down the path and also linearize, * then lets do the pskb_expand_head() work just once here. */ hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); troom = skb->tail + skb->data_len - skb->end; err = pskb_expand_head(skb, hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, troom > 0 ? troom + 128 : 0, GFP_ATOMIC); if (err) return err; return skb_linearize(skb); } static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct sk_buff *skb = *pskb; u32 mac_len, act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ if (skb_is_redirected(skb)) return XDP_PASS; /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM * bytes. This is the guarantee that also native XDP provides, * thus we need to do it here as well. */ mac_len = skb->data - skb_mac_header(skb); __skb_push(skb, mac_len); if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { if (netif_skb_check_for_xdp(pskb, xdp_prog)) goto do_drop; } __skb_pull(*pskb, mac_len); act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); switch (act) { case XDP_REDIRECT: case XDP_TX: case XDP_PASS: break; default: bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_DROP: do_drop: kfree_skb(*pskb); break; } return act; } /* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. This also means * that XDP packets are able to starve other packets going through a qdisc, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue. */ void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; bool free_skb = true; int cpu, rc; txq = netdev_core_pick_tx(dev, skb, NULL); cpu = smp_processor_id(); HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_frozen_or_drv_stopped(txq)) { rc = netdev_start_xmit(skb, dev, txq, 0); if (dev_xmit_complete(rc)) free_skb = false; } HARD_TX_UNLOCK(dev, txq); if (free_skb) { trace_xdp_exception(dev, xdp_prog, XDP_TX); dev_core_stats_tx_dropped_inc(dev); kfree_skb(skb); } } static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: err = xdp_do_generic_redirect((*pskb)->dev, *pskb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: generic_xdp_tx(*pskb, xdp_prog); break; } bpf_net_ctx_clear(bpf_net_ctx); return XDP_DROP; } bpf_net_ctx_clear(bpf_net_ctx); } return XDP_PASS; out_redir: bpf_net_ctx_clear(bpf_net_ctx); kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); static int netif_rx_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_rx(skb); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu < 0) cpu = smp_processor_id(); ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); } else #endif { unsigned int qtail; ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } /** * __netif_rx - Slightly optimized version of netif_rx * @skb: buffer to post * * This behaves as netif_rx except that it does not disable bottom halves. * As a result this function may only be invoked from the interrupt context * (either hard or soft interrupt). */ int __netif_rx(struct sk_buff *skb) { int ret; lockdep_assert_once(hardirq_count() | softirq_count()); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); return ret; } EXPORT_SYMBOL(__netif_rx); /** * netif_rx - post buffer to the network code * @skb: buffer to post * * This function receives a packet from a device driver and queues it for * the upper (protocol) levels to process via the backlog NAPI device. It * always succeeds. The buffer may be dropped during processing for * congestion control or by the protocol layers. * The network buffer is passed via the backlog NAPI device. Modern NIC * driver should use NAPI and GRO. * This function can used from interrupt and from process context. The * caller from process context must not disable interrupts before invoking * this function. * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped) * */ int netif_rx(struct sk_buff *skb) { bool need_bh_off = !(hardirq_count() | softirq_count()); int ret; if (need_bh_off) local_bh_disable(); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); if (need_bh_off) local_bh_enable(); return ret; } EXPORT_SYMBOL(netif_rx); static __latent_entropy void net_tx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { struct sk_buff *clist; local_irq_disable(); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_enable(); while (clist) { struct sk_buff *skb = clist; clist = clist->next; WARN_ON(refcount_read(&skb->users)); if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED)) trace_consume_skb(skb, net_tx_action); else trace_kfree_skb(skb, net_tx_action, get_kfree_skb_cb(skb)->reason, NULL); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else __napi_kfree_skb(skb, get_kfree_skb_cb(skb)->reason); } } if (sd->output_queue) { struct Qdisc *head; local_irq_disable(); head = sd->output_queue; sd->output_queue = NULL; sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); rcu_read_lock(); while (head) { struct Qdisc *q = head; spinlock_t *root_lock = NULL; head = head->next_sched; /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); if (!(q->flags & TCQ_F_NOLOCK)) { root_lock = qdisc_lock(q); spin_lock(root_lock); } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { /* There is a synchronize_net() between * STATE_DEACTIVATED flag being set and * qdisc_reset()/some_qdisc_is_busy() in * dev_deactivate(), so we can safely bail out * early here to avoid data race between * qdisc_deactivate() and some_qdisc_is_busy() * for lockless qdisc. */ clear_bit(__QDISC_STATE_SCHED, &q->state); continue; } clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); if (root_lock) spin_unlock(root_lock); } rcu_read_unlock(); } xfrm_dev_backlog(sd); } #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check * * Check if a receive handler is already registered for a given device. * Return true if there one. * * The caller must hold the rtnl_mutex. */ bool netdev_is_rx_handler_busy(struct net_device *dev) { ASSERT_RTNL(); return dev && rtnl_dereference(dev->rx_handler); } EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * * The caller must hold the rtnl_mutex. * * For a general description of rx_handler, see enum rx_handler_result. */ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { if (netdev_is_rx_handler_busy(dev)) return -EBUSY; if (dev->priv_flags & IFF_NO_RX_HANDLER) return -EINVAL; /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); return 0; } EXPORT_SYMBOL_GPL(netdev_rx_handler_register); /** * netdev_rx_handler_unregister - unregister receive handler * @dev: device to unregister a handler from * * Unregister a receive handler from a device. * * The caller must hold the rtnl_mutex. */ void netdev_rx_handler_unregister(struct net_device *dev) { ASSERT_RTNL(); RCU_INIT_POINTER(dev->rx_handler, NULL); /* a reader seeing a non NULL rx_handler in a rcu_read_lock() * section has a guarantee to see a non NULL rx_handler_data * as well. */ synchronize_net(); RCU_INIT_POINTER(dev->rx_handler_data, NULL); } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); /* * Limit the use of PFMEMALLOC reserves to those protocols that implement * the special handling of PFMEMALLOC skbs. */ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_ARP): case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): return true; default: return false; } } static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { if (nf_hook_ingress_active(skb)) { int ingress_retval; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } rcu_read_lock(); ingress_retval = nf_hook_ingress(skb); rcu_read_unlock(); return ingress_retval; } return 0; } static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, struct packet_type **ppt_prev) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct sk_buff *skb = *pskb; struct net_device *orig_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_receive_skb(skb); orig_dev = skb->dev; skb_reset_network_header(skb); #if !defined(CONFIG_DEBUG_NET) /* We plan to no longer reset the transport header here. * Give some time to fuzzers and dev build to catch bugs * in network stacks. */ if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); #endif skb_reset_mac_len(skb); pt_prev = NULL; another_round: skb->skb_iif = skb->dev->ifindex; __this_cpu_inc(softnet_data.processed); if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret2; migrate_disable(); ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), &skb); migrate_enable(); if (ret2 != XDP_PASS) { ret = NET_RX_DROP; goto out; } } if (eth_type_vlan(skb->protocol)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; } if (skb_skip_tc_classify(skb)) goto skip_classify; if (pfmemalloc) goto skip_taps; list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { bool another = false; nf_skip_egress(skb, true); skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, &another); if (another) goto another_round; if (!skb) goto out; nf_skip_egress(skb, false); if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) goto out; } #endif skb_reset_redirect(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; if (skb_vlan_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto out; } rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; break; case RX_HANDLER_PASS: break; default: BUG(); } } if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { check_vlan_id: if (skb_vlan_tag_get_id(skb)) { /* Vlan id is non 0 and vlan_do_receive() above couldn't * find vlan device. */ skb->pkt_type = PACKET_OTHERHOST; } else if (eth_type_vlan(skb->protocol)) { /* Outer header is 802.1P with vlan 0, inner header is * 802.1Q or 802.1AD and vlan_do_receive() above could * not find vlan dev for vlan id 0. */ __vlan_hwaccel_clear_tag(skb); skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; if (vlan_do_receive(&skb)) /* After stripping off 802.1P header with vlan 0 * vlan dev is found for inner header. */ goto another_round; else if (unlikely(!skb)) goto out; else /* We have stripped outer 802.1P vlan 0 header. * But could not find vlan dev. * check again for vlan id to set OTHERHOST. */ goto check_vlan_id; } /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; /* deliver only exact match when indicated */ if (likely(!deliver_exact)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &orig_dev->ptype_specific); if (unlikely(skb->dev != orig_dev)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &skb->dev->ptype_specific); } if (pt_prev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; *ppt_prev = pt_prev; } else { drop: if (!deliver_exact) dev_core_stats_rx_dropped_inc(skb->dev); else dev_core_stats_rx_nohandler_inc(skb->dev); kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ ret = NET_RX_DROP; } out: /* The invariant here is that if *ppt_prev is not NULL * then skb should also be non-NULL. * * Apparently *ppt_prev assignment above holds this invariant due to * skb dereferencing near it. */ *pskb = skb; return ret; } static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; int ret; ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (pt_prev) ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, skb->dev, pt_prev, orig_dev); return ret; } /** * netif_receive_skb_core - special purpose version of netif_receive_skb * @skb: buffer to process * * More direct receive version of netif_receive_skb(). It should * only be used by callers that have a need to skip RPS and Generic XDP. * Caller must also take care of handling if ``(page_is_)pfmemalloc``. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb_core(struct sk_buff *skb) { int ret; rcu_read_lock(); ret = __netif_receive_skb_one_core(skb, false); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(netif_receive_skb_core); static inline void __netif_receive_skb_list_ptype(struct list_head *head, struct packet_type *pt_prev, struct net_device *orig_dev) { struct sk_buff *skb, *next; if (!pt_prev) return; if (list_empty(head)) return; if (pt_prev->list_func != NULL) INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, ip_list_rcv, head, pt_prev, orig_dev); else list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } } static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) { /* Fast-path assumptions: * - There is no RX handler. * - Only one packet_type matches. * If either of these fails, we will end up doing some per-packet * processing in-line, then handling the 'last ptype' for the whole * sublist. This can't cause out-of-order delivery to any single ptype, * because the 'last ptype' must be constant across the sublist, and all * other ptypes are handled per-packet. */ /* Current (common) ptype of sublist */ struct packet_type *pt_curr = NULL; /* Current (common) orig_dev of sublist */ struct net_device *od_curr = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; skb_list_del_init(skb); __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; if (pt_curr != pt_prev || od_curr != orig_dev) { /* dispatch old sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); /* start new sublist */ INIT_LIST_HEAD(&sublist); pt_curr = pt_prev; od_curr = orig_dev; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); } static int __netif_receive_skb(struct sk_buff *skb) { int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should * - be delivered to SOCK_MEMALLOC sockets only * - stay away from userspace * - have bounded memory usage * * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ noreclaim_flag = memalloc_noreclaim_save(); ret = __netif_receive_skb_one_core(skb, true); memalloc_noreclaim_restore(noreclaim_flag); } else ret = __netif_receive_skb_one_core(skb, false); return ret; } static void __netif_receive_skb_list(struct list_head *head) { unsigned long noreclaim_flag = 0; struct sk_buff *skb, *next; bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */ list_for_each_entry_safe(skb, next, head, list) { if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { struct list_head sublist; /* Handle the previous sublist */ list_cut_before(&sublist, head, &skb->list); if (!list_empty(&sublist)) __netif_receive_skb_list_core(&sublist, pfmemalloc); pfmemalloc = !pfmemalloc; /* See comments in __netif_receive_skb */ if (pfmemalloc) noreclaim_flag = memalloc_noreclaim_save(); else memalloc_noreclaim_restore(noreclaim_flag); } } /* Handle the remaining sublist */ if (!list_empty(head)) __netif_receive_skb_list_core(head, pfmemalloc); /* Restore pflags */ if (pfmemalloc) memalloc_noreclaim_restore(noreclaim_flag); } static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; int ret = 0; switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); if (old) bpf_prog_put(old); if (old && !new) { static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { static_branch_inc(&generic_xdp_needed_key); dev_disable_lro(dev); dev_disable_gro_hw(dev); } break; default: ret = -EINVAL; break; } return ret; } static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); return ret; } } #endif ret = __netif_receive_skb(skb); rcu_read_unlock(); return ret; } void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); } list_splice_init(&sublist, head); rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { list_for_each_entry_safe(skb, next, head, list) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { /* Will be handled, remove from list */ skb_list_del_init(skb); enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } } #endif __netif_receive_skb_list(head); rcu_read_unlock(); } /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process * * netif_receive_skb() is the main receive data processing function. * It always succeeds. The buffer may be dropped during processing * for congestion control or by the protocol layers. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb(struct sk_buff *skb) { int ret; trace_netif_receive_skb_entry(skb); ret = netif_receive_skb_internal(skb); trace_netif_receive_skb_exit(ret); return ret; } EXPORT_SYMBOL(netif_receive_skb); /** * netif_receive_skb_list - process many receive buffers from network * @head: list of skbs to process. * * Since return value of netif_receive_skb() is normally ignored, and * wouldn't be meaningful for a list, this function returns void. * * This function may only be called from softirq context and interrupts * should be enabled. */ void netif_receive_skb_list(struct list_head *head) { struct sk_buff *skb; if (list_empty(head)) return; if (trace_netif_receive_skb_list_entry_enabled()) { list_for_each_entry(skb, head, list) trace_netif_receive_skb_list_entry(skb); } netif_receive_skb_list_internal(head); trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; struct softnet_data *sd; local_bh_disable(); sd = this_cpu_ptr(&softnet_data); backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); dev_kfree_skb_irq(skb); rps_input_queue_head_incr(sd); } } backlog_unlock_irq_enable(sd); local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); rps_input_queue_head_incr(sd); } } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); } static bool flush_required(int cpu) { #if IS_ENABLED(CONFIG_RPS) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; backlog_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); backlog_unlock_irq_enable(sd); return do_flush; #endif /* without RPS we can't safely check input_pkt_queue: during a * concurrent remote skb_queue_splice() we can detect as empty both * input_pkt_queue and process_queue even if the latter could end-up * containing a lot of packets. */ return true; } struct flush_backlogs { cpumask_t flush_cpus; struct work_struct w[]; }; static struct flush_backlogs *flush_backlogs_alloc(void) { return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids), GFP_KERNEL); } static struct flush_backlogs *flush_backlogs_fallback; static DEFINE_MUTEX(flush_backlogs_mutex); static void flush_all_backlogs(void) { struct flush_backlogs *ptr = flush_backlogs_alloc(); unsigned int cpu; if (!ptr) { mutex_lock(&flush_backlogs_mutex); ptr = flush_backlogs_fallback; } cpumask_clear(&ptr->flush_cpus); cpus_read_lock(); for_each_online_cpu(cpu) { if (flush_required(cpu)) { INIT_WORK(&ptr->w[cpu], flush_backlog); queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]); __cpumask_set_cpu(cpu, &ptr->flush_cpus); } } /* we can have in flight packet[s] on the cpus we are not flushing, * synchronize_net() in unregister_netdevice_many() will take care of * them. */ for_each_cpu(cpu, &ptr->flush_cpus) flush_work(&ptr->w[cpu]); cpus_read_unlock(); if (ptr != flush_backlogs_fallback) kfree(ptr); else mutex_unlock(&flush_backlogs_mutex); } static void net_rps_send_ipi(struct softnet_data *remsd) { #ifdef CONFIG_RPS while (remsd) { struct softnet_data *next = remsd->rps_ipi_next; if (cpu_online(remsd->cpu)) smp_call_function_single_async(remsd->cpu, &remsd->csd); remsd = next; } #endif } /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. */ static void net_rps_action_and_irq_enable(struct softnet_data *sd) { #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; if (!use_backlog_threads() && remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); /* Send pending IPI's to kick RPS processing on remote cpus. */ net_rps_send_ipi(remsd); } else #endif local_irq_enable(); } static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS return !use_backlog_threads() && sd->rps_ipi_list; #else return false; #endif } static int process_backlog(struct napi_struct *napi, int quota) { struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); bool again = true; int work = 0; /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); while (again) { struct sk_buff *skb; local_lock_nested_bh(&softnet_data.process_queue_bh_lock); while ((skb = __skb_dequeue(&sd->process_queue))) { local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); if (++work >= quota) { rps_input_queue_head_add(sd, work); return work; } local_lock_nested_bh(&softnet_data.process_queue_bh_lock); } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, * and NAPI_STATE_SCHED is the only possible flag set * on backlog. * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ napi->state &= NAPIF_STATE_THREADED; again = false; } else { local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); } backlog_unlock_irq_enable(sd); } if (work) rps_input_queue_head_add(sd, work); return work; } /** * __napi_schedule - schedule for receive * @n: entry to schedule * * The entry's receive function will be scheduled to run. * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { unsigned long flags; local_irq_save(flags); ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); /** * napi_schedule_prep - check if napi can be scheduled * @n: napi context * * Test if NAPI routine is already running, and if not mark * it as running. This is used as a condition variable to * insure only one NAPI poll instance runs. We also make * sure there is no pending NAPI disable. */ bool napi_schedule_prep(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); do { if (unlikely(val & NAPIF_STATE_DISABLE)) return false; new = val | NAPIF_STATE_SCHED; /* Sets STATE_MISSED bit if STATE_SCHED was already set * This was suggested by Alexander Duyck, as compiler * emits better code than : * if (val & NAPIF_STATE_SCHED) * new |= NAPIF_STATE_MISSED; */ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED; } while (!try_cmpxchg(&n->state, &val, new)); return !(val & NAPIF_STATE_SCHED); } EXPORT_SYMBOL(napi_schedule_prep); /** * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * * Variant of __napi_schedule() assuming hard irqs are masked. * * On PREEMPT_RT enabled kernels this maps to __napi_schedule() * because the interrupt disabled assumption might not be true * due to force-threaded interrupts and spinlock substitution. */ void __napi_schedule_irqoff(struct napi_struct *n) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ____napi_schedule(this_cpu_ptr(&softnet_data), n); else __napi_schedule(n); } EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags, val, new, timeout = 0; bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list * just in case its running on a different cpu. * 2) If we are busy polling, do nothing here, we have * the guarantee we will be called later. */ if (unlikely(n->state & (NAPIF_STATE_NPSVC | NAPIF_STATE_IN_BUSY_POLL))) return false; if (work_done) { if (n->gro_bitmask) timeout = napi_get_gro_flush_timeout(n); n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; timeout = napi_get_gro_flush_timeout(n); if (timeout) ret = false; } if (n->gro_bitmask) { /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); } gro_normal_list(n); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ local_irq_save(flags); list_del_init(&n->poll_list); local_irq_restore(flags); } WRITE_ONCE(n->list_owner, -1); val = READ_ONCE(n->state); do { WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | NAPIF_STATE_SCHED_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. * This C code was suggested by Alexander Duyck to help gcc. */ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * NAPIF_STATE_SCHED; } while (!try_cmpxchg(&n->state, &val, new)); if (unlikely(val & NAPIF_STATE_MISSED)) { __napi_schedule(n); return false; } if (timeout) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); return ret; } EXPORT_SYMBOL(napi_complete_done); static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ if (!READ_ONCE(sd->defer_list)) return; spin_lock(&sd->defer_lock); skb = sd->defer_list; sd->defer_list = NULL; sd->defer_count = 0; spin_unlock(&sd->defer_lock); while (skb != NULL) { next = skb->next; napi_consume_skb(skb, 1); skb = next; } } #if defined(CONFIG_NET_RX_BUSY_POLL) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { gro_normal_list(napi); __napi_schedule(napi); return; } if (napi->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. */ napi_gro_flush(napi, HZ >= 1000); } gro_normal_list(napi); clear_bit(NAPI_STATE_SCHED, &napi->state); } enum { NAPI_F_PREFER_BUSY_POLL = 1, NAPI_F_END_ON_RESCHED = 2, }; static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; bool skip_schedule = false; unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was * set in napi_schedule_prep(). * Since we are about to call napi->poll() once more, we can safely * clear NAPI_STATE_MISSED. * * Note: x86 could use a single "lock and ..." instruction * to perform these two clear_bit() */ clear_bit(NAPI_STATE_MISSED, &napi->state); clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); timeout = napi_get_gro_flush_timeout(napi); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); skip_schedule = true; } } /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. */ trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); if (rc == budget) __busy_poll_stop(napi, skip_schedule); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } static void __napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, unsigned flags, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; void *have_poll_lock = NULL; struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); restart: napi_poll = NULL; napi = napi_by_id(napi_id); if (!napi) return; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); for (;;) { int work = 0; local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); /* If multiple threads are competing for this napi, * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) { if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) { if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } work = napi_poll(napi, budget); trace_napi_poll(napi, work, budget); gro_normal_list(napi); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); skb_defer_free_flush(this_cpu_ptr(&softnet_data)); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { if (flags & NAPI_F_END_ON_RESCHED) break; if (napi_poll) busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); rcu_read_unlock(); cond_resched(); rcu_read_lock(); if (loop_end(loop_end_arg, start_time)) return; goto restart; } cpu_relax(); } if (napi_poll) busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); } void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned flags = NAPI_F_END_ON_RESCHED; if (prefer_busy_poll) flags |= NAPI_F_PREFER_BUSY_POLL; __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0; rcu_read_lock(); __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); void napi_suspend_irqs(unsigned int napi_id) { struct napi_struct *napi; rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { unsigned long timeout = napi_get_irq_suspend_timeout(napi); if (timeout) hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); } rcu_read_unlock(); } void napi_resume_irqs(unsigned int napi_id) { struct napi_struct *napi; rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { /* If irq_suspend_timeout is set to 0 between the call to * napi_suspend_irqs and now, the original value still * determines the safety timeout as intended and napi_watchdog * will resume irq processing. */ if (napi_get_irq_suspend_timeout(napi)) { local_bh_disable(); napi_schedule(napi); local_bh_enable(); } } rcu_read_unlock(); } #endif /* CONFIG_NET_RX_BUSY_POLL */ static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { WRITE_ONCE(napi->napi_id, napi_id); hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); } static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { unsigned long flags; spin_lock_irqsave(&napi_hash_lock, flags); WARN_ON_ONCE(napi_by_id(napi_id)); __napi_hash_add_with_id(napi, napi_id); spin_unlock_irqrestore(&napi_hash_lock, flags); } static void napi_hash_add(struct napi_struct *napi) { unsigned long flags; if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return; spin_lock_irqsave(&napi_hash_lock, flags); /* 0..NR_CPUS range is reserved for sender_cpu use */ do { if (unlikely(++napi_gen_id < MIN_NAPI_ID)) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); __napi_hash_add_with_id(napi, napi_gen_id); spin_unlock_irqrestore(&napi_hash_lock, flags); } /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ static void napi_hash_del(struct napi_struct *napi) { unsigned long flags; spin_lock_irqsave(&napi_hash_lock, flags); hlist_del_init_rcu(&napi->napi_hash_node); spin_unlock_irqrestore(&napi_hash_lock, flags); } static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) { struct napi_struct *napi; napi = container_of(timer, struct napi_struct, timer); /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); } return HRTIMER_NORESTART; } static void init_gro_hash(struct napi_struct *napi) { int i; for (i = 0; i < GRO_HASH_BUCKETS; i++) { INIT_LIST_HEAD(&napi->gro_hash[i].list); napi->gro_hash[i].count = 0; } napi->gro_bitmask = 0; } int dev_set_threaded(struct net_device *dev, bool threaded) { struct napi_struct *napi; int err = 0; netdev_assert_locked_or_invisible(dev); if (dev->threaded == threaded) return 0; if (threaded) { list_for_each_entry(napi, &dev->napi_list, dev_list) { if (!napi->thread) { err = napi_kthread_create(napi); if (err) { threaded = false; break; } } } } WRITE_ONCE(dev->threaded, threaded); /* Make sure kthread is created before THREADED bit * is set. */ smp_mb__before_atomic(); /* Setting/unsetting threaded mode on a napi might not immediately * take effect, if the current napi instance is actively being * polled. In this case, the switch between threaded mode and * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. */ list_for_each_entry(napi, &dev->napi_list, dev_list) assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); return err; } EXPORT_SYMBOL(dev_set_threaded); /** * netif_queue_set_napi - Associate queue with the napi * @dev: device to which NAPI and queue belong * @queue_index: Index of queue * @type: queue type as RX or TX * @napi: NAPI context, pass NULL to clear previously set NAPI * * Set queue with its corresponding napi context. This should be done after * registering the NAPI handler for the queue-vector and the queues have been * mapped to the corresponding interrupt vector. */ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi) { struct netdev_rx_queue *rxq; struct netdev_queue *txq; if (WARN_ON_ONCE(napi && !napi->dev)) return; if (dev->reg_state >= NETREG_REGISTERED) ASSERT_RTNL(); switch (type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(dev, queue_index); rxq->napi = napi; return; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(dev, queue_index); txq->napi = napi; return; default: return; } } EXPORT_SYMBOL(netif_queue_set_napi); static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; n->irq_suspend_timeout = n->config->irq_suspend_timeout; /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. */ if (n->config->napi_id) { napi_hash_add_with_id(n, n->config->napi_id); } else { napi_hash_add(n); n->config->napi_id = n->napi_id; } } static void napi_save_config(struct napi_struct *n) { n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; n->config->irq_suspend_timeout = n->irq_suspend_timeout; napi_hash_del(n); } /* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will * inherit an existing ID try to insert it at the right position. */ static void netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) { unsigned int new_id, pos_id; struct list_head *higher; struct napi_struct *pos; new_id = UINT_MAX; if (napi->config && napi->config->napi_id) new_id = napi->config->napi_id; higher = &dev->napi_list; list_for_each_entry(pos, &dev->napi_list, dev_list) { if (pos->napi_id >= MIN_NAPI_ID) pos_id = pos->napi_id; else if (pos->config) pos_id = pos->config->napi_id; else pos_id = UINT_MAX; if (pos_id <= new_id) break; higher = &pos->dev_list; } list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { netdev_assert_locked(dev); if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; INIT_LIST_HEAD(&napi->poll_list); INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; init_gro_hash(napi); napi->skb = NULL; INIT_LIST_HEAD(&napi->rx_list); napi->rx_count = 0; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, weight); napi->weight = weight; napi->dev = dev; #ifdef CONFIG_NETPOLL napi->poll_owner = -1; #endif napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); netif_napi_dev_list_add(dev, napi); /* default settings from sysfs are applied to all NAPIs. any per-NAPI * configuration will be loaded in napi_enable */ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that * threaded mode will not be enabled in napi_enable(). */ if (dev->threaded && napi_kthread_create(napi)) dev->threaded = false; netif_napi_set_irq_locked(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight_locked); void napi_disable_locked(struct napi_struct *n) { unsigned long val, new; might_sleep(); netdev_assert_locked(n->dev); set_bit(NAPI_STATE_DISABLE, &n->state); val = READ_ONCE(n->state); do { while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); val = READ_ONCE(n->state); } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); if (n->config) napi_save_config(n); else napi_hash_del(n); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable_locked); /** * napi_disable() - prevent NAPI from scheduling * @n: NAPI context * * Stop NAPI from being scheduled on this context. * Waits till any outstanding processing completes. * Takes netdev_lock() for associated net_device. */ void napi_disable(struct napi_struct *n) { netdev_lock(n->dev); napi_disable_locked(n); netdev_unlock(n->dev); } EXPORT_SYMBOL(napi_disable); void napi_enable_locked(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); if (n->config) napi_restore_config(n); else napi_hash_add(n); do { BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); if (n->dev->threaded && n->thread) new |= NAPIF_STATE_THREADED; } while (!try_cmpxchg(&n->state, &val, new)); } EXPORT_SYMBOL(napi_enable_locked); /** * napi_enable() - enable NAPI scheduling * @n: NAPI context * * Enable scheduling of a NAPI instance. * Must be paired with napi_disable(). * Takes netdev_lock() for associated net_device. */ void napi_enable(struct napi_struct *n) { netdev_lock(n->dev); napi_enable_locked(n); netdev_unlock(n->dev); } EXPORT_SYMBOL(napi_enable); static void flush_gro_hash(struct napi_struct *napi) { int i; for (i = 0; i < GRO_HASH_BUCKETS; i++) { struct sk_buff *skb, *n; list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) kfree_skb(skb); napi->gro_hash[i].count = 0; } } /* Must be called in process context */ void __netif_napi_del_locked(struct napi_struct *napi) { netdev_assert_locked(napi->dev); if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; if (napi->config) { napi->index = -1; napi->config = NULL; } list_del_rcu(&napi->dev_list); napi_free_frags(napi); flush_gro_hash(napi); napi->gro_bitmask = 0; if (napi->thread) { kthread_stop(napi->thread); napi->thread = NULL; } } EXPORT_SYMBOL(__netif_napi_del_locked); static int __napi_poll(struct napi_struct *n, bool *repoll) { int work, weight; weight = n->weight; /* This NAPI_STATE_SCHED test is for avoiding a race * with netpoll's poll_napi(). Only the entity which * obtains the lock and sees NAPI_STATE_SCHED set will * actually make the ->poll() call. Therefore we avoid * accidentally calling ->poll() when NAPI is not scheduled. */ work = 0; if (napi_is_scheduled(n)) { work = n->poll(n, weight); trace_napi_poll(n, work, weight); xdp_do_check_flushed(n); } if (unlikely(work > weight)) netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", n->poll, work, weight); if (likely(work < weight)) return work; /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code * still "owns" the NAPI instance and therefore can * move the instance around on the list at-will. */ if (unlikely(napi_disable_pending(n))) { napi_complete(n); return work; } /* The NAPI context has more processing work, but busy-polling * is preferred. Exit early. */ if (napi_prefer_busy_poll(n)) { if (napi_complete_done(n, work)) { /* If timeout is not set, we need to make sure * that the NAPI is re-scheduled. */ napi_schedule(n); } return work; } if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. */ napi_gro_flush(n, HZ >= 1000); } gro_normal_list(n); /* Some drivers may have called napi_schedule * prior to exhausting their budget. */ if (unlikely(!list_empty(&n->poll_list))) { pr_warn_once("%s: Budget exhausted after napi rescheduled\n", n->dev ? n->dev->name : "backlog"); return work; } *repoll = true; return work; } static int napi_poll(struct napi_struct *n, struct list_head *repoll) { bool do_repoll = false; void *have; int work; list_del_init(&n->poll_list); have = netpoll_poll_lock(n); work = __napi_poll(n, &do_repoll); if (do_repoll) list_add_tail(&n->poll_list, repoll); netpoll_poll_unlock(have); return work; } static int napi_thread_wait(struct napi_struct *napi) { set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { /* Testing SCHED_THREADED bit here to make sure the current * kthread owns this napi and could poll on this napi. * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable(). */ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return -1; } static void napi_threaded_poll_loop(struct napi_struct *napi) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; unsigned long last_qs = jiffies; for (;;) { bool repoll = false; void *have; local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); sd = this_cpu_ptr(&softnet_data); sd->in_napi_threaded_poll = true; have = netpoll_poll_lock(napi); __napi_poll(napi, &repoll); netpoll_poll_unlock(have); sd->in_napi_threaded_poll = false; barrier(); if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } skb_defer_free_flush(sd); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!repoll) break; rcu_softirq_qs_periodic(last_qs); cond_resched(); } } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; while (!napi_thread_wait(napi)) napi_threaded_poll_loop(napi); return 0; } static __latent_entropy void net_rx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); start: sd->in_net_rx_action = true; local_irq_disable(); list_splice_init(&sd->poll_list, &list); local_irq_enable(); for (;;) { struct napi_struct *n; skb_defer_free_flush(sd); if (list_empty(&list)) { if (list_empty(&repoll)) { sd->in_net_rx_action = false; barrier(); /* We need to check if ____napi_schedule() * had refilled poll_list while * sd->in_net_rx_action was true. */ if (!list_empty(&sd->poll_list)) goto start; if (!sd_has_rps_ipi_waiting(sd)) goto end; } break; } n = list_first_entry(&list, struct napi_struct, poll_list); budget -= napi_poll(n, &repoll); /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { sd->time_squeeze++; break; } } local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); list_splice_tail(&repoll, &list); list_splice(&list, &sd->poll_list); if (!list_empty(&sd->poll_list)) __raise_softirq_irqoff(NET_RX_SOFTIRQ); else sd->in_net_rx_action = false; net_rps_action_and_irq_enable(sd); end: bpf_net_ctx_clear(bpf_net_ctx); } struct netdev_adjacent { struct net_device *dev; netdevice_tracker dev_tracker; /* upper master flag, there can only be one master device per list */ bool master; /* lookup ignore flag */ bool ignore; /* counter for the number of times this device was added to us */ u16 ref_nr; /* private field for the users */ void *private; struct list_head list; struct rcu_head rcu; }; static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, struct list_head *adj_list) { struct netdev_adjacent *adj; list_for_each_entry(adj, adj_list, list) { if (adj->dev == adj_dev) return adj; } return NULL; } static int ____netdev_has_upper_dev(struct net_device *upper_dev, struct netdev_nested_priv *priv) { struct net_device *dev = (struct net_device *)priv->data; return upper_dev == dev; } /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks only immediate upper device, * not through a complete stack of devices. The caller must hold the RTNL lock. */ bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .data = (void *)upper_dev, }; ASSERT_RTNL(); return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, &priv); } EXPORT_SYMBOL(netdev_has_upper_dev); /** * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks the entire upper device chain. * The caller must hold rcu lock. */ bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .data = (void *)upper_dev, }; return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, &priv); } EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); /** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.upper); } EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RTNL lock. */ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get); static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master) && !upper->ignore) return upper->dev; return NULL; } /** * netdev_has_any_lower_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to a lower device and return true in case * it is. The caller must hold the RTNL lock. */ static bool netdev_has_any_lower_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.lower); } void *netdev_adjacent_get_private(struct list_head *adj_list) { struct netdev_adjacent *adj; adj = list_entry(adj_list, struct netdev_adjacent, list); return adj->private; } EXPORT_SYMBOL(netdev_adjacent_get_private); /** * netdev_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * * Gets the next device from the dev's upper list, starting from iter * position. The caller must hold RCU read lock. */ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); static struct net_device *__netdev_next_upper_dev(struct net_device *dev, struct list_head **iter, bool *ignore) { struct netdev_adjacent *upper; upper = list_entry((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; *ignore = upper->ignore; return upper->dev; } static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } static int __netdev_walk_all_upper_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; bool ignore; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { udev = __netdev_next_upper_dev(now, &iter, &ignore); if (!udev) break; if (ignore) continue; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { udev = netdev_next_upper_dev_rcu(now, &iter); if (!udev) break; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); static bool __netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .flags = 0, .data = (void *)upper_dev, }; ASSERT_RTNL(); return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, &priv); } /** * netdev_lower_get_next_private - Get the next ->private from the * lower neighbour list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold either hold the * RTNL lock or its own locking that guarantees that the neighbour lower * list will remain unchanged. */ void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private); /** * netdev_lower_get_next_private_rcu - Get the next ->private from the * lower neighbour list, RCU * variant * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold RCU read lock. */ void *netdev_lower_get_next_private_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); /** * netdev_lower_get_next - Get the next device from the lower neighbour * list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent from the dev's lower neighbour * list, starting from iter position. The caller must hold RTNL lock or * its own locking that guarantees that the neighbour lower * list will remain unchanged. */ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->dev; } EXPORT_SYMBOL(netdev_lower_get_next); static struct net_device *netdev_next_lower_dev(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } static struct net_device *__netdev_next_lower_dev(struct net_device *dev, struct list_head **iter, bool *ignore) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; *ignore = lower->ignore; return lower->dev; } int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); static int __netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; bool ignore; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = __netdev_next_lower_dev(now, &iter, &ignore); if (!ldev) break; if (ignore) continue; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } EXPORT_SYMBOL(netdev_next_lower_dev_rcu); static u8 __netdev_upper_depth(struct net_device *dev) { struct net_device *udev; struct list_head *iter; u8 max_depth = 0; bool ignore; for (iter = &dev->adj_list.upper, udev = __netdev_next_upper_dev(dev, &iter, &ignore); udev; udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { if (ignore) continue; if (max_depth < udev->upper_level) max_depth = udev->upper_level; } return max_depth; } static u8 __netdev_lower_depth(struct net_device *dev) { struct net_device *ldev; struct list_head *iter; u8 max_depth = 0; bool ignore; for (iter = &dev->adj_list.lower, ldev = __netdev_next_lower_dev(dev, &iter, &ignore); ldev; ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { if (ignore) continue; if (max_depth < ldev->lower_level) max_depth = ldev->lower_level; } return max_depth; } static int __netdev_update_upper_level(struct net_device *dev, struct netdev_nested_priv *__unused) { dev->upper_level = __netdev_upper_depth(dev) + 1; return 0; } #ifdef CONFIG_LOCKDEP static LIST_HEAD(net_unlink_list); static void net_unlink_todo(struct net_device *dev) { if (list_empty(&dev->unlink_list)) list_add_tail(&dev->unlink_list, &net_unlink_list); } #endif static int __netdev_update_lower_level(struct net_device *dev, struct netdev_nested_priv *priv) { dev->lower_level = __netdev_lower_depth(dev) + 1; #ifdef CONFIG_LOCKDEP if (!priv) return 0; if (priv->flags & NESTED_SYNC_IMM) dev->nested_level = dev->lower_level - 1; if (priv->flags & NESTED_SYNC_TODO) net_unlink_todo(dev); #endif return 0; } int netdev_walk_all_lower_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev_rcu(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); /** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU * variant * @dev: device * * Gets the first netdev_adjacent->private from the dev's lower neighbour * list. The caller must hold RCU read lock. */ void *netdev_lower_get_first_private_rcu(struct net_device *dev) { struct netdev_adjacent *lower; lower = list_first_or_null_rcu(&dev->adj_list.lower, struct netdev_adjacent, list); if (lower) return lower->private; return NULL; } EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RCU read lock. */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { struct netdev_adjacent *upper; upper = list_first_or_null_rcu(&dev->adj_list.upper, struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int netdev_adjacent_sysfs_add(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", adj_dev->name); return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), linkname); } static void netdev_adjacent_sysfs_del(struct net_device *dev, char *name, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", name); sysfs_remove_link(&(dev->dev.kobj), linkname); } static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { return (dev_list == &dev->adj_list.upper || dev_list == &dev->adj_list.lower) && net_eq(dev_net(dev), dev_net(adj_dev)); } static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list, void *private, bool master) { struct netdev_adjacent *adj; int ret; adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { adj->ref_nr += 1; pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", dev->name, adj_dev->name, adj->ref_nr); return 0; } adj = kmalloc(sizeof(*adj), GFP_KERNEL); if (!adj) return -ENOMEM; adj->dev = adj_dev; adj->master = master; adj->ref_nr = 1; adj->private = private; adj->ignore = false; netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); if (ret) goto free_adj; } /* Ensure that master link is always the first item in list. */ if (master) { ret = sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), "master"); if (ret) goto remove_symlinks; list_add_rcu(&adj->list, dev_list); } else { list_add_tail_rcu(&adj->list, dev_list); } return 0; remove_symlinks: if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: netdev_put(adj_dev, &adj->dev_tracker); kfree(adj); return ret; } static void __netdev_adjacent_dev_remove(struct net_device *dev, struct net_device *adj_dev, u16 ref_nr, struct list_head *dev_list) { struct netdev_adjacent *adj; pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", dev->name, adj_dev->name, ref_nr); adj = __netdev_find_adj(adj_dev, dev_list); if (!adj) { pr_err("Adjacency does not exist for device %s from %s\n", dev->name, adj_dev->name); WARN_ON(1); return; } if (adj->ref_nr > ref_nr) { pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", dev->name, adj_dev->name, ref_nr, adj->ref_nr - ref_nr); adj->ref_nr -= ref_nr; return; } if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); netdev_put(adj_dev, &adj->dev_tracker); kfree_rcu(adj, rcu); } static int __netdev_adjacent_dev_link_lists(struct net_device *dev, struct net_device *upper_dev, struct list_head *up_list, struct list_head *down_list, void *private, bool master) { int ret; ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, master); if (ret) return ret; ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, false); if (ret) { __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); return ret; } return 0; } static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, struct net_device *upper_dev, u16 ref_nr, struct list_head *up_list, struct list_head *down_list) { __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); } static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device *upper_dev, void *private, bool master) { return __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->adj_list.upper, &upper_dev->adj_list.lower, private, master); } static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, struct net_device *upper_dev) { __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower); } static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info, struct netdev_nested_priv *priv, struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, .extack = extack, }, .upper_dev = upper_dev, .master = master, .linking = true, .upper_info = upper_info, }; struct net_device *master_dev; int ret = 0; ASSERT_RTNL(); if (dev == upper_dev) return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ if (__netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) return -EMLINK; if (!master) { if (__netdev_has_upper_dev(dev, upper_dev)) return -EEXIST; } else { master_dev = __netdev_master_upper_dev_get(dev); if (master_dev) return master_dev == upper_dev ? -EEXIST : -EBUSY; } ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) return ret; ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, master); if (ret) return ret; ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) goto rollback; __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, priv); return 0; rollback: __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; } /** * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. * On success the reference counts are adjusted and the function * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL, &priv, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); /** * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device * @upper_priv: upper device private * @upper_info: upper info to be passed down via notifier * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices * might be linked as well. The caller must hold the RTNL lock. * On a failure a negative errno code is returned. On success the reference * counts are adjusted and the function returns zero. */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv, upper_info, &priv, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); static void __netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev, struct netdev_nested_priv *priv) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, }, .upper_dev = upper_dev, .linking = false, }; ASSERT_RTNL(); changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, priv); } /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device * @upper_dev: new upper device * * Removes a link to device which is upper to this one. The caller must hold * the RTNL lock. */ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_TODO, .data = NULL, }; __netdev_upper_dev_unlink(dev, upper_dev, &priv); } EXPORT_SYMBOL(netdev_upper_dev_unlink); static void __netdev_adjacent_dev_set(struct net_device *upper_dev, struct net_device *lower_dev, bool val) { struct netdev_adjacent *adj; adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); if (adj) adj->ignore = val; adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); if (adj) adj->ignore = val; } static void netdev_adjacent_dev_disable(struct net_device *upper_dev, struct net_device *lower_dev) { __netdev_adjacent_dev_set(upper_dev, lower_dev, true); } static void netdev_adjacent_dev_enable(struct net_device *upper_dev, struct net_device *lower_dev) { __netdev_adjacent_dev_set(upper_dev, lower_dev, false); } int netdev_adjacent_change_prepare(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = 0, .data = NULL, }; int err; if (!new_dev) return 0; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_disable(dev, old_dev); err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, extack); if (err) { if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); return err; } return 0; } EXPORT_SYMBOL(netdev_adjacent_change_prepare); void netdev_adjacent_change_commit(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; if (!new_dev || !old_dev) return; if (new_dev == old_dev) return; netdev_adjacent_dev_enable(dev, old_dev); __netdev_upper_dev_unlink(old_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_commit); void netdev_adjacent_change_abort(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { struct netdev_nested_priv priv = { .flags = 0, .data = NULL, }; if (!new_dev) return; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); __netdev_upper_dev_unlink(new_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_abort); /** * netdev_bonding_info_change - Dispatch event about slave change * @dev: device * @bonding_info: info to dispatch * * Send NETDEV_BONDING_INFO to netdev notifiers with info. * The caller must hold the RTNL lock. */ void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { struct netdev_notifier_bonding_info info = { .info.dev = dev, }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); static int netdev_offload_xstats_enable_l3(struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, }; int err; int rc; dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), GFP_KERNEL); if (!dev->offload_xstats_l3) return -ENOMEM; rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, NETDEV_OFFLOAD_XSTATS_DISABLE, &info.info); err = notifier_to_errno(rc); if (err) goto free_stats; return 0; free_stats: kfree(dev->offload_xstats_l3); dev->offload_xstats_l3 = NULL; return err; } int netdev_offload_xstats_enable(struct net_device *dev, enum netdev_offload_xstats_type type, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (netdev_offload_xstats_enabled(dev, type)) return -EALREADY; switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return netdev_offload_xstats_enable_l3(dev, extack); } WARN_ON(1); return -EINVAL; } EXPORT_SYMBOL(netdev_offload_xstats_enable); static void netdev_offload_xstats_disable_l3(struct net_device *dev) { struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, }; call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, &info.info); kfree(dev->offload_xstats_l3); dev->offload_xstats_l3 = NULL; } int netdev_offload_xstats_disable(struct net_device *dev, enum netdev_offload_xstats_type type) { ASSERT_RTNL(); if (!netdev_offload_xstats_enabled(dev, type)) return -EALREADY; switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: netdev_offload_xstats_disable_l3(dev); return 0; } WARN_ON(1); return -EINVAL; } EXPORT_SYMBOL(netdev_offload_xstats_disable); static void netdev_offload_xstats_disable_all(struct net_device *dev) { netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); } static struct rtnl_hw_stats64 * netdev_offload_xstats_get_ptr(const struct net_device *dev, enum netdev_offload_xstats_type type) { switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return dev->offload_xstats_l3; } WARN_ON(1); return NULL; } bool netdev_offload_xstats_enabled(const struct net_device *dev, enum netdev_offload_xstats_type type) { ASSERT_RTNL(); return netdev_offload_xstats_get_ptr(dev, type); } EXPORT_SYMBOL(netdev_offload_xstats_enabled); struct netdev_notifier_offload_xstats_ru { bool used; }; struct netdev_notifier_offload_xstats_rd { struct rtnl_hw_stats64 stats; bool used; }; static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, const struct rtnl_hw_stats64 *src) { dest->rx_packets += src->rx_packets; dest->tx_packets += src->tx_packets; dest->rx_bytes += src->rx_bytes; dest->tx_bytes += src->tx_bytes; dest->rx_errors += src->rx_errors; dest->tx_errors += src->tx_errors; dest->rx_dropped += src->rx_dropped; dest->tx_dropped += src->tx_dropped; dest->multicast += src->multicast; } static int netdev_offload_xstats_get_used(struct net_device *dev, enum netdev_offload_xstats_type type, bool *p_used, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_ru report_used = {}; struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = type, .report_used = &report_used, }; int rc; WARN_ON(!netdev_offload_xstats_enabled(dev, type)); rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, &info.info); *p_used = report_used.used; return notifier_to_errno(rc); } static int netdev_offload_xstats_get_stats(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *p_stats, bool *p_used, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_rd report_delta = {}; struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = type, .report_delta = &report_delta, }; struct rtnl_hw_stats64 *stats; int rc; stats = netdev_offload_xstats_get_ptr(dev, type); if (WARN_ON(!stats)) return -EINVAL; rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, &info.info); /* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost. */ netdev_hw_stats64_add(stats, &report_delta.stats); if (p_stats) *p_stats = *stats; *p_used = report_delta.used; return notifier_to_errno(rc); } int netdev_offload_xstats_get(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *p_stats, bool *p_used, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (p_stats) return netdev_offload_xstats_get_stats(dev, type, p_stats, p_used, extack); else return netdev_offload_xstats_get_used(dev, type, p_used, extack); } EXPORT_SYMBOL(netdev_offload_xstats_get); void netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, const struct rtnl_hw_stats64 *stats) { report_delta->used = true; netdev_hw_stats64_add(&report_delta->stats, stats); } EXPORT_SYMBOL(netdev_offload_xstats_report_delta); void netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) { report_used->used = true; } EXPORT_SYMBOL(netdev_offload_xstats_report_used); void netdev_offload_xstats_push_delta(struct net_device *dev, enum netdev_offload_xstats_type type, const struct rtnl_hw_stats64 *p_stats) { struct rtnl_hw_stats64 *stats; ASSERT_RTNL(); stats = netdev_offload_xstats_get_ptr(dev, type); if (WARN_ON(!stats)) return; netdev_hw_stats64_add(stats, p_stats); } EXPORT_SYMBOL(netdev_offload_xstats_push_delta); /** * netdev_get_xmit_slave - Get the xmit slave of master device * @dev: device * @skb: The packet * @all_slaves: assume all the slaves are active * * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. * %NULL is returned if no slave is found. */ struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_xmit_slave) return NULL; return ops->ndo_get_xmit_slave(dev, skb, all_slaves); } EXPORT_SYMBOL(netdev_get_xmit_slave); static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, struct sock *sk) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_sk_get_lower_dev) return NULL; return ops->ndo_sk_get_lower_dev(dev, sk); } /** * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket * @dev: device * @sk: the socket * * %NULL is returned if no lower device is found. */ struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, struct sock *sk) { struct net_device *lower; lower = netdev_sk_get_lower_dev(dev, sk); while (lower) { dev = lower; lower = netdev_sk_get_lower_dev(dev, sk); } return dev; } EXPORT_SYMBOL(netdev_sk_get_lowest_dev); static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.lower); } } static void netdev_adjacent_del_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.lower); } } void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); } } void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev) { struct netdev_adjacent *lower; if (!lower_dev) return NULL; lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); if (!lower) return NULL; return lower->private; } EXPORT_SYMBOL(netdev_lower_dev_get_private); /** * netdev_lower_state_changed - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. * The caller must hold the RTNL lock. */ void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info) { struct netdev_notifier_changelowerstate_info changelowerstate_info = { .info.dev = lower_dev, }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_rx_flags) ops->ndo_change_rx_flags(dev, flags); } static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); promiscuity = dev->promiscuity + inc; if (promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. */ if (unlikely(inc > 0)) { netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } flags = old_flags & ~IFF_PROMISC; } else { flags = old_flags | IFF_PROMISC; } WRITE_ONCE(dev->promiscuity, promiscuity); if (flags != old_flags) { WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s promiscuous mode\n", dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { current_uid_gid(&uid, &gid); audit_log(audit_context(), GFP_ATOMIC, AUDIT_ANOM_PROMISCUOUS, "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", dev->name, (dev->flags & IFF_PROMISC), (old_flags & IFF_PROMISC), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, uid), from_kgid(&init_user_ns, gid), audit_get_sessionid(current)); } dev_change_rx_flags(dev, IFF_PROMISC); } if (notify) __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); return 0; } /** * dev_set_promiscuity - update promiscuity count on a device * @dev: device * @inc: modifier * * Add or remove promiscuity from a device. While the count in the device * remains above zero the interface remains promiscuous. Once it hits zero * the device reverts back to normal filtering operation. A negative inc * value is used to drop promiscuity on the device. * Return 0 if successful or a negative errno code on error. */ int dev_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); return err; } EXPORT_SYMBOL(dev_set_promiscuity); static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; unsigned int allmulti, flags; ASSERT_RTNL(); allmulti = dev->allmulti + inc; if (allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ if (unlikely(inc > 0)) { netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } flags = old_flags & ~IFF_ALLMULTI; } else { flags = old_flags | IFF_ALLMULTI; } WRITE_ONCE(dev->allmulti, allmulti); if (flags != old_flags) { WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s allmulticast mode\n", dev->flags & IFF_ALLMULTI ? "entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); if (notify) __dev_notify_flags(dev, old_flags, dev->gflags ^ old_gflags, 0, NULL); } return 0; } /** * dev_set_allmulti - update allmulti count on a device * @dev: device * @inc: modifier * * Add or remove reception of all multicast frames to a device. While the * count in the device remains above zero the interface remains listening * to all interfaces. Once it hits zero the device reverts back to normal * filtering operation. A negative @inc value is used to drop the counter * when releasing a resource needing all multicasts. * Return 0 if successful or a negative errno code on error. */ int dev_set_allmulti(struct net_device *dev, int inc) { return __dev_set_allmulti(dev, inc, true); } EXPORT_SYMBOL(dev_set_allmulti); /* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast * filtering it is put in promiscuous mode while unicast addresses * are present. */ void __dev_set_rx_mode(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; /* dev_open will call this function so the list will stay sane. */ if (!(dev->flags&IFF_UP)) return; if (!netif_device_present(dev)) return; if (!(dev->priv_flags & IFF_UNICAST_FLT)) { /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } if (ops->ndo_set_rx_mode) ops->ndo_set_rx_mode(dev); } void dev_set_rx_mode(struct net_device *dev) { netif_addr_lock_bh(dev); __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); } /** * dev_get_flags - get flags reported to userspace * @dev: device * * Get the combination of flag bits exported through APIs to userspace. */ unsigned int dev_get_flags(const struct net_device *dev) { unsigned int flags; flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | (READ_ONCE(dev->gflags) & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { if (netif_oper_up(dev)) flags |= IFF_RUNNING; if (netif_carrier_ok(dev)) flags |= IFF_LOWER_UP; if (netif_dormant(dev)) flags |= IFF_DORMANT; } return flags; } EXPORT_SYMBOL(dev_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; ASSERT_RTNL(); /* * Set the flags on our device. */ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | IFF_AUTOMEDIA)) | (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | IFF_ALLMULTI)); /* * Load in the correct multicast list now the flags have changed. */ if ((old_flags ^ flags) & IFF_MULTICAST) dev_change_rx_flags(dev, IFF_MULTICAST); dev_set_rx_mode(dev); /* * Have we downed the interface. We handle IFF_UP ourselves * according to user attempts to set it, rather than blindly * setting it. */ ret = 0; if ((old_flags ^ flags) & IFF_UP) { if (old_flags & IFF_UP) __dev_close(dev); else ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; unsigned int old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; if (__dev_set_promiscuity(dev, inc, false) >= 0) if (dev->flags != old_flags) dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI * is important. Some (broken) drivers set IFF_PROMISC, when * IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; __dev_set_allmulti(dev, inc, false); } return ret; } void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int gchanges, u32 portid, const struct nlmsghdr *nlh) { unsigned int changes = dev->flags ^ old_flags; if (gchanges) rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh); if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); else call_netdevice_notifiers(NETDEV_DOWN, dev); } if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { struct netdev_notifier_change_info change_info = { .info = { .dev = dev, }, .flags_changed = changes, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } /** * dev_change_flags - change device settings * @dev: device * @flags: device state flags * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. */ int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } EXPORT_SYMBOL(dev_change_flags); int __dev_set_mtu(struct net_device *dev, int new_mtu) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_mtu) return ops->ndo_change_mtu(dev, new_mtu); /* Pairs with all the lockless reads of dev->mtu in the stack */ WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(__dev_set_mtu); int dev_validate_mtu(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { /* MTU must be positive, and in range */ if (new_mtu < 0 || new_mtu < dev->min_mtu) { NL_SET_ERR_MSG(extack, "mtu less than device minimum"); return -EINVAL; } if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); return -EINVAL; } return 0; } /** * dev_set_mtu_ext - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit * @extack: netlink extended ack * * Change the maximum transfer size of the network device. */ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { int err, orig_mtu; if (new_mtu == dev->mtu) return 0; err = dev_validate_mtu(dev, new_mtu, extack); if (err) return err; if (!netif_device_present(dev)) return -ENODEV; err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); err = notifier_to_errno(err); if (err) return err; orig_mtu = dev->mtu; err = __dev_set_mtu(dev, new_mtu); if (!err) { err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, orig_mtu); err = notifier_to_errno(err); if (err) { /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes. */ __dev_set_mtu(dev, orig_mtu); call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu); } } return err; } int dev_set_mtu(struct net_device *dev, int new_mtu) { struct netlink_ext_ack extack; int err; memset(&extack, 0, sizeof(extack)); err = dev_set_mtu_ext(dev, new_mtu, &extack); if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } EXPORT_SYMBOL(dev_set_mtu); /** * dev_change_tx_queue_len - Change TX queue length of a netdevice * @dev: device * @new_len: new tx queue length */ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) { unsigned int orig_len = dev->tx_queue_len; int res; if (new_len != (unsigned int)new_len) return -ERANGE; if (new_len != orig_len) { WRITE_ONCE(dev->tx_queue_len, new_len); res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); res = notifier_to_errno(res); if (res) goto err_rollback; res = dev_qdisc_change_tx_queue_len(dev); if (res) goto err_rollback; } return 0; err_rollback: netdev_err(dev, "refused to change device tx_queue_len\n"); WRITE_ONCE(dev->tx_queue_len, orig_len); return res; } /** * dev_set_group - Change group this device belongs to * @dev: device * @new_group: group this device should belong to */ void dev_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } /** * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. * @dev: device * @addr: new address * @extack: netlink extended ack */ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack) { struct netdev_notifier_pre_changeaddr_info info = { .info.dev = dev, .info.extack = extack, .dev_addr = addr, }; int rc; rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); return notifier_to_errno(rc); } EXPORT_SYMBOL(dev_pre_changeaddr_notify); /** * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address * @extack: netlink extended ack * * Change the hardware (MAC) address of the device */ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; if (sa->sa_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); if (err) return err; if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) { err = ops->ndo_set_mac_address(dev, sa); if (err) return err; } dev->addr_assign_type = NET_ADDR_SET; call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } EXPORT_SYMBOL(dev_set_mac_address); DECLARE_RWSEM(dev_addr_sem); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) { int ret; down_write(&dev_addr_sem); ret = dev_set_mac_address(dev, sa, extack); up_write(&dev_addr_sem); return ret; } EXPORT_SYMBOL(dev_set_mac_address_user); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { size_t size = sizeof(sa->sa_data_min); struct net_device *dev; int ret = 0; down_read(&dev_addr_sem); rcu_read_lock(); dev = dev_get_by_name_rcu(net, dev_name); if (!dev) { ret = -ENODEV; goto unlock; } if (!dev->addr_len) memset(sa->sa_data, 0, size); else memcpy(sa->sa_data, dev->dev_addr, min_t(size_t, size, dev->addr_len)); sa->sa_family = dev->type; unlock: rcu_read_unlock(); up_read(&dev_addr_sem); return ret; } EXPORT_SYMBOL(dev_get_mac_address); /** * dev_change_carrier - Change device carrier * @dev: device * @new_carrier: new value * * Change device carrier */ int dev_change_carrier(struct net_device *dev, bool new_carrier) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_change_carrier) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; return ops->ndo_change_carrier(dev, new_carrier); } /** * dev_get_phys_port_id - Get device physical port ID * @dev: device * @ppid: port ID * * Get device physical port ID */ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_phys_port_id) return -EOPNOTSUPP; return ops->ndo_get_phys_port_id(dev, ppid); } /** * dev_get_phys_port_name - Get device physical port name * @dev: device * @name: port name * @len: limit of bytes to copy to name * * Get device physical port name */ int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (ops->ndo_get_phys_port_name) { err = ops->ndo_get_phys_port_name(dev, name, len); if (err != -EOPNOTSUPP) return err; } return devlink_compat_phys_port_name_get(dev, name, len); } /** * dev_get_port_parent_id - Get the device's port parent identifier * @dev: network device * @ppid: pointer to a storage for the port's parent identifier * @recurse: allow/disallow recursion to lower devices * * Get the devices's port parent identifier */ int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse) { const struct net_device_ops *ops = dev->netdev_ops; struct netdev_phys_item_id first = { }; struct net_device *lower_dev; struct list_head *iter; int err; if (ops->ndo_get_port_parent_id) { err = ops->ndo_get_port_parent_id(dev, ppid); if (err != -EOPNOTSUPP) return err; } err = devlink_compat_switch_id_get(dev, ppid); if (!recurse || err != -EOPNOTSUPP) return err; netdev_for_each_lower_dev(dev, lower_dev, iter) { err = dev_get_port_parent_id(lower_dev, ppid, true); if (err) break; if (!first.id_len) first = *ppid; else if (memcmp(&first, ppid, sizeof(*ppid))) return -EOPNOTSUPP; } return err; } EXPORT_SYMBOL(dev_get_port_parent_id); /** * netdev_port_same_parent_id - Indicate if two network devices have * the same port parent identifier * @a: first network device * @b: second network device */ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) { struct netdev_phys_item_id a_id = { }; struct netdev_phys_item_id b_id = { }; if (dev_get_port_parent_id(a, &a_id, true) || dev_get_port_parent_id(b, &b_id, true)) return false; return netdev_phys_item_id_same(&a_id, &b_id); } EXPORT_SYMBOL(netdev_port_same_parent_id); /** * dev_change_proto_down - set carrier according to proto_down. * * @dev: device * @proto_down: new value */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { if (!dev->change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; if (proto_down) netif_carrier_off(dev); else netif_carrier_on(dev); WRITE_ONCE(dev->proto_down, proto_down); return 0; } /** * dev_change_proto_down_reason - proto down reason * * @dev: device * @mask: proto down mask * @value: proto down value */ void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value) { u32 proto_down_reason; int b; if (!mask) { proto_down_reason = value; } else { proto_down_reason = dev->proto_down_reason; for_each_set_bit(b, &mask, 32) { if (value & (1 << b)) proto_down_reason |= BIT(b); else proto_down_reason &= ~BIT(b); } } WRITE_ONCE(dev->proto_down_reason, proto_down_reason); } struct bpf_xdp_link { struct bpf_link link; struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ int flags; }; static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) { if (flags & XDP_FLAGS_HW_MODE) return XDP_MODE_HW; if (flags & XDP_FLAGS_DRV_MODE) return XDP_MODE_DRV; if (flags & XDP_FLAGS_SKB_MODE) return XDP_MODE_SKB; return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB; } static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) { switch (mode) { case XDP_MODE_SKB: return generic_xdp_install; case XDP_MODE_DRV: case XDP_MODE_HW: return dev->netdev_ops->ndo_bpf; default: return NULL; } } static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, enum bpf_xdp_mode mode) { return dev->xdp_state[mode].link; } static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_xdp_link *link = dev_xdp_link(dev, mode); if (link) return link->link.prog; return dev->xdp_state[mode].prog; } u8 dev_xdp_prog_count(struct net_device *dev) { u8 count = 0; int i; for (i = 0; i < __MAX_XDP_MODE; i++) if (dev->xdp_state[i].prog || dev->xdp_state[i].link) count++; return count; } EXPORT_SYMBOL_GPL(dev_xdp_prog_count); u8 dev_xdp_sb_prog_count(struct net_device *dev) { u8 count = 0; int i; for (i = 0; i < __MAX_XDP_MODE; i++) if (dev->xdp_state[i].prog && !dev->xdp_state[i].prog->aux->xdp_has_frags) count++; return count; } int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) { if (!dev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && bpf->command == XDP_SETUP_PROG && bpf->prog && !bpf->prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using tcp-data-split"); return -EBUSY; } if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); return -EBUSY; } return dev->netdev_ops->ndo_bpf(dev, bpf); } EXPORT_SYMBOL_GPL(dev_xdp_propagate); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); return prog ? prog->aux->id : 0; } static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_xdp_link *link) { dev->xdp_state[mode].link = link; dev->xdp_state[mode].prog = NULL; } static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_prog *prog) { dev->xdp_state[mode].link = NULL; dev->xdp_state[mode].prog = prog; } static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { struct netdev_bpf xdp; int err; if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && prog && !prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); return -EBUSY; } if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); return -EBUSY; } memset(&xdp, 0, sizeof(xdp)); xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; xdp.flags = flags; xdp.prog = prog; /* Drivers assume refcnt is already incremented (i.e, prog pointer is * "moved" into driver), so they don't increment it on their own, but * they do decrement refcnt when program is detached or replaced. * Given net_device also owns link/prog, we need to bump refcnt here * to prevent drivers from underflowing it. */ if (prog) bpf_prog_inc(prog); err = bpf_op(dev, &xdp); if (err) { if (prog) bpf_prog_put(prog); return err; } if (mode != XDP_MODE_HW) bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); return 0; } static void dev_xdp_uninstall(struct net_device *dev) { struct bpf_xdp_link *link; struct bpf_prog *prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; ASSERT_RTNL(); for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { prog = dev_xdp_prog(dev, mode); if (!prog) continue; bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) continue; WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); /* auto-detach link from net device */ link = dev_xdp_link(dev, mode); if (link) link->dev = NULL; else bpf_prog_put(prog); dev_xdp_set_link(dev, mode, NULL); } } static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog, u32 flags) { unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; struct net_device *upper; struct list_head *iter; enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err; ASSERT_RTNL(); /* either link or prog attachment, never both */ if (link && (new_prog || old_prog)) return -EINVAL; /* link supports only XDP mode flags */ if (link && (flags & ~XDP_FLAGS_MODES)) { NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); return -EINVAL; } /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ if (num_modes > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); return -EINVAL; } /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ if (!num_modes && dev_xdp_prog_count(dev) > 1) { NL_SET_ERR_MSG(extack, "More than one program loaded, unset mode is ambiguous"); return -EINVAL; } /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); return -EINVAL; } mode = dev_xdp_mode(dev, flags); /* can't replace attached link */ if (dev_xdp_link(dev, mode)) { NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); return -EBUSY; } /* don't allow if an upper device already has a program */ netdev_for_each_upper_dev_rcu(dev, upper, iter) { if (dev_xdp_prog_count(upper) > 0) { NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); return -EEXIST; } } cur_prog = dev_xdp_prog(dev, mode); /* can't replace attached prog with link */ if (link && cur_prog) { NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); return -EBUSY; } if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; } /* put effective new program into new_prog */ if (link) new_prog = link->link.prog; if (new_prog) { bool offload = mode == XDP_MODE_HW; enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB ? XDP_MODE_DRV : XDP_MODE_SKB; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { NL_SET_ERR_MSG(extack, "XDP program already attached"); return -EBUSY; } if (!offload && dev_xdp_prog(dev, other_mode)) { NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); return -EINVAL; } if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); return -EINVAL; } } /* don't call drivers if the effective program didn't change */ if (new_prog != cur_prog) { bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) { NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); return -EOPNOTSUPP; } err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); if (err) return err; } if (link) dev_xdp_set_link(dev, mode, link); else dev_xdp_set_prog(dev, mode, new_prog); if (cur_prog) bpf_prog_put(cur_prog); return 0; } static int dev_xdp_attach_link(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link) { return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); } static int dev_xdp_detach_link(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link) { enum bpf_xdp_mode mode; bpf_op_t bpf_op; ASSERT_RTNL(); mode = dev_xdp_mode(dev, link->flags); if (dev_xdp_link(dev, mode) != link) return -EINVAL; bpf_op = dev_xdp_bpf_op(dev, mode); WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); dev_xdp_set_link(dev, mode, NULL); return 0; } static void bpf_xdp_link_release(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); rtnl_lock(); /* if racing with net_device's tear down, xdp_link->dev might be * already NULL, in which case link was already auto-detached */ if (xdp_link->dev) { WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); xdp_link->dev = NULL; } rtnl_unlock(); } static int bpf_xdp_link_detach(struct bpf_link *link) { bpf_xdp_link_release(link); return 0; } static void bpf_xdp_link_dealloc(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); kfree(xdp_link); } static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); u32 ifindex = 0; rtnl_lock(); if (xdp_link->dev) ifindex = xdp_link->dev->ifindex; rtnl_unlock(); seq_printf(seq, "ifindex:\t%u\n", ifindex); } static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); u32 ifindex = 0; rtnl_lock(); if (xdp_link->dev) ifindex = xdp_link->dev->ifindex; rtnl_unlock(); info->xdp.ifindex = ifindex; return 0; } static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err = 0; rtnl_lock(); /* link might have been auto-released already, so fail */ if (!xdp_link->dev) { err = -ENOLINK; goto out_unlock; } if (old_prog && link->prog != old_prog) { err = -EPERM; goto out_unlock; } old_prog = link->prog; if (old_prog->type != new_prog->type || old_prog->expected_attach_type != new_prog->expected_attach_type) { err = -EINVAL; goto out_unlock; } if (old_prog == new_prog) { /* no-op, don't disturb drivers */ bpf_prog_put(new_prog); goto out_unlock; } mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, xdp_link->flags, new_prog); if (err) goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: rtnl_unlock(); return err; } static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, .detach = bpf_xdp_link_detach, .show_fdinfo = bpf_xdp_link_show_fdinfo, .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, }; int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; struct netlink_ext_ack extack = {}; struct bpf_xdp_link *link; struct net_device *dev; int err, fd; rtnl_lock(); dev = dev_get_by_index(net, attr->link_create.target_ifindex); if (!dev) { rtnl_unlock(); return -EINVAL; } link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); link->dev = dev; link->flags = attr->link_create.flags; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto unlock; } err = dev_xdp_attach_link(dev, &extack, link); rtnl_unlock(); if (err) { link->dev = NULL; bpf_link_cleanup(&link_primer); trace_bpf_xdp_link_attach_failed(extack._msg); goto out_put_dev; } fd = bpf_link_settle(&link_primer); /* link itself doesn't hold dev's refcnt to not complicate shutdown */ dev_put(dev); return fd; unlock: rtnl_unlock(); out_put_dev: dev_put(dev); return err; } /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device * @extack: netlink extended ack * @fd: new program fd or negative value to clear * @expected_fd: old program fd that userspace expects to replace or clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags) { enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); struct bpf_prog *new_prog = NULL, *old_prog = NULL; int err; ASSERT_RTNL(); if (fd >= 0) { new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, mode != XDP_MODE_SKB); if (IS_ERR(new_prog)) return PTR_ERR(new_prog); } if (expected_fd >= 0) { old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, mode != XDP_MODE_SKB); if (IS_ERR(old_prog)) { err = PTR_ERR(old_prog); old_prog = NULL; goto err_out; } } err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); err_out: if (err && new_prog) bpf_prog_put(new_prog); if (old_prog) bpf_prog_put(old_prog); return err; } u32 dev_get_min_mp_channel_count(const struct net_device *dev) { int i; ASSERT_RTNL(); for (i = dev->real_num_rx_queues - 1; i >= 0; i--) if (dev->_rx[i].mp_params.mp_priv) /* The channel count is the idx plus 1. */ return i + 1; return 0; } /** * dev_index_reserve() - allocate an ifindex in a namespace * @net: the applicable net namespace * @ifindex: requested ifindex, pass %0 to get one allocated * * Allocate a ifindex for a new device. Caller must either use the ifindex * to store the device (via list_netdevice()) or call dev_index_release() * to give the index up. * * Return: a suitable unique value for a new device interface number or -errno. */ static int dev_index_reserve(struct net *net, u32 ifindex) { int err; if (ifindex > INT_MAX) { DEBUG_NET_WARN_ON_ONCE(1); return -EINVAL; } if (!ifindex) err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, xa_limit_31b, &net->ifindex, GFP_KERNEL); else err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); if (err < 0) return err; return ifindex; } static void dev_index_release(struct net *net, int ifindex) { /* Expect only unused indexes, unlist_netdevice() removes the used */ WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } static bool from_cleanup_net(void) { #ifdef CONFIG_NET_NS return current == cleanup_net_task; #else return false; #endif } /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, struct net_device *upper, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(upper->wanted_features & feature) && (features & feature)) { netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", &feature, upper->name); features &= ~feature; } } return features; } static void netdev_sync_lower_features(struct net_device *upper, struct net_device *lower, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(features & feature) && (lower->features & feature)) { netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", &feature, lower->name); lower->wanted_features &= ~feature; __netdev_update_features(lower); if (unlikely(lower->features & feature)) netdev_WARN(upper, "failed to disable %pNF on %s!\n", &feature, lower->name); else netdev_features_change(lower); } } } static bool netdev_has_ip_or_hw_csum(netdev_features_t features) { netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; bool ip_csum = (features & ip_csum_mask) == ip_csum_mask; bool hw_csum = features & NETIF_F_HW_CSUM; return ip_csum || hw_csum; } static netdev_features_t netdev_fix_features(struct net_device *dev, netdev_features_t features) { /* Fix illegal checksum combinations */ if ((features & NETIF_F_HW_CSUM) && (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { netdev_warn(dev, "mixed HW and IP checksum settings.\n"); features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } /* TSO requires that SG is present as well. */ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); features &= ~NETIF_F_ALL_TSO; } if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && !(features & NETIF_F_IP_CSUM)) { netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); features &= ~NETIF_F_TSO; features &= ~NETIF_F_TSO_ECN; } if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && !(features & NETIF_F_IPV6_CSUM)) { netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); features &= ~NETIF_F_TSO6; } /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) features &= ~NETIF_F_TSO_MANGLEID; /* TSO ECN requires that TSO is present as well. */ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; /* Software GSO depends on SG. */ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); features &= ~NETIF_F_GSO; } /* GSO partial features require GSO partial be set */ if ((features & dev->gso_partial_features) && !(features & NETIF_F_GSO_PARTIAL)) { netdev_dbg(dev, "Dropping partially supported GSO features since no GSO partial.\n"); features &= ~dev->gso_partial_features; } if (!(features & NETIF_F_RXCSUM)) { /* NETIF_F_GRO_HW implies doing RXCSUM since every packet * successfully merged by hardware must also have the * checksum verified by hardware. If the user does not * want to enable RXCSUM, logically, we should disable GRO_HW. */ if (features & NETIF_F_GRO_HW) { netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); features &= ~NETIF_F_GRO_HW; } } /* LRO/HW-GRO features cannot be combined with RX-FCS */ if (features & NETIF_F_RXFCS) { if (features & NETIF_F_LRO) { netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); features &= ~NETIF_F_LRO; } if (features & NETIF_F_GRO_HW) { netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); features &= ~NETIF_F_GRO_HW; } } if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); features &= ~NETIF_F_LRO; } if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) { netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); features &= ~NETIF_F_HW_TLS_TX; } if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); features &= ~NETIF_F_HW_TLS_RX; } if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) { netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n"); features &= ~NETIF_F_GSO_UDP_L4; } return features; } int __netdev_update_features(struct net_device *dev) { struct net_device *upper, *lower; netdev_features_t features; struct list_head *iter; int err = -1; ASSERT_RTNL(); features = netdev_get_wanted_features(dev); if (dev->netdev_ops->ndo_fix_features) features = dev->netdev_ops->ndo_fix_features(dev, features); /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); /* some features can't be enabled if they're off on an upper device */ netdev_for_each_upper_dev_rcu(dev, upper, iter) features = netdev_sync_upper_features(dev, upper, features); if (dev->features == features) goto sync_lower; netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", &dev->features, &features); if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); else err = 0; if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); /* return non-0 since some features might have changed and * it's better to fire a spurious notification than miss it */ return -1; } sync_lower: /* some features must be disabled on lower devices when disabled * on an upper device (think: bonding master or bridge) */ netdev_for_each_lower_dev(dev, lower, iter) netdev_sync_lower_features(dev, lower, features); if (!err) { netdev_features_t diff = features ^ dev->features; if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { /* udp_tunnel_{get,drop}_rx_info both need * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the * device, or they won't do anything. * Thus we need to update dev->features * *before* calling udp_tunnel_get_rx_info, * but *after* calling udp_tunnel_drop_rx_info. */ if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { dev->features = features; udp_tunnel_get_rx_info(dev); } else { udp_tunnel_drop_rx_info(dev); } } if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { dev->features = features; err |= vlan_get_rx_ctag_filter_info(dev); } else { vlan_drop_rx_ctag_filter_info(dev); } } if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { if (features & NETIF_F_HW_VLAN_STAG_FILTER) { dev->features = features; err |= vlan_get_rx_stag_filter_info(dev); } else { vlan_drop_rx_stag_filter_info(dev); } } dev->features = features; } return err < 0 ? 0 : 1; } /** * netdev_update_features - recalculate device features * @dev: the device to check * * Recalculate dev->features set and send notifications if it * has changed. Should be called after driver or hardware dependent * conditions might have changed that influence the features. */ void netdev_update_features(struct net_device *dev) { if (__netdev_update_features(dev)) netdev_features_change(dev); } EXPORT_SYMBOL(netdev_update_features); /** * netdev_change_features - recalculate device features * @dev: the device to check * * Recalculate dev->features set and send notifications even * if they have not changed. Should be called instead of * netdev_update_features() if also dev->vlan_features might * have changed to allow the changes to be propagated to stacked * VLAN devices. */ void netdev_change_features(struct net_device *dev) { __netdev_update_features(dev); netdev_features_change(dev); } EXPORT_SYMBOL(netdev_change_features); /** * netif_stacked_transfer_operstate - transfer operstate * @rootdev: the root or lower level device to transfer state from * @dev: the device to transfer operstate to * * Transfer operational state from root to device. This is normally * called when a stacking relationship exists between the root * device and the device(a leaf device). */ void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev) { if (rootdev->operstate == IF_OPER_DORMANT) netif_dormant_on(dev); else netif_dormant_off(dev); if (rootdev->operstate == IF_OPER_TESTING) netif_testing_on(dev); else netif_testing_off(dev); if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else netif_carrier_off(dev); } EXPORT_SYMBOL(netif_stacked_transfer_operstate); static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); int err = 0; BUG_ON(count < 1); rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; dev->_rx = rx; for (i = 0; i < count; i++) { rx[i].dev = dev; /* XDP RX-queue setup */ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); if (err < 0) goto err_rxq_info; } return 0; err_rxq_info: /* Rollback successful reg's and free other resources */ while (i--) xdp_rxq_info_unreg(&rx[i].xdp_rxq); kvfree(dev->_rx); dev->_rx = NULL; return err; } static void netif_free_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ if (!dev->_rx) return; for (i = 0; i < count; i++) xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); kvfree(dev->_rx); } static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; #ifdef CONFIG_BQL dql_init(&queue->dql, HZ); #endif } static void netif_free_tx_queues(struct net_device *dev) { kvfree(dev->_tx); } static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; struct netdev_queue *tx; size_t sz = count * sizeof(*tx); if (count < 1 || count > 0xffff) return -EINVAL; tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); return 0; } void netif_tx_stop_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_stop_queue(txq); } } EXPORT_SYMBOL(netif_tx_stop_all_queues); static int netdev_do_alloc_pcpu_stats(struct net_device *dev) { void __percpu *v; /* Drivers implementing ndo_get_peer_dev must support tstat * accounting, so that skb_do_redirect() can bump the dev's * RX stats upon network namespace switch. */ if (dev->netdev_ops->ndo_get_peer_dev && dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) return -EOPNOTSUPP; switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return 0; case NETDEV_PCPU_STAT_LSTATS: v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); break; case NETDEV_PCPU_STAT_TSTATS: v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); break; case NETDEV_PCPU_STAT_DSTATS: v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); break; default: return -EINVAL; } return v ? 0 : -ENOMEM; } static void netdev_do_free_pcpu_stats(struct net_device *dev) { switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return; case NETDEV_PCPU_STAT_LSTATS: free_percpu(dev->lstats); break; case NETDEV_PCPU_STAT_TSTATS: free_percpu(dev->tstats); break; case NETDEV_PCPU_STAT_DSTATS: free_percpu(dev->dstats); break; } } static void netdev_free_phy_link_topology(struct net_device *dev) { struct phy_link_topology *topo = dev->link_topo; if (IS_ENABLED(CONFIG_PHYLIB) && topo) { xa_destroy(&topo->phys); kfree(topo); dev->link_topo = NULL; } } /** * register_netdevice() - register a network device * @dev: device to register * * Take a prepared network device structure and make it externally accessible. * A %NETDEV_REGISTER message is sent to the netdev notifier chain. * Callers must hold the rtnl lock - you may want register_netdev() * instead of this. */ int register_netdevice(struct net_device *dev) { int ret; struct net *net = dev_net(dev); BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < NETDEV_FEATURE_COUNT); BUG_ON(dev_boot_phase); ASSERT_RTNL(); might_sleep(); /* When net_device's are persistent, this will be fatal. */ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); ret = ethtool_check_ops(dev->ethtool_ops); if (ret) return ret; /* rss ctx ID 0 is reserved for the default context, start from 1 */ xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1); mutex_init(&dev->ethtool->rss_lock); spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; ret = -ENOMEM; dev->name_node = netdev_name_node_head_alloc(dev); if (!dev->name_node) goto out; /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); if (ret) { if (ret > 0) ret = -EIO; goto err_free_name; } } if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_CTAG_FILTER) && (!dev->netdev_ops->ndo_vlan_rx_add_vid || !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); ret = -EINVAL; goto err_uninit; } ret = netdev_do_alloc_pcpu_stats(dev); if (ret) goto err_uninit; ret = dev_index_reserve(net, dev->ifindex); if (ret < 0) goto err_free_pcpu; dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; if (dev->udp_tunnel_nic_info) { dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; } dev->wanted_features = dev->features & dev->hw_features; if (!(dev->flags & IFF_LOOPBACK)) dev->hw_features |= NETIF_F_NOCACHE_COPY; /* If IPv4 TCP segmentation offload is supported we should also * allow the device to enable segmenting the frame with the option * of ignoring a static IP ID value. This doesn't enable the * feature itself but allows the user to enable it later. */ if (dev->hw_features & NETIF_F_TSO) dev->hw_features |= NETIF_F_TSO_MANGLEID; if (dev->vlan_features & NETIF_F_TSO) dev->vlan_features |= NETIF_F_TSO_MANGLEID; if (dev->mpls_features & NETIF_F_TSO) dev->mpls_features |= NETIF_F_TSO_MANGLEID; if (dev->hw_enc_features & NETIF_F_TSO) dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. */ dev->vlan_features |= NETIF_F_HIGHDMA; /* Make NETIF_F_SG inheritable to tunnel devices. */ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; /* Make NETIF_F_SG inheritable to MPLS. */ dev->mpls_features |= NETIF_F_SG; ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) goto err_ifindex_release; ret = netdev_register_kobject(dev); netdev_lock(dev); WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); netdev_unlock(dev); if (ret) goto err_uninit_notify; __netdev_update_features(dev); /* * Default initial state at registry is that the * device is present. */ set_bit(__LINK_STATE_PRESENT, &dev->state); linkwatch_init_dev(dev); dev_init_scheduler(dev); netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); list_netdevice(dev); add_device_randomness(dev->dev_addr, dev->addr_len); /* If the device has permanent device address, driver should * set dev_addr and also addr_assign_type should be set to * NET_ADDR_PERM (default value). */ if (dev->addr_assign_type == NET_ADDR_PERM) memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); if (ret) { /* Expect explicit free_netdev() on failure */ dev->needs_free_netdev = false; unregister_netdevice_queue(dev, NULL); goto out; } /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: return ret; err_uninit_notify: call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); err_ifindex_release: dev_index_release(net, dev->ifindex); err_free_pcpu: netdev_do_free_pcpu_stats(dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (dev->priv_destructor) dev->priv_destructor(dev); err_free_name: netdev_name_node_free(dev->name_node); goto out; } EXPORT_SYMBOL(register_netdevice); /* Initialize the core of a dummy net device. * The setup steps dummy netdevs need which normal netdevs get by going * through register_netdevice(). */ static void init_dummy_netdev(struct net_device *dev) { /* make sure we BUG if trying to hit standard * register/unregister code path */ dev->reg_state = NETREG_DUMMY; /* a dummy interface is started by default */ set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); /* Note : We dont allocate pcpu_refcnt for dummy devices, * because users of this 'device' dont need to change * its refcount. */ } /** * register_netdev - register a network device * @dev: device to register * * Take a completed network device structure and add it to the kernel * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier * chain. 0 is returned on success. A negative errno code is returned * on a failure to set up the device, or if the name is a duplicate. * * This is a wrapper around register_netdevice that takes the rtnl semaphore * and expands the device name if you passed a format string to * alloc_netdev. */ int register_netdev(struct net_device *dev) { struct net *net = dev_net(dev); int err; if (rtnl_net_lock_killable(net)) return -EINTR; err = register_netdevice(dev); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { #ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; #else return refcount_read(&dev->dev_refcnt); #endif } EXPORT_SYMBOL(netdev_refcnt_read); int netdev_unregister_timeout_secs __read_mostly = 10; #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** * netdev_wait_allrefs_any - wait until all references are gone. * @list: list of net_devices to wait on * * This is called when unregistering network devices. * * Any protocol or device that holds a reference should register * for netdevice notification, and cleanup and put back the * reference if they receive an UNREGISTER event. * We can get stuck here if buggy protocols don't correctly * call dev_put. */ static struct net_device *netdev_wait_allrefs_any(struct list_head *list) { unsigned long rebroadcast_time, warning_time; struct net_device *dev; int wait = 0; rebroadcast_time = warning_time = jiffies; list_for_each_entry(dev, list, todo_list) if (netdev_refcnt_read(dev) == 1) return dev; while (true) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ list_for_each_entry(dev, list, todo_list) call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); list_for_each_entry(dev, list, todo_list) if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events * pending on unregister. If this * happens, we simply run the queue * unscheduled, resulting in a noop * for this device. */ linkwatch_run_queue(); break; } __rtnl_unlock(); rebroadcast_time = jiffies; } rcu_barrier(); if (!wait) { wait = WAIT_REFS_MIN_MSECS; } else { msleep(wait); wait = min(wait << 1, WAIT_REFS_MAX_MSECS); } list_for_each_entry(dev, list, todo_list) if (netdev_refcnt_read(dev) == 1) return dev; if (time_after(jiffies, warning_time + READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { list_for_each_entry(dev, list, todo_list) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, netdev_refcnt_read(dev)); ref_tracker_dir_print(&dev->refcnt_tracker, 10); } warning_time = jiffies; } } } /* The sequence is: * * rtnl_lock(); * ... * register_netdevice(x1); * register_netdevice(x2); * ... * unregister_netdevice(y1); * unregister_netdevice(y2); * ... * rtnl_unlock(); * free_netdev(y1); * free_netdev(y2); * * We are invoked by rtnl_unlock(). * This allows us to deal with problems: * 1) We can delete sysfs objects which invoke hotplug * without deadlocking with linkwatch via keventd. * 2) Since we run with the RTNL semaphore not held, we can sleep * safely in order to wait for the netdev refcnt to drop to zero. * * We must not return until all unregister events added during * the interval the lock was held have been completed. */ void netdev_run_todo(void) { struct net_device *dev, *tmp; struct list_head list; int cnt; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; list_replace_init(&net_unlink_list, &unlink_list); while (!list_empty(&unlink_list)) { struct net_device *dev = list_first_entry(&unlink_list, struct net_device, unlink_list); list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } #endif /* Snapshot list, allow later requests */ list_replace_init(&net_todo_list, &list); __rtnl_unlock(); /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); list_for_each_entry_safe(dev, tmp, &list, todo_list) { if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { netdev_WARN(dev, "run_todo but not unregistering\n"); list_del(&dev->todo_list); continue; } netdev_lock(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); netdev_unlock(dev); linkwatch_sync_dev(dev); } cnt = 0; while (!list_empty(&list)) { dev = netdev_wait_allrefs_any(&list); list_del(&dev->todo_list); /* paranoia */ BUG_ON(netdev_refcnt_read(dev) != 1); BUG_ON(!list_empty(&dev->ptype_all)); BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); netdev_do_free_pcpu_stats(dev); if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) free_netdev(dev); cnt++; /* Free network device */ kobject_put(&dev->dev.kobj); } if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count)) wake_up(&netdev_unregistering_wq); } /* Collate per-cpu network dstats statistics * * Read per-cpu network statistics from dev->dstats and populate the related * fields in @s. */ static void dev_fetch_dstats(struct rtnl_link_stats64 *s, const struct pcpu_dstats __percpu *dstats) { int cpu; for_each_possible_cpu(cpu) { u64 rx_packets, rx_bytes, rx_drops; u64 tx_packets, tx_bytes, tx_drops; const struct pcpu_dstats *stats; unsigned int start; stats = per_cpu_ptr(dstats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); rx_packets = u64_stats_read(&stats->rx_packets); rx_bytes = u64_stats_read(&stats->rx_bytes); rx_drops = u64_stats_read(&stats->rx_drops); tx_packets = u64_stats_read(&stats->tx_packets); tx_bytes = u64_stats_read(&stats->tx_bytes); tx_drops = u64_stats_read(&stats->tx_drops); } while (u64_stats_fetch_retry(&stats->syncp, start)); s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; s->rx_dropped += rx_drops; s->tx_packets += tx_packets; s->tx_bytes += tx_bytes; s->tx_dropped += tx_drops; } } /* ndo_get_stats64 implementation for dtstats-based accounting. * * Populate @s from dev->stats and dev->dstats. This is used internally by the * core for NETDEV_PCPU_STAT_DSTAT-type stats collection. */ static void dev_get_dstats64(const struct net_device *dev, struct rtnl_link_stats64 *s) { netdev_stats_to_stats64(s, &dev->stats); dev_fetch_dstats(s, dev->dstats); } /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has * all the same fields in the same order as net_device_stats, with only * the type differing, but rtnl_link_stats64 may have additional fields * at the end for newer counters. */ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); const atomic_long_t *src = (atomic_long_t *)netdev_stats; u64 *dst = (u64 *)stats64; BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) dst[i] = (unsigned long)atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); } EXPORT_SYMBOL(netdev_stats_to_stats64); static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc( struct net_device *dev) { struct net_device_core_stats __percpu *p; p = alloc_percpu_gfp(struct net_device_core_stats, GFP_ATOMIC | __GFP_NOWARN); if (p && cmpxchg(&dev->core_stats, NULL, p)) free_percpu(p); /* This READ_ONCE() pairs with the cmpxchg() above */ return READ_ONCE(dev->core_stats); } noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset) { /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); unsigned long __percpu *field; if (unlikely(!p)) { p = netdev_core_stats_alloc(dev); if (!p) return; } field = (unsigned long __percpu *)((void __percpu *)p + offset); this_cpu_inc(*field); } EXPORT_SYMBOL_GPL(netdev_core_stats_inc); /** * dev_get_stats - get network device statistics * @dev: device to get statistics from * @storage: place to store stats * * Get network statistics from device. Return @storage. * The device driver may provide its own method by setting * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; * otherwise the internal statistics structure is used. */ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage) { const struct net_device_ops *ops = dev->netdev_ops; const struct net_device_core_stats __percpu *p; /* * IPv{4,6} and udp tunnels share common stat helpers and use * different stat type (NETDEV_PCPU_STAT_TSTATS vs * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent. */ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) != offsetof(struct pcpu_dstats, rx_bytes)); BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) != offsetof(struct pcpu_dstats, rx_packets)); BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) != offsetof(struct pcpu_dstats, tx_bytes)); BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) != offsetof(struct pcpu_dstats, tx_packets)); if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { dev_get_tstats64(dev, storage); } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) { dev_get_dstats64(dev, storage); } else { netdev_stats_to_stats64(storage, &dev->stats); } /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ p = READ_ONCE(dev->core_stats); if (p) { const struct net_device_core_stats *core_stats; int i; for_each_possible_cpu(i) { core_stats = per_cpu_ptr(p, i); storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); } } return storage; } EXPORT_SYMBOL(dev_get_stats); /** * dev_fetch_sw_netstats - get per-cpu network device statistics * @s: place to store stats * @netstats: per-cpu network stats to read from * * Read per-cpu network statistics and populate the related fields in @s. */ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats) { int cpu; for_each_possible_cpu(cpu) { u64 rx_packets, rx_bytes, tx_packets, tx_bytes; const struct pcpu_sw_netstats *stats; unsigned int start; stats = per_cpu_ptr(netstats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); rx_packets = u64_stats_read(&stats->rx_packets); rx_bytes = u64_stats_read(&stats->rx_bytes); tx_packets = u64_stats_read(&stats->tx_packets); tx_bytes = u64_stats_read(&stats->tx_bytes); } while (u64_stats_fetch_retry(&stats->syncp, start)); s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; s->tx_packets += tx_packets; s->tx_bytes += tx_bytes; } } EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); /** * dev_get_tstats64 - ndo_get_stats64 implementation * @dev: device to get statistics from * @s: place to store stats * * Populate @s from dev->stats and dev->tstats. Can be used as * ndo_get_stats64() callback. */ void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) { netdev_stats_to_stats64(s, &dev->stats); dev_fetch_sw_netstats(s, dev->tstats); } EXPORT_SYMBOL_GPL(dev_get_tstats64); struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); #ifdef CONFIG_NET_CLS_ACT if (queue) return queue; queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc); rcu_assign_pointer(dev->ingress_queue, queue); #endif return queue; } static const struct ethtool_ops default_ethtool_ops; void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops) { if (dev->ethtool_ops == &default_ethtool_ops) dev->ethtool_ops = ops; } EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); /** * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default * @dev: netdev to enable the IRQ coalescing on * * Sets a conservative default for SW IRQ coalescing. Users can use * sysfs attributes to override the default values. */ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) { WARN_ON(dev->reg_state == NETREG_REGISTERED); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { netdev_set_gro_flush_timeout(dev, 20000); netdev_set_defer_hard_irqs(dev, 1); } } EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @name_assign_type: origin of device name * @setup: callback to initialize device * @txqs: the number of TX subqueues to allocate * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { struct net_device *dev; size_t napi_config_sz; unsigned int maxqs; BUG_ON(strlen(name) >= sizeof(dev->name)); if (txqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); return NULL; } if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } maxqs = max(txqs, rxqs); dev = kvzalloc(struct_size(dev, priv, sizeof_priv), GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!dev) return NULL; dev->priv_len = sizeof_priv; ref_tracker_dir_init(&dev->refcnt_tracker, 128, name); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; __dev_hold(dev); #else refcount_set(&dev->dev_refcnt, 1); #endif if (dev_addr_init(dev)) goto free_pcpu; dev_mc_init(dev); dev_uc_init(dev); dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP dev->nested_level = 0; INIT_LIST_HEAD(&dev->unlink_list); #endif INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->adj_list.upper); INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); INIT_LIST_HEAD(&dev->net_notifier_list); #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif mutex_init(&dev->lock); dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); if (!dev->tx_queue_len) { dev->priv_flags |= IFF_NO_QUEUE; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; } dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) goto free_all; dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT); if (!dev->ethtool) goto free_all; dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); if (!dev->cfg) goto free_all; dev->cfg_pending = dev->cfg; napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); if (!dev->napi_config) goto free_all; strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; nf_hook_netdev_init(dev); return dev; free_all: free_netdev(dev); return NULL; free_pcpu: #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); free_dev: #endif kvfree(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); static void netdev_napi_exit(struct net_device *dev) { if (!list_empty(&dev->napi_list)) { struct napi_struct *p, *n; netdev_lock(dev); list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) __netif_napi_del_locked(p); netdev_unlock(dev); synchronize_net(); } kvfree(dev->napi_config); } /** * free_netdev - free network device * @dev: device * * This function does the last stage of destroying an allocated device * interface. The reference to the device object is released. If this * is the last reference then it will be freed.Must be called in process * context. */ void free_netdev(struct net_device *dev) { might_sleep(); /* When called immediately after register_netdevice() failed the unwind * handling may still be dismantling the device. Handle that case by * deferring the free. */ if (dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); dev->needs_free_netdev = true; return; } WARN_ON(dev->cfg != dev->cfg_pending); kfree(dev->cfg); kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); /* Flush device addresses */ dev_addr_flush(dev); netdev_napi_exit(dev); ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; #endif free_percpu(dev->core_stats); dev->core_stats = NULL; free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; netdev_free_phy_link_topology(dev); mutex_destroy(&dev->lock); /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED || dev->reg_state == NETREG_DUMMY) { kvfree(dev); return; } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); WRITE_ONCE(dev->reg_state, NETREG_RELEASED); /* will free via device release */ put_device(&dev->dev); } EXPORT_SYMBOL(free_netdev); /** * alloc_netdev_dummy - Allocate and initialize a dummy net device. * @sizeof_priv: size of private data to allocate space for * * Return: the allocated net_device on success, NULL otherwise */ struct net_device *alloc_netdev_dummy(int sizeof_priv) { return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, init_dummy_netdev); } EXPORT_SYMBOL_GPL(alloc_netdev_dummy); /** * synchronize_net - Synchronize with packet receive processing * * Wait for packets currently being received to be done. * Does not block later packets from starting. */ void synchronize_net(void) { might_sleep(); if (from_cleanup_net() || rtnl_is_locked()) synchronize_rcu_expedited(); else synchronize_rcu(); } EXPORT_SYMBOL(synchronize_net); static void netdev_rss_contexts_free(struct net_device *dev) { struct ethtool_rxfh_context *ctx; unsigned long context; mutex_lock(&dev->ethtool->rss_lock); xa_for_each(&dev->ethtool->rss_ctx, context, ctx) { struct ethtool_rxfh_param rxfh; rxfh.indir = ethtool_rxfh_context_indir(ctx); rxfh.key = ethtool_rxfh_context_key(ctx); rxfh.hfunc = ctx->hfunc; rxfh.input_xfrm = ctx->input_xfrm; rxfh.rss_context = context; rxfh.rss_delete = true; xa_erase(&dev->ethtool->rss_ctx, context); if (dev->ethtool_ops->create_rxfh_context) dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL); else dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL); kfree(ctx); } xa_destroy(&dev->ethtool->rss_ctx); mutex_unlock(&dev->ethtool->rss_lock); } /** * unregister_netdevice_queue - remove device from the kernel * @dev: device * @head: list * * This function shuts down a device interface and removes it * from the kernel tables. * If head not NULL, device is queued to be unregistered later. * * Callers must hold the rtnl semaphore. You may want * unregister_netdev() instead of this. */ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) { ASSERT_RTNL(); if (head) { list_move_tail(&dev->unreg_list, head); } else { LIST_HEAD(single); list_add(&dev->unreg_list, &single); unregister_netdevice_many(&single); } } EXPORT_SYMBOL(unregister_netdevice_queue); void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { struct net_device *dev, *tmp; LIST_HEAD(close_head); int cnt = 0; BUG_ON(dev_boot_phase); ASSERT_RTNL(); if (list_empty(head)) return; list_for_each_entry_safe(dev, tmp, head, unreg_list) { /* Some devices call without registering * for initialization unwind. Remove those * devices and proceed with the remaining. */ if (dev->reg_state == NETREG_UNINITIALIZED) { pr_debug("unregister_netdevice: device %s/%p never was registered\n", dev->name, dev); WARN_ON(1); list_del(&dev->unreg_list); continue; } dev->dismantle = true; BUG_ON(dev->reg_state != NETREG_REGISTERED); } /* If device is running, close it first. */ list_for_each_entry(dev, head, unreg_list) list_add_tail(&dev->close_list, &close_head); dev_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ unlist_netdevice(dev); netdev_lock(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); netdev_unlock(dev); } flush_all_backlogs(); synchronize_net(); list_for_each_entry(dev, head, unreg_list) { struct sk_buff *skb = NULL; /* Shutdown queueing discipline. */ dev_shutdown(dev); dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); dev_dmabuf_uninstall(dev); netdev_offload_xstats_disable_all(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, portid, nlh); /* * Flush the unicast and multicast chains */ dev_uc_flush(dev); dev_mc_flush(dev); netdev_name_node_alt_flush(dev); netdev_name_node_free(dev->name_node); netdev_rss_contexts_free(dev); call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); mutex_destroy(&dev->ethtool->rss_lock); net_shaper_flush_netdev(dev); if (skb) rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); WARN_ON(netdev_has_any_lower_dev(dev)); /* Remove entries from kobject tree */ netdev_unregister_kobject(dev); #ifdef CONFIG_XPS /* Remove XPS queueing entries */ netif_reset_xps_queues_gt(dev, 0); #endif } synchronize_net(); list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); net_set_todo(dev); cnt++; } atomic_add(cnt, &dev_unreg_count); list_del(head); } /** * unregister_netdevice_many - unregister many devices * @head: list of devices * * Note: As most callers use a stack allocated list_head, * we force a list_del() to make sure stack won't be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { unregister_netdevice_many_notify(head, 0, NULL); } EXPORT_SYMBOL(unregister_netdevice_many); /** * unregister_netdev - remove device from the kernel * @dev: device * * This function shuts down a device interface and removes it * from the kernel tables. * * This is just a wrapper for unregister_netdevice that takes * the rtnl semaphore. In general you want to use this and not * unregister_netdevice. */ void unregister_netdev(struct net_device *dev) { struct net *net = dev_net(dev); rtnl_net_lock(net); unregister_netdevice(dev); rtnl_net_unlock(net); } EXPORT_SYMBOL(unregister_netdev); /** * __dev_change_net_namespace - move device to different nethost namespace * @dev: device * @net: network namespace * @pat: If not NULL name pattern to try if the current device name * is already taken in the destination network namespace. * @new_ifindex: If not zero, specifies device index in the target * namespace. * * This function shuts down a device interface and moves it * to a new network namespace. On success 0 is returned, on * a failure a netagive errno code is returned. * * Callers must hold the rtnl semaphore. */ int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex) { struct netdev_name_node *name_node; struct net *net_old = dev_net(dev); char new_name[IFNAMSIZ] = {}; int err, new_nsid; ASSERT_RTNL(); /* Don't allow namespace local devices to be moved. */ err = -EINVAL; if (dev->netns_local) goto out; /* Ensure the device has been registered */ if (dev->reg_state != NETREG_REGISTERED) goto out; /* Get out if there is nothing todo */ err = 0; if (net_eq(net_old, net)) goto out; /* Pick the destination device name, and ensure * we can use it in the destination network namespace. */ err = -EEXIST; if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) goto out; err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); if (err < 0) goto out; } /* Check that none of the altnames conflicts. */ err = -EEXIST; netdev_for_each_altname(dev, name_node) if (netdev_name_in_use(net, name_node->name)) goto out; /* Check that new_ifindex isn't used yet. */ if (new_ifindex) { err = dev_index_reserve(net, new_ifindex); if (err < 0) goto out; } else { /* If there is an ifindex conflict assign a new one */ err = dev_index_reserve(net, dev->ifindex); if (err == -EBUSY) err = dev_index_reserve(net, 0); if (err < 0) goto out; new_ifindex = err; } /* * And now a mini version of register_netdevice unregister_netdevice. */ /* If device is running close it first. */ dev_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); synchronize_net(); /* Shutdown queueing discipline. */ dev_shutdown(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. * * Note that dev->reg_state stays at NETREG_REGISTERED. * This is wanted because this way 8021q and macvlan know * the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); /* * Flush the unicast and multicast chains */ dev_uc_flush(dev); dev_mc_flush(dev); /* Send a netdev-removed uevent to the old namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); netdev_adjacent_del_links(dev); /* Move per-net netdevice notifiers that are following the netdevice */ move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ dev_net_set(dev, net); dev->ifindex = new_ifindex; if (new_name[0]) { /* Rename the netdev to prepared name */ write_seqlock_bh(&netdev_rename_lock); strscpy(dev->name, new_name, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); } /* Fixup kobjects */ dev_set_uevent_suppress(&dev->dev, 1); err = device_rename(&dev->dev, dev->name); dev_set_uevent_suppress(&dev->dev, 0); WARN_ON(err); /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); netdev_adjacent_add_links(dev); /* Adapt owner in case owning user namespace of target network * namespace is different from the original one. */ err = netdev_change_owner(dev, net_old, net); WARN_ON(err); /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); synchronize_net(); err = 0; out: return err; } EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { struct sk_buff **list_skb; struct sk_buff *skb; unsigned int cpu; struct softnet_data *sd, *oldsd, *remsd = NULL; local_irq_disable(); cpu = smp_processor_id(); sd = &per_cpu(softnet_data, cpu); oldsd = &per_cpu(softnet_data, oldcpu); /* Find end of our completion_queue. */ list_skb = &sd->completion_queue; while (*list_skb) list_skb = &(*list_skb)->next; /* Append completion queue from offline CPU. */ *list_skb = oldsd->completion_queue; oldsd->completion_queue = NULL; /* Append output queue from offline CPU. */ if (oldsd->output_queue) { *sd->output_queue_tailp = oldsd->output_queue; sd->output_queue_tailp = oldsd->output_queue_tailp; oldsd->output_queue = NULL; oldsd->output_queue_tailp = &oldsd->output_queue; } /* Append NAPI poll list from offline CPU, with one exception : * process_backlog() must be called by cpu owning percpu backlog. * We properly handle process_queue & input_pkt_queue later. */ while (!list_empty(&oldsd->poll_list)) { struct napi_struct *napi = list_first_entry(&oldsd->poll_list, struct napi_struct, poll_list); list_del_init(&napi->poll_list); if (napi->poll == process_backlog) napi->state &= NAPIF_STATE_THREADED; else ____napi_schedule(sd, napi); } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); if (!use_backlog_threads()) { #ifdef CONFIG_RPS remsd = oldsd->rps_ipi_list; oldsd->rps_ipi_list = NULL; #endif /* send out pending IPI's on offline CPU */ net_rps_send_ipi(remsd); } /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); rps_input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); rps_input_queue_head_incr(oldsd); } return 0; } /** * netdev_increment_features - increment feature set by one * @all: current feature set * @one: new feature set * @mask: mask feature set * * Computes a new feature set after adding a device with feature set * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set. */ netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { if (mask & NETIF_F_HW_CSUM) mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ if (all & NETIF_F_HW_CSUM) all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); return all; } EXPORT_SYMBOL(netdev_increment_features); static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); return hash; } /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > 8 * sizeof_field(struct napi_struct, gro_bitmask)); INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) goto err_name; net->dev_index_head = netdev_create_hash(); if (net->dev_index_head == NULL) goto err_idx; xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); return 0; err_idx: kfree(net->dev_name_head); err_name: return -ENOMEM; } /** * netdev_drivername - network driver for the device * @dev: network device * * Determine network driver for device. */ const char *netdev_drivername(const struct net_device *dev) { const struct device_driver *driver; const struct device *parent; const char *empty = ""; parent = dev->dev.parent; if (!parent) return empty; driver = parent->driver; if (driver && driver->name) return driver->name; return empty; } static void __netdev_printk(const char *level, const struct net_device *dev, struct va_format *vaf) { if (dev && dev->dev.parent) { dev_printk_emit(level[1] - '0', dev->dev.parent, "%s %s %s%s: %pV", dev_driver_string(dev->dev.parent), dev_name(dev->dev.parent), netdev_name(dev), netdev_reg_state(dev), vaf); } else if (dev) { printk("%s%s%s: %pV", level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { printk("%s(NULL net_device): %pV", level, vaf); } } void netdev_printk(const char *level, const struct net_device *dev, const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; __netdev_printk(level, dev, &vaf); va_end(args); } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ void func(const struct net_device *dev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_netdev_printk_level(netdev_emerg, KERN_EMERG); define_netdev_printk_level(netdev_alert, KERN_ALERT); define_netdev_printk_level(netdev_crit, KERN_CRIT); define_netdev_printk_level(netdev_err, KERN_ERR); define_netdev_printk_level(netdev_warn, KERN_WARNING); define_netdev_printk_level(netdev_notice, KERN_NOTICE); define_netdev_printk_level(netdev_info, KERN_INFO); static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); xa_destroy(&net->dev_by_index); if (net != &init_net) WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } static struct pernet_operations __net_initdata netdev_net_ops = { .init = netdev_init, .exit = netdev_exit, }; static void __net_exit default_device_exit_net(struct net *net) { struct netdev_name_node *name_node, *tmp; struct net_device *dev, *aux; /* * Push all migratable network devices back to the * initial network namespace */ ASSERT_RTNL(); for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ if (dev->netns_local) continue; /* Leave virtual devices for the generic cleanup */ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) continue; /* Push remaining network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (netdev_name_in_use(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); netdev_for_each_altname_safe(dev, name_node, tmp) if (netdev_name_in_use(&init_net, name_node->name)) __netdev_name_node_alt_destroy(name_node); err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); BUG(); } } } static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network * namespace. Do this in the reverse order of registration. * Do this across as many network namespaces as possible to * improve batching efficiency. */ struct net_device *dev; struct net *net; LIST_HEAD(dev_kill_list); rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { default_device_exit_net(net); cond_resched(); } list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); } } unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); } static struct pernet_operations __net_initdata default_device_ops = { .exit_batch = default_device_exit_batch, }; static void __init net_dev_struct_check(void) { /* TX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq); #ifdef CONFIG_XPS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps); #endif #ifdef CONFIG_NETFILTER_EGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress); #endif #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress); #endif CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160); /* TXRX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46); /* RX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net); #ifdef CONFIG_NETPOLL CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo); #endif #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); #endif CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92); } /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not * present) and leaves us with a valid list of present and active devices. * */ /* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */ #define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE) static int net_page_pool_create(int cpuid) { #if IS_ENABLED(CONFIG_PAGE_POOL) struct page_pool_params page_pool_params = { .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, .flags = PP_FLAG_SYSTEM_POOL, .nid = cpu_to_mem(cpuid), }; struct page_pool *pp_ptr; int err; pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); if (IS_ERR(pp_ptr)) return -ENOMEM; err = xdp_reg_page_pool(pp_ptr); if (err) { page_pool_destroy(pp_ptr); return err; } per_cpu(system_page_pool, cpuid) = pp_ptr; #endif return 0; } static int backlog_napi_should_run(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); struct napi_struct *napi = &sd->backlog; return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); } static void run_backlog_napi(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); napi_threaded_poll_loop(&sd->backlog); } static void backlog_napi_setup(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); struct napi_struct *napi = &sd->backlog; napi->thread = this_cpu_read(backlog_napi); set_bit(NAPI_STATE_THREADED, &napi->state); } static struct smp_hotplug_thread backlog_threads = { .store = &backlog_napi, .thread_should_run = backlog_napi_should_run, .thread_fn = run_backlog_napi, .thread_comm = "backlog_napi/%u", .setup = backlog_napi_setup, }; /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. */ static int __init net_dev_init(void) { int i, rc = -ENOMEM; BUG_ON(!dev_boot_phase); net_dev_struct_check(); if (dev_proc_init()) goto out; if (netdev_kobject_init()) goto out; for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); if (register_pernet_subsys(&netdev_net_ops)) goto out; /* * Initialise the packet receive queues. */ flush_backlogs_fallback = flush_backlogs_alloc(); if (!flush_backlogs_fallback) goto out; for_each_possible_cpu(i) { struct softnet_data *sd = &per_cpu(softnet_data, i); skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); #ifdef CONFIG_XFRM_OFFLOAD skb_queue_head_init(&sd->xfrm_backlog); #endif INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; INIT_LIST_HEAD(&sd->backlog.poll_list); if (net_page_pool_create(i)) goto out; } if (use_backlog_threads()) smpboot_register_percpu_thread(&backlog_threads); dev_boot_phase = 0; /* The loopback device is special if any other network devices * is present in a network namespace the loopback device must * be present. Since we now dynamically allocate and free the * loopback device ensure this invariant is maintained by * keeping the loopback device as the first device on the * list of network devices. Ensuring the loopback devices * is the first device that appears and the last network device * that disappears. */ if (register_pernet_device(&loopback_net_ops)) goto out; if (register_pernet_device(&default_device_ops)) goto out; open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", NULL, dev_cpu_dead); WARN_ON(rc < 0); rc = 0; /* avoid static key IPIs to isolated CPUs */ if (housekeeping_enabled(HK_TYPE_MISC)) net_enable_timestamp(); out: if (rc < 0) { for_each_possible_cpu(i) { struct page_pool *pp_ptr; pp_ptr = per_cpu(system_page_pool, i); if (!pp_ptr) continue; xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); per_cpu(system_page_pool, i) = NULL; } } return rc; } subsys_initcall(net_dev_init); |
| 361 362 349 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 | /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_MAPLE_TREE_H #define _LINUX_MAPLE_TREE_H /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle * Authors: Liam R. Howlett <Liam.Howlett@Oracle.com> * Matthew Wilcox <willy@infradead.org> */ #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> /* #define CONFIG_MAPLE_RCU_DISABLED */ /* * Allocated nodes are mutable until they have been inserted into the tree, * at which time they cannot change their type until they have been removed * from the tree and an RCU grace period has passed. * * Removed nodes have their ->parent set to point to themselves. RCU readers * check ->parent before relying on the value that they loaded from the * slots array. This lets us reuse the slots array for the RCU head. * * Nodes in the tree point to their parent unless bit 0 is set. */ #if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) /* 64bit sizes */ #define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) #else /* 32bit sizes */ #define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) #endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ #define MAPLE_NODE_MASK 255UL /* * The node->parent of the root node has bit 0 set and the rest of the pointer * is a pointer to the tree itself. No more bits are available in this pointer * (on m68k, the data structure may only be 2-byte aligned). * * Internal non-root nodes can only have maple_range_* nodes as parents. The * parent pointer is 256B aligned like all other tree nodes. When storing a 32 * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an * extra bit to store the offset. This extra bit comes from a reuse of the last * bit in the node type. This is possible by using bit 1 to indicate if bit 2 * is part of the type or the slot. * * Once the type is decided, the decision of an allocation range type or a * range type is done by examining the immutable tree flag for the * MT_FLAGS_ALLOC_RANGE flag. * * Node types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location * 0x??1 : Root * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 * 0x010 : 32 bit values, type in 0-2, slot in 3-6 * 0x110 : 64 bit values, type in 0-2, slot in 3-6 */ /* * This metadata is used to optimize the gap updating code and in reverse * searching for gaps or any other code that needs to find the end of the data. */ struct maple_metadata { unsigned char end; unsigned char gap; }; /* * Leaf nodes do not store pointers to nodes, they store user data. Users may * store almost any bit pattern. As noted above, the optimisation of storing an * entry at 0 in the root pointer cannot be done for data which have the bottom * two bits set to '10'. We also reserve values with the bottom two bits set to * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs * return errnos as a negative errno shifted right by two bits and the bottom * two bits set to '10', and while choosing to store these values in the array * is not an error, it may lead to confusion if you're testing for an error with * mas_is_err(). * * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits * 3-6), bit 2 is reserved. That leaves bits 0-1 unused for now. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges, Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. */ struct maple_range_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; union { void __rcu *slot[MAPLE_RANGE64_SLOTS]; struct { void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; struct maple_metadata meta; }; }; }; /* * At tree creation time, the user can specify that they're willing to trade off * storing fewer entries in a tree in return for storing more information in * each node. * * The maple tree supports recording the largest range of NULL entries available * in this node, also called gaps. This optimises the tree for allocating a * range. */ struct maple_arange_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; void __rcu *slot[MAPLE_ARANGE64_SLOTS]; unsigned long gap[MAPLE_ARANGE64_SLOTS]; struct maple_metadata meta; }; struct maple_alloc { unsigned long total; unsigned char node_count; unsigned int request_count; struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; }; struct maple_topiary { struct maple_pnode *parent; struct maple_enode *next; /* Overlaps the pivot */ }; enum maple_type { maple_dense, maple_leaf_64, maple_range_64, maple_arange_64, }; enum store_type { wr_invalid, wr_new_root, wr_store_root, wr_exact_fit, wr_spanning_store, wr_split_store, wr_rebalance, wr_append, wr_node_store, wr_slot_store, }; /** * DOC: Maple tree flags * * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree * * MT_FLAGS_USE_RCU - Operate in RCU mode * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value * * MT_FLAGS_LOCK_MASK - How the mt_lock is used * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe * * MT_FLAGS_LOCK_BH - Acquired bh-safe * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used * * MAPLE_HEIGHT_MAX The largest height that can be stored */ #define MT_FLAGS_ALLOC_RANGE 0x01 #define MT_FLAGS_USE_RCU 0x02 #define MT_FLAGS_HEIGHT_OFFSET 0x02 #define MT_FLAGS_HEIGHT_MASK 0x7C #define MT_FLAGS_LOCK_MASK 0x300 #define MT_FLAGS_LOCK_IRQ 0x100 #define MT_FLAGS_LOCK_BH 0x200 #define MT_FLAGS_LOCK_EXTERN 0x300 #define MT_FLAGS_ALLOC_WRAPPED 0x0800 #define MAPLE_HEIGHT_MAX 31 #define MAPLE_NODE_TYPE_MASK 0x0F #define MAPLE_NODE_TYPE_SHIFT 0x03 #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) #define mt_write_lock_is_held(mt) \ (!(mt)->ma_external_lock || \ lock_is_held_type((mt)->ma_external_lock, 0)) #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) #define mt_on_stack(mt) do { } while (0) #endif /* * If the tree contains a single entry at index 0, it is usually stored in * tree->ma_root. To optimise for the page cache, an entry which ends in '00', * '01' or '11' is stored in the root, but an entry which ends in '10' will be * stored in a node. Bits 3-6 are used to store enum maple_type. * * The flags are used both to store some immutable information about this tree * (set at tree creation time) and dynamic information set under the spinlock. * * Another use of flags are to indicate global states of the tree. This is the * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in * RCU mode. This mode was added to allow the tree to reuse nodes instead of * re-allocating and RCU freeing nodes when there is a single user. */ struct maple_tree { union { spinlock_t ma_lock; lockdep_map_p ma_external_lock; }; unsigned int ma_flags; void __rcu *ma_root; }; /** * MTREE_INIT() - Initialize a maple tree * @name: The maple tree name * @__flags: The maple tree flags * */ #define MTREE_INIT(name, __flags) { \ .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \ .ma_flags = __flags, \ .ma_root = NULL, \ } /** * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. * @name: The tree name * @__flags: The maple tree flags * @__lock: The external lock */ #ifdef CONFIG_LOCKDEP #define MTREE_INIT_EXT(name, __flags, __lock) { \ .ma_external_lock = &(__lock).dep_map, \ .ma_flags = (__flags), \ .ma_root = NULL, \ } #else #define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags) #endif #define DEFINE_MTREE(name) \ struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) #define mtree_lock_nested(mas, subclass) \ spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* * The Maple Tree squeezes various bits in at various points which aren't * necessarily obvious. Usually, this is done by observing that pointers are * N-byte aligned and thus the bottom log_2(N) bits are available for use. We * don't use the high bits of pointers to store additional information because * we don't know what bits are unused on any given architecture. * * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 * low bits for our own purposes. Nodes are currently of 4 types: * 1. Single pointer (Range is 0-0) * 2. Non-leaf Allocation Range nodes * 3. Non-leaf Range nodes * 4. Leaf Range nodes All nodes consist of a number of node slots, * pivots, and a parent pointer. */ struct maple_node { union { struct { struct maple_pnode *parent; void __rcu *slot[MAPLE_NODE_SLOTS]; }; struct { void *pad; struct rcu_head rcu; struct maple_enode *piv_parent; unsigned char parent_slot; enum maple_type type; unsigned char slot_len; unsigned int ma_flags; }; struct maple_range_64 mr64; struct maple_arange_64 ma64; struct maple_alloc alloc; }; }; /* * More complicated stores can cause two nodes to become one or three and * potentially alter the height of the tree. Either half of the tree may need * to be rebalanced against the other. The ma_topiary struct is used to track * which nodes have been 'cut' from the tree so that the change can be done * safely at a later date. This is done to support RCU. */ struct ma_topiary { struct maple_enode *head; struct maple_enode *tail; struct maple_tree *mtree; }; void *mtree_load(struct maple_tree *mt, unsigned long index); int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_store_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); /** * mtree_empty() - Determine if a tree has any present entries. * @mt: Maple Tree. * * Context: Any context. * Return: %true if the tree contains only NULL pointers. */ static inline bool mtree_empty(const struct maple_tree *mt) { return mt->ma_root == NULL; } /* Advanced API */ /* * Maple State Status * ma_active means the maple state is pointing to a node and offset and can * continue operating on the tree. * ma_start means we have not searched the tree. * ma_root means we have searched the tree and the entry we found lives in * the root of the tree (ie it has index 0, length 1 and is the only entry in * the tree). * ma_none means we have searched the tree and there is no node in the * tree for this entry. For example, we searched for index 1 in an empty * tree. Or we have a tree which points to a full leaf node and we * searched for an entry which is larger than can be contained in that * leaf node. * ma_pause means the data within the maple state may be stale, restart the * operation * ma_overflow means the search has reached the upper limit of the search * ma_underflow means the search has reached the lower limit of the search * ma_error means there was an error, check the node for the error number. */ enum maple_status { ma_active, ma_start, ma_root, ma_none, ma_pause, ma_overflow, ma_underflow, ma_error, }; /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the * advanced API. * * If state->node has bit 0 set then it references a tree location which is not * a node (eg the root). If bit 1 is set, the rest of the bits are a negative * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the * node type. * * state->alloc either has a request number of nodes or an allocated node. If * stat->alloc has a requested number of nodes, the first bit will be set (0x1) * and the remaining bits are the value. If state->alloc is a node, then the * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for * storing more allocated nodes, a total number of nodes allocated, and the * node_count in this node. node_count is the number of allocated nodes in this * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc * by removing a node from the state->alloc node until state->alloc->node_count * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted * to state->alloc. Nodes are pushed onto state->alloc by putting the current * state->alloc into the pushed node's slot[0]. * * The state also contains the implied min/max of the state->node, the depth of * this search, and the offset. The implied min/max are either from the parent * node or are 0-oo for the root node. The depth is incremented or decremented * every time a node is walked down or up. The offset is the slot/pivot of * interest in the node - either for reading or writing. * * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. * * The status of the state is used to determine how the next action should treat * the state. For instance, if the status is ma_start then the next action * should start at the root of the tree and walk down. If the status is * ma_pause then the node may be stale data and should be discarded. If the * status is ma_overflow, then the last action hit the upper limit. * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ unsigned long index; /* The index we're operating on - range start */ unsigned long last; /* The last index we're operating on - range end */ struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct maple_alloc *alloc; /* Allocated nodes for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; unsigned char end; /* The end of the node */ enum store_type store_type; /* The type of store needed for this operation */ }; struct ma_wr_state { struct ma_state *mas; struct maple_node *node; /* Decoded mas->node */ unsigned long r_min; /* range min */ unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) #define mas_lock_nested(mas, subclass) \ spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) /* * Special values for ma_state.node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. */ #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) #define MA_STATE(name, mt, first, end) \ struct ma_state name = { \ .tree = mt, \ .index = first, \ .last = end, \ .node = NULL, \ .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ struct ma_wr_state name = { \ .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ } #define MA_TOPIARY(name, tree) \ struct ma_topiary name = { \ .head = NULL, \ .tail = NULL, \ .mtree = tree, \ } void *mas_walk(struct ma_state *mas); void *mas_store(struct ma_state *mas, void *entry); void *mas_erase(struct ma_state *mas); int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); void maple_tree_init(void); void mas_destroy(struct ma_state *mas); int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_prev_range(struct ma_state *mas, unsigned long max); void *mas_next(struct ma_state *mas, unsigned long max); void *mas_next_range(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, unsigned long addr) { memset(mas, 0, sizeof(struct ma_state)); mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; mas->status = ma_start; mas->node = NULL; } static inline bool mas_is_active(struct ma_state *mas) { return mas->status == ma_active; } static inline bool mas_is_err(struct ma_state *mas) { return mas->status == ma_error; } /** * mas_reset() - Reset a Maple Tree operation state. * @mas: Maple Tree operation state. * * Resets the error or walk state of the @mas so future walks of the * array will start from the root. Use this if you have dropped the * lock and want to reuse the ma_state. * * Context: Any context. */ static __always_inline void mas_reset(struct ma_state *mas) { mas->status = ma_start; mas->node = NULL; } /** * mas_for_each() - Iterate over a range of the maple tree. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__max: maximum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) /** * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__min: minimum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each_rev(__mas, __entry, __min) \ while (((__entry) = mas_find_rev((__mas), (__min))) != NULL) #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { mt_dump_dec, mt_dump_hex, }; extern atomic_t maple_tree_tests_run; extern atomic_t maple_tree_tests_passed; void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); void mas_dump(const struct ma_state *mas); void mas_wr_dump(const struct ma_wr_state *wr_mas); void mt_validate(struct maple_tree *mt); void mt_cache_shrink(void); #define MT_BUG_ON(__tree, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_BUG_ON(__mas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_WR_BUG_ON(__wrmas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MT_WARN_ON(__tree, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WARN_ON(__mas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WR_WARN_ON(__wrmas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #else #define MT_BUG_ON(__tree, __x) BUG_ON(__x) #define MAS_BUG_ON(__mas, __x) BUG_ON(__x) #define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) #define MT_WARN_ON(__tree, __x) WARN_ON(__x) #define MAS_WARN_ON(__mas, __x) WARN_ON(__x) #define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) #endif /* CONFIG_DEBUG_MAPLE_TREE */ /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * set the internal maple state values to a sub-range. * Please use mas_set_range() if you do not know where you are in the tree. */ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { /* Ensure the range starts within the current slot */ MAS_WARN_ON(mas, mas_is_active(mas) && (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } /** * mas_set_range() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * Move the operation state to refer to a different range. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { mas_reset(mas); __mas_set_range(mas, start, last); } /** * mas_set() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @index: New index into the Maple Tree. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set(struct ma_state *mas, unsigned long index) { mas_set_range(mas, index, index); } static inline bool mt_external_lock(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; } /** * mt_init_flags() - Initialise an empty maple tree with flags. * @mt: Maple Tree * @flags: maple tree flags. * * If you need to initialise a Maple Tree with special flags (eg, an * allocation tree), use this function. * * Context: Any context. */ static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) { mt->ma_flags = flags; if (!mt_external_lock(mt)) spin_lock_init(&mt->ma_lock); rcu_assign_pointer(mt->ma_root, NULL); } /** * mt_init() - Initialise an empty maple tree. * @mt: Maple Tree * * An empty Maple Tree. * * Context: Any context. */ static inline void mt_init(struct maple_tree *mt) { mt_init_flags(mt, 0); } static inline bool mt_in_rcu(struct maple_tree *mt) { #ifdef CONFIG_MAPLE_RCU_DISABLED return false; #endif return mt->ma_flags & MT_FLAGS_USE_RCU; } /** * mt_clear_in_rcu() - Switch the tree to non-RCU mode. * @mt: The Maple Tree */ static inline void mt_clear_in_rcu(struct maple_tree *mt) { if (!mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags &= ~MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags &= ~MT_FLAGS_USE_RCU; mtree_unlock(mt); } } /** * mt_set_in_rcu() - Switch the tree to RCU safe mode. * @mt: The Maple Tree */ static inline void mt_set_in_rcu(struct maple_tree *mt) { if (mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags |= MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags |= MT_FLAGS_USE_RCU; mtree_unlock(mt); } } static inline unsigned int mt_height(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); /** * mt_for_each - Iterate over each entry starting at index until max. * @__tree: The Maple Tree * @__entry: The current entry * @__index: The index to start the search from. Subsequently used as iterator. * @__max: The maximum limit for @index * * This iterator skips all entries, which resolve to a NULL pointer, * e.g. entries which has been reserved with XA_ZERO_ENTRY. */ #define mt_for_each(__tree, __entry, __index, __max) \ for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) #endif /*_LINUX_MAPLE_TREE_H */ |
| 203 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | /* SPDX-License-Identifier: GPL-2.0 */ /* * A hash table (hashtab) maintains associations between * key values and datum values. The type of the key values * and the type of the datum values is arbitrary. The * functions for hash computation and key comparison are * provided by the creator of the table. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ #ifndef _SS_HASHTAB_H_ #define _SS_HASHTAB_H_ #include <linux/types.h> #include <linux/errno.h> #include <linux/sched.h> #define HASHTAB_MAX_NODES U32_MAX struct hashtab_key_params { u32 (*hash)(const void *key); /* hash func */ int (*cmp)(const void *key1, const void *key2); /* comparison func */ }; struct hashtab_node { void *key; void *datum; struct hashtab_node *next; }; struct hashtab { struct hashtab_node **htable; /* hash table */ u32 size; /* number of slots in hash table */ u32 nel; /* number of elements in hash table */ }; struct hashtab_info { u32 slots_used; u32 max_chain_len; u64 chain2_len_sum; }; /* * Initializes a new hash table with the specified characteristics. * * Returns -ENOMEM if insufficient space is available or 0 otherwise. */ int hashtab_init(struct hashtab *h, u32 nel_hint); int __hashtab_insert(struct hashtab *h, struct hashtab_node **dst, void *key, void *datum); /* * Inserts the specified (key, datum) pair into the specified hash table. * * Returns -ENOMEM on memory allocation error, * -EEXIST if there is already an entry with the same key, * -EINVAL for general errors or 0 otherwise. */ static inline int hashtab_insert(struct hashtab *h, void *key, void *datum, struct hashtab_key_params key_params) { u32 hvalue; struct hashtab_node *prev, *cur; cond_resched(); if (!h->size || h->nel == HASHTAB_MAX_NODES) return -EINVAL; hvalue = key_params.hash(key) & (h->size - 1); prev = NULL; cur = h->htable[hvalue]; while (cur) { int cmp = key_params.cmp(key, cur->key); if (cmp == 0) return -EEXIST; if (cmp < 0) break; prev = cur; cur = cur->next; } return __hashtab_insert(h, prev ? &prev->next : &h->htable[hvalue], key, datum); } /* * Searches for the entry with the specified key in the hash table. * * Returns NULL if no entry has the specified key or * the datum of the entry otherwise. */ static inline void *hashtab_search(struct hashtab *h, const void *key, struct hashtab_key_params key_params) { u32 hvalue; struct hashtab_node *cur; if (!h->size) return NULL; hvalue = key_params.hash(key) & (h->size - 1); cur = h->htable[hvalue]; while (cur) { int cmp = key_params.cmp(key, cur->key); if (cmp == 0) return cur->datum; if (cmp < 0) break; cur = cur->next; } return NULL; } /* * Destroys the specified hash table. */ void hashtab_destroy(struct hashtab *h); /* * Applies the specified apply function to (key,datum,args) * for each entry in the specified hash table. * * The order in which the function is applied to the entries * is dependent upon the internal structure of the hash table. * * If apply returns a non-zero status, then hashtab_map will cease * iterating through the hash table and will propagate the error * return to its caller. */ int hashtab_map(struct hashtab *h, int (*apply)(void *k, void *d, void *args), void *args); int hashtab_duplicate(struct hashtab *new, const struct hashtab *orig, int (*copy)(struct hashtab_node *new, const struct hashtab_node *orig, void *args), int (*destroy)(void *k, void *d, void *args), void *args); #ifdef CONFIG_SECURITY_SELINUX_DEBUG /* Fill info with some hash table statistics */ void hashtab_stat(struct hashtab *h, struct hashtab_info *info); #else static inline void hashtab_stat(struct hashtab *h, struct hashtab_info *info) { return; } #endif #endif /* _SS_HASHTAB_H */ |
| 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 | // SPDX-License-Identifier: GPL-2.0-only /* * GENEVE: Generic Network Virtualization Encapsulation * * Copyright (c) 2015 Red Hat, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/hash.h> #include <net/ipv6_stubs.h> #include <net/dst_metadata.h> #include <net/gro_cells.h> #include <net/rtnetlink.h> #include <net/geneve.h> #include <net/gro.h> #include <net/protocol.h> #define GENEVE_NETDEV_VER "0.6" #define GENEVE_N_VID (1u << 24) #define GENEVE_VID_MASK (GENEVE_N_VID - 1) #define VNI_HASH_BITS 10 #define VNI_HASH_SIZE (1<<VNI_HASH_BITS) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); #define GENEVE_VER 0 #define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) #define GENEVE_IPV4_HLEN (ETH_HLEN + sizeof(struct iphdr) + GENEVE_BASE_HLEN) #define GENEVE_IPV6_HLEN (ETH_HLEN + sizeof(struct ipv6hdr) + GENEVE_BASE_HLEN) /* per-network namespace private data for this module */ struct geneve_net { struct list_head geneve_list; struct list_head sock_list; }; static unsigned int geneve_net_id; struct geneve_dev_node { struct hlist_node hlist; struct geneve_dev *geneve; }; struct geneve_config { struct ip_tunnel_info info; bool collect_md; bool use_udp6_rx_checksums; bool ttl_inherit; enum ifla_geneve_df df; bool inner_proto_inherit; }; /* Pseudo network device */ struct geneve_dev { struct geneve_dev_node hlist4; /* vni hash table for IPv4 socket */ #if IS_ENABLED(CONFIG_IPV6) struct geneve_dev_node hlist6; /* vni hash table for IPv6 socket */ #endif struct net *net; /* netns for packet i/o */ struct net_device *dev; /* netdev for geneve tunnel */ struct geneve_sock __rcu *sock4; /* IPv4 socket used for geneve tunnel */ #if IS_ENABLED(CONFIG_IPV6) struct geneve_sock __rcu *sock6; /* IPv6 socket used for geneve tunnel */ #endif struct list_head next; /* geneve's per namespace list */ struct gro_cells gro_cells; struct geneve_config cfg; }; struct geneve_sock { bool collect_md; struct list_head list; struct socket *sock; struct rcu_head rcu; int refcnt; struct hlist_head vni_list[VNI_HASH_SIZE]; }; static inline __u32 geneve_net_vni_hash(u8 vni[3]) { __u32 vnid; vnid = (vni[0] << 16) | (vni[1] << 8) | vni[2]; return hash_32(vnid, VNI_HASH_BITS); } static __be64 vni_to_tunnel_id(const __u8 *vni) { #ifdef __BIG_ENDIAN return (vni[0] << 16) | (vni[1] << 8) | vni[2]; #else return (__force __be64)(((__force u64)vni[0] << 40) | ((__force u64)vni[1] << 48) | ((__force u64)vni[2] << 56)); #endif } /* Convert 64 bit tunnel ID to 24 bit VNI. */ static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) { #ifdef __BIG_ENDIAN vni[0] = (__force __u8)(tun_id >> 16); vni[1] = (__force __u8)(tun_id >> 8); vni[2] = (__force __u8)tun_id; #else vni[0] = (__force __u8)((__force u64)tun_id >> 40); vni[1] = (__force __u8)((__force u64)tun_id >> 48); vni[2] = (__force __u8)((__force u64)tun_id >> 56); #endif } static bool eq_tun_id_and_vni(u8 *tun_id, u8 *vni) { return !memcmp(vni, &tun_id[5], 3); } static sa_family_t geneve_get_sk_family(struct geneve_sock *gs) { return gs->sock->sk->sk_family; } static struct geneve_dev *geneve_lookup(struct geneve_sock *gs, __be32 addr, u8 vni[]) { struct hlist_head *vni_list_head; struct geneve_dev_node *node; __u32 hash; /* Find the device for this VNI */ hash = geneve_net_vni_hash(vni); vni_list_head = &gs->vni_list[hash]; hlist_for_each_entry_rcu(node, vni_list_head, hlist) { if (eq_tun_id_and_vni((u8 *)&node->geneve->cfg.info.key.tun_id, vni) && addr == node->geneve->cfg.info.key.u.ipv4.dst) return node->geneve; } return NULL; } #if IS_ENABLED(CONFIG_IPV6) static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs, struct in6_addr addr6, u8 vni[]) { struct hlist_head *vni_list_head; struct geneve_dev_node *node; __u32 hash; /* Find the device for this VNI */ hash = geneve_net_vni_hash(vni); vni_list_head = &gs->vni_list[hash]; hlist_for_each_entry_rcu(node, vni_list_head, hlist) { if (eq_tun_id_and_vni((u8 *)&node->geneve->cfg.info.key.tun_id, vni) && ipv6_addr_equal(&addr6, &node->geneve->cfg.info.key.u.ipv6.dst)) return node->geneve; } return NULL; } #endif static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) { return (struct genevehdr *)(udp_hdr(skb) + 1); } static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs, struct sk_buff *skb) { static u8 zero_vni[3]; u8 *vni; if (geneve_get_sk_family(gs) == AF_INET) { struct iphdr *iph; __be32 addr; iph = ip_hdr(skb); /* outer IP header... */ if (gs->collect_md) { vni = zero_vni; addr = 0; } else { vni = geneve_hdr(skb)->vni; addr = iph->saddr; } return geneve_lookup(gs, addr, vni); #if IS_ENABLED(CONFIG_IPV6) } else if (geneve_get_sk_family(gs) == AF_INET6) { static struct in6_addr zero_addr6; struct ipv6hdr *ip6h; struct in6_addr addr6; ip6h = ipv6_hdr(skb); /* outer IPv6 header... */ if (gs->collect_md) { vni = zero_vni; addr6 = zero_addr6; } else { vni = geneve_hdr(skb)->vni; addr6 = ip6h->saddr; } return geneve6_lookup(gs, addr6, vni); #endif } return NULL; } /* geneve receive/decap routine */ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, struct sk_buff *skb) { struct genevehdr *gnvh = geneve_hdr(skb); struct metadata_dst *tun_dst = NULL; unsigned int len; int nh, err = 0; void *oiph; if (ip_tunnel_collect_metadata() || gs->collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __set_bit(IP_TUNNEL_KEY_BIT, flags); __assign_bit(IP_TUNNEL_OAM_BIT, flags, gnvh->oam); __assign_bit(IP_TUNNEL_CRIT_OPT_BIT, flags, gnvh->critical); tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags, vni_to_tunnel_id(gnvh->vni), gnvh->opt_len * 4); if (!tun_dst) { dev_dstats_rx_dropped(geneve->dev); goto drop; } /* Update tunnel dst according to Geneve options. */ ip_tunnel_flags_zero(flags); __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, flags); ip_tunnel_info_opts_set(&tun_dst->u.tun_info, gnvh->options, gnvh->opt_len * 4, flags); } else { /* Drop packets w/ critical options, * since we don't support any... */ if (gnvh->critical) { DEV_STATS_INC(geneve->dev, rx_frame_errors); DEV_STATS_INC(geneve->dev, rx_errors); goto drop; } } if (tun_dst) skb_dst_set(skb, &tun_dst->dst); if (gnvh->proto_type == htons(ETH_P_TEB)) { skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, geneve->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); /* Ignore packet loops (and multicast echo) */ if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { DEV_STATS_INC(geneve->dev, rx_errors); goto drop; } } else { skb_reset_mac_header(skb); skb->dev = geneve->dev; skb->pkt_type = PACKET_HOST; } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(geneve->dev, rx_length_errors); DEV_STATS_INC(geneve->dev, rx_errors); goto drop; } /* Get the outer header. */ oiph = skb->head + nh; if (geneve_get_sk_family(gs) == AF_INET) err = IP_ECN_decapsulate(oiph, skb); #if IS_ENABLED(CONFIG_IPV6) else err = IP6_ECN_decapsulate(oiph, skb); #endif if (unlikely(err)) { if (log_ecn_error) { if (geneve_get_sk_family(gs) == AF_INET) net_info_ratelimited("non-ECT from %pI4 " "with TOS=%#x\n", &((struct iphdr *)oiph)->saddr, ((struct iphdr *)oiph)->tos); #if IS_ENABLED(CONFIG_IPV6) else net_info_ratelimited("non-ECT from %pI6\n", &((struct ipv6hdr *)oiph)->saddr); #endif } if (err > 1) { DEV_STATS_INC(geneve->dev, rx_frame_errors); DEV_STATS_INC(geneve->dev, rx_errors); goto drop; } } len = skb->len; err = gro_cells_receive(&geneve->gro_cells, skb); if (likely(err == NET_RX_SUCCESS)) dev_dstats_rx_add(geneve->dev, len); return; drop: /* Consume bad packet */ kfree_skb(skb); } /* Setup stats when device is created */ static int geneve_init(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); int err; err = gro_cells_init(&geneve->gro_cells, dev); if (err) return err; err = dst_cache_init(&geneve->cfg.info.dst_cache, GFP_KERNEL); if (err) { gro_cells_destroy(&geneve->gro_cells); return err; } netdev_lockdep_set_classes(dev); return 0; } static void geneve_uninit(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); dst_cache_destroy(&geneve->cfg.info.dst_cache); gro_cells_destroy(&geneve->gro_cells); } /* Callback from net/ipv4/udp.c to receive packets */ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct genevehdr *geneveh; struct geneve_dev *geneve; struct geneve_sock *gs; __be16 inner_proto; int opts_len; /* Need UDP and Geneve header to be present */ if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) goto drop; /* Return packets with reserved bits set */ geneveh = geneve_hdr(skb); if (unlikely(geneveh->ver != GENEVE_VER)) goto drop; gs = rcu_dereference_sk_user_data(sk); if (!gs) goto drop; geneve = geneve_lookup_skb(gs, skb); if (!geneve) goto drop; inner_proto = geneveh->proto_type; if (unlikely((!geneve->cfg.inner_proto_inherit && inner_proto != htons(ETH_P_TEB)))) { dev_dstats_rx_dropped(geneve->dev); goto drop; } opts_len = geneveh->opt_len * 4; if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto, !net_eq(geneve->net, dev_net(geneve->dev)))) { dev_dstats_rx_dropped(geneve->dev); goto drop; } geneve_rx(geneve, gs, skb); return 0; drop: /* Consume bad packet */ kfree_skb(skb); return 0; } /* Callback from net/ipv{4,6}/udp.c to check that we have a tunnel for errors */ static int geneve_udp_encap_err_lookup(struct sock *sk, struct sk_buff *skb) { struct genevehdr *geneveh; struct geneve_sock *gs; u8 zero_vni[3] = { 0 }; u8 *vni = zero_vni; if (!pskb_may_pull(skb, skb_transport_offset(skb) + GENEVE_BASE_HLEN)) return -EINVAL; geneveh = geneve_hdr(skb); if (geneveh->ver != GENEVE_VER) return -EINVAL; if (geneveh->proto_type != htons(ETH_P_TEB)) return -EINVAL; gs = rcu_dereference_sk_user_data(sk); if (!gs) return -ENOENT; if (geneve_get_sk_family(gs) == AF_INET) { struct iphdr *iph = ip_hdr(skb); __be32 addr4 = 0; if (!gs->collect_md) { vni = geneve_hdr(skb)->vni; addr4 = iph->daddr; } return geneve_lookup(gs, addr4, vni) ? 0 : -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) if (geneve_get_sk_family(gs) == AF_INET6) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct in6_addr addr6; memset(&addr6, 0, sizeof(struct in6_addr)); if (!gs->collect_md) { vni = geneve_hdr(skb)->vni; addr6 = ip6h->daddr; } return geneve6_lookup(gs, addr6, vni) ? 0 : -ENOENT; } #endif return -EPFNOSUPPORT; } static struct socket *geneve_create_sock(struct net *net, bool ipv6, __be16 port, bool ipv6_rx_csum) { struct socket *sock; struct udp_port_cfg udp_conf; int err; memset(&udp_conf, 0, sizeof(udp_conf)); if (ipv6) { udp_conf.family = AF_INET6; udp_conf.ipv6_v6only = 1; udp_conf.use_udp6_rx_checksums = ipv6_rx_csum; } else { udp_conf.family = AF_INET; udp_conf.local_ip.s_addr = htonl(INADDR_ANY); } udp_conf.local_udp_port = port; /* Open UDP socket */ err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) return ERR_PTR(err); udp_allow_gso(sock->sk); return sock; } static int geneve_hlen(struct genevehdr *gh) { return sizeof(*gh) + gh->opt_len * 4; } static struct sk_buff *geneve_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { struct sk_buff *pp = NULL; struct sk_buff *p; struct genevehdr *gh, *gh2; unsigned int hlen, gh_len, off_gnv; const struct packet_offload *ptype; __be16 type; int flush = 1; off_gnv = skb_gro_offset(skb); hlen = off_gnv + sizeof(*gh); gh = skb_gro_header(skb, hlen, off_gnv); if (unlikely(!gh)) goto out; if (gh->ver != GENEVE_VER || gh->oam) goto out; gh_len = geneve_hlen(gh); hlen = off_gnv + gh_len; if (!skb_gro_may_pull(skb, hlen)) { gh = skb_gro_header_slow(skb, hlen, off_gnv); if (unlikely(!gh)) goto out; } list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; gh2 = (struct genevehdr *)(p->data + off_gnv); if (gh->opt_len != gh2->opt_len || memcmp(gh, gh2, gh_len)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } skb_gro_pull(skb, gh_len); skb_gro_postpull_rcsum(skb, gh, gh_len); type = gh->proto_type; if (likely(type == htons(ETH_P_TEB))) return call_gro_receive(eth_gro_receive, head, skb); ptype = gro_find_receive_by_type(type); if (!ptype) goto out; pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; out: skb_gro_flush_final(skb, pp, flush); return pp; } static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct genevehdr *gh; struct packet_offload *ptype; __be16 type; int gh_len; int err = -ENOSYS; gh = (struct genevehdr *)(skb->data + nhoff); gh_len = geneve_hlen(gh); type = gh->proto_type; /* since skb->encapsulation is set, eth_gro_complete() sets the inner mac header */ if (likely(type == htons(ETH_P_TEB))) return eth_gro_complete(skb, nhoff + gh_len); ptype = gro_find_complete_by_type(type); if (ptype) err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); skb_set_inner_mac_header(skb, nhoff + gh_len); return err; } /* Create new listen socket if needed */ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, bool ipv6, bool ipv6_rx_csum) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_sock *gs; struct socket *sock; struct udp_tunnel_sock_cfg tunnel_cfg; int h; gs = kzalloc(sizeof(*gs), GFP_KERNEL); if (!gs) return ERR_PTR(-ENOMEM); sock = geneve_create_sock(net, ipv6, port, ipv6_rx_csum); if (IS_ERR(sock)) { kfree(gs); return ERR_CAST(sock); } gs->sock = sock; gs->refcnt = 1; for (h = 0; h < VNI_HASH_SIZE; ++h) INIT_HLIST_HEAD(&gs->vni_list[h]); /* Initialize the geneve udp offloads structure */ udp_tunnel_notify_add_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE); /* Mark socket as an encapsulation socket */ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = gs; tunnel_cfg.encap_type = 1; tunnel_cfg.gro_receive = geneve_gro_receive; tunnel_cfg.gro_complete = geneve_gro_complete; tunnel_cfg.encap_rcv = geneve_udp_encap_recv; tunnel_cfg.encap_err_lookup = geneve_udp_encap_err_lookup; tunnel_cfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, sock, &tunnel_cfg); list_add(&gs->list, &gn->sock_list); return gs; } static void __geneve_sock_release(struct geneve_sock *gs) { if (!gs || --gs->refcnt) return; list_del(&gs->list); udp_tunnel_notify_del_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE); udp_tunnel_sock_release(gs->sock); kfree_rcu(gs, rcu); } static void geneve_sock_release(struct geneve_dev *geneve) { struct geneve_sock *gs4 = rtnl_dereference(geneve->sock4); #if IS_ENABLED(CONFIG_IPV6) struct geneve_sock *gs6 = rtnl_dereference(geneve->sock6); rcu_assign_pointer(geneve->sock6, NULL); #endif rcu_assign_pointer(geneve->sock4, NULL); synchronize_net(); __geneve_sock_release(gs4); #if IS_ENABLED(CONFIG_IPV6) __geneve_sock_release(gs6); #endif } static struct geneve_sock *geneve_find_sock(struct geneve_net *gn, sa_family_t family, __be16 dst_port) { struct geneve_sock *gs; list_for_each_entry(gs, &gn->sock_list, list) { if (inet_sk(gs->sock->sk)->inet_sport == dst_port && geneve_get_sk_family(gs) == family) { return gs; } } return NULL; } static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6) { struct net *net = geneve->net; struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev_node *node; struct geneve_sock *gs; __u8 vni[3]; __u32 hash; gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->cfg.info.key.tp_dst); if (gs) { gs->refcnt++; goto out; } gs = geneve_socket_create(net, geneve->cfg.info.key.tp_dst, ipv6, geneve->cfg.use_udp6_rx_checksums); if (IS_ERR(gs)) return PTR_ERR(gs); out: gs->collect_md = geneve->cfg.collect_md; #if IS_ENABLED(CONFIG_IPV6) if (ipv6) { rcu_assign_pointer(geneve->sock6, gs); node = &geneve->hlist6; } else #endif { rcu_assign_pointer(geneve->sock4, gs); node = &geneve->hlist4; } node->geneve = geneve; tunnel_id_to_vni(geneve->cfg.info.key.tun_id, vni); hash = geneve_net_vni_hash(vni); hlist_add_head_rcu(&node->hlist, &gs->vni_list[hash]); return 0; } static int geneve_open(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); bool metadata = geneve->cfg.collect_md; bool ipv4, ipv6; int ret = 0; ipv6 = geneve->cfg.info.mode & IP_TUNNEL_INFO_IPV6 || metadata; ipv4 = !ipv6 || metadata; #if IS_ENABLED(CONFIG_IPV6) if (ipv6) { ret = geneve_sock_add(geneve, true); if (ret < 0 && ret != -EAFNOSUPPORT) ipv4 = false; } #endif if (ipv4) ret = geneve_sock_add(geneve, false); if (ret < 0) geneve_sock_release(geneve); return ret; } static int geneve_stop(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); hlist_del_init_rcu(&geneve->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) hlist_del_init_rcu(&geneve->hlist6.hlist); #endif geneve_sock_release(geneve); return 0; } static void geneve_build_header(struct genevehdr *geneveh, const struct ip_tunnel_info *info, __be16 inner_proto) { geneveh->ver = GENEVE_VER; geneveh->opt_len = info->options_len / 4; geneveh->oam = test_bit(IP_TUNNEL_OAM_BIT, info->key.tun_flags); geneveh->critical = test_bit(IP_TUNNEL_CRIT_OPT_BIT, info->key.tun_flags); geneveh->rsvd1 = 0; tunnel_id_to_vni(info->key.tun_id, geneveh->vni); geneveh->proto_type = inner_proto; geneveh->rsvd2 = 0; if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) ip_tunnel_info_opts_get(geneveh->options, info); } static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, const struct ip_tunnel_info *info, bool xnet, int ip_hdr_len, bool inner_proto_inherit) { bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); struct genevehdr *gnvh; __be16 inner_proto; int min_headroom; int err; skb_reset_mac_header(skb); skb_scrub_packet(skb, xnet); min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + GENEVE_BASE_HLEN + info->options_len + ip_hdr_len; err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_dst; err = udp_tunnel_handle_offloads(skb, udp_sum); if (err) goto free_dst; gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len); inner_proto = inner_proto_inherit ? skb->protocol : htons(ETH_P_TEB); geneve_build_header(gnvh, info, inner_proto); skb_set_inner_protocol(skb, inner_proto); return 0; free_dst: dst_release(dst); return err; } static u8 geneve_get_dsfield(struct sk_buff *skb, struct net_device *dev, const struct ip_tunnel_info *info, bool *use_cache) { struct geneve_dev *geneve = netdev_priv(dev); u8 dsfield; dsfield = info->key.tos; if (dsfield == 1 && !geneve->cfg.collect_md) { dsfield = ip_tunnel_get_dsfield(ip_hdr(skb), skb); *use_cache = false; } return dsfield; } static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct geneve_dev *geneve, const struct ip_tunnel_info *info) { bool inner_proto_inherit = geneve->cfg.inner_proto_inherit; bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); struct geneve_sock *gs4 = rcu_dereference(geneve->sock4); const struct ip_tunnel_key *key = &info->key; struct rtable *rt; bool use_cache; __u8 tos, ttl; __be16 df = 0; __be32 saddr; __be16 sport; int err; if (skb_vlan_inet_prepare(skb, inner_proto_inherit)) return -EINVAL; if (!gs4) return -EIO; use_cache = ip_tunnel_dst_cache_usable(skb, info); tos = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr, &info->key, sport, geneve->cfg.info.key.tp_dst, tos, use_cache ? (struct dst_cache *)&info->dst_cache : NULL); if (IS_ERR(rt)) return PTR_ERR(rt); err = skb_tunnel_check_pmtu(skb, &rt->dst, GENEVE_IPV4_HLEN + info->options_len, netif_is_any_bridge_port(dev)); if (err < 0) { dst_release(&rt->dst); return err; } else if (err) { struct ip_tunnel_info *info; info = skb_tunnel_info(skb); if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) { dst_release(&rt->dst); return -ENOMEM; } unclone->key.u.ipv4.dst = saddr; unclone->key.u.ipv4.src = info->key.u.ipv4.dst; } if (!pskb_may_pull(skb, ETH_HLEN)) { dst_release(&rt->dst); return -EINVAL; } skb->protocol = eth_type_trans(skb, geneve->dev); __netif_rx(skb); dst_release(&rt->dst); return -EMSGSIZE; } tos = ip_tunnel_ecn_encap(tos, ip_hdr(skb), skb); if (geneve->cfg.collect_md) { ttl = key->ttl; df = test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) ? htons(IP_DF) : 0; } else { if (geneve->cfg.ttl_inherit) ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb); else ttl = key->ttl; ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); if (geneve->cfg.df == GENEVE_DF_SET) { df = htons(IP_DF); } else if (geneve->cfg.df == GENEVE_DF_INHERIT) { struct ethhdr *eth = skb_eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_IPV6) { df = htons(IP_DF); } else if (ntohs(eth->h_proto) == ETH_P_IP) { struct iphdr *iph = ip_hdr(skb); if (iph->frag_off & htons(IP_DF)) df = htons(IP_DF); } } } err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), inner_proto_inherit); if (unlikely(err)) return err; udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, saddr, info->key.u.ipv4.dst, tos, ttl, df, sport, geneve->cfg.info.key.tp_dst, !net_eq(geneve->net, dev_net(geneve->dev)), !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)); return 0; } #if IS_ENABLED(CONFIG_IPV6) static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct geneve_dev *geneve, const struct ip_tunnel_info *info) { bool inner_proto_inherit = geneve->cfg.inner_proto_inherit; bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); struct geneve_sock *gs6 = rcu_dereference(geneve->sock6); const struct ip_tunnel_key *key = &info->key; struct dst_entry *dst = NULL; struct in6_addr saddr; bool use_cache; __u8 prio, ttl; __be16 sport; int err; if (skb_vlan_inet_prepare(skb, inner_proto_inherit)) return -EINVAL; if (!gs6) return -EIO; use_cache = ip_tunnel_dst_cache_usable(skb, info); prio = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0, &saddr, key, sport, geneve->cfg.info.key.tp_dst, prio, use_cache ? (struct dst_cache *)&info->dst_cache : NULL); if (IS_ERR(dst)) return PTR_ERR(dst); err = skb_tunnel_check_pmtu(skb, dst, GENEVE_IPV6_HLEN + info->options_len, netif_is_any_bridge_port(dev)); if (err < 0) { dst_release(dst); return err; } else if (err) { struct ip_tunnel_info *info = skb_tunnel_info(skb); if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) { dst_release(dst); return -ENOMEM; } unclone->key.u.ipv6.dst = saddr; unclone->key.u.ipv6.src = info->key.u.ipv6.dst; } if (!pskb_may_pull(skb, ETH_HLEN)) { dst_release(dst); return -EINVAL; } skb->protocol = eth_type_trans(skb, geneve->dev); __netif_rx(skb); dst_release(dst); return -EMSGSIZE; } prio = ip_tunnel_ecn_encap(prio, ip_hdr(skb), skb); if (geneve->cfg.collect_md) { ttl = key->ttl; } else { if (geneve->cfg.ttl_inherit) ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb); else ttl = key->ttl; ttl = ttl ? : ip6_dst_hoplimit(dst); } err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), inner_proto_inherit); if (unlikely(err)) return err; udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev, &saddr, &key->u.ipv6.dst, prio, ttl, info->key.label, sport, geneve->cfg.info.key.tp_dst, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)); return 0; } #endif static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); struct ip_tunnel_info *info = NULL; int err; if (geneve->cfg.collect_md) { info = skb_tunnel_info(skb); if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { netdev_dbg(dev, "no tunnel metadata\n"); dev_kfree_skb(skb); dev_dstats_tx_dropped(dev); return NETDEV_TX_OK; } } else { info = &geneve->cfg.info; } rcu_read_lock(); #if IS_ENABLED(CONFIG_IPV6) if (info->mode & IP_TUNNEL_INFO_IPV6) err = geneve6_xmit_skb(skb, dev, geneve, info); else #endif err = geneve_xmit_skb(skb, dev, geneve, info); rcu_read_unlock(); if (likely(!err)) return NETDEV_TX_OK; if (err != -EMSGSIZE) dev_kfree_skb(skb); if (err == -ELOOP) DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) DEV_STATS_INC(dev, tx_carrier_errors); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static int geneve_change_mtu(struct net_device *dev, int new_mtu) { if (new_mtu > dev->max_mtu) new_mtu = dev->max_mtu; else if (new_mtu < dev->min_mtu) new_mtu = dev->min_mtu; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); struct geneve_dev *geneve = netdev_priv(dev); __be16 sport; if (ip_tunnel_info_af(info) == AF_INET) { struct rtable *rt; struct geneve_sock *gs4 = rcu_dereference(geneve->sock4); bool use_cache; __be32 saddr; u8 tos; if (!gs4) return -EIO; use_cache = ip_tunnel_dst_cache_usable(skb, info); tos = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr, &info->key, sport, geneve->cfg.info.key.tp_dst, tos, use_cache ? &info->dst_cache : NULL); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); info->key.u.ipv4.src = saddr; #if IS_ENABLED(CONFIG_IPV6) } else if (ip_tunnel_info_af(info) == AF_INET6) { struct dst_entry *dst; struct geneve_sock *gs6 = rcu_dereference(geneve->sock6); struct in6_addr saddr; bool use_cache; u8 prio; if (!gs6) return -EIO; use_cache = ip_tunnel_dst_cache_usable(skb, info); prio = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0, &saddr, &info->key, sport, geneve->cfg.info.key.tp_dst, prio, use_cache ? &info->dst_cache : NULL); if (IS_ERR(dst)) return PTR_ERR(dst); dst_release(dst); info->key.u.ipv6.src = saddr; #endif } else { return -EINVAL; } info->key.tp_src = sport; info->key.tp_dst = geneve->cfg.info.key.tp_dst; return 0; } static const struct net_device_ops geneve_netdev_ops = { .ndo_init = geneve_init, .ndo_uninit = geneve_uninit, .ndo_open = geneve_open, .ndo_stop = geneve_stop, .ndo_start_xmit = geneve_xmit, .ndo_change_mtu = geneve_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_fill_metadata_dst = geneve_fill_metadata_dst, }; static void geneve_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->version, GENEVE_NETDEV_VER, sizeof(drvinfo->version)); strscpy(drvinfo->driver, "geneve", sizeof(drvinfo->driver)); } static const struct ethtool_ops geneve_ethtool_ops = { .get_drvinfo = geneve_get_drvinfo, .get_link = ethtool_op_get_link, }; /* Info for udev, that this is a virtual tunnel endpoint */ static const struct device_type geneve_type = { .name = "geneve", }; /* Calls the ndo_udp_tunnel_add of the caller in order to * supply the listening GENEVE udp ports. Callers are expected * to implement the ndo_udp_tunnel_add. */ static void geneve_offload_rx_ports(struct net_device *dev, bool push) { struct net *net = dev_net(dev); struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_sock *gs; rcu_read_lock(); list_for_each_entry_rcu(gs, &gn->sock_list, list) { if (push) { udp_tunnel_push_rx_port(dev, gs->sock, UDP_TUNNEL_TYPE_GENEVE); } else { udp_tunnel_drop_rx_port(dev, gs->sock, UDP_TUNNEL_TYPE_GENEVE); } } rcu_read_unlock(); } /* Initialize the device structure. */ static void geneve_setup(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &geneve_netdev_ops; dev->ethtool_ops = &geneve_ethtool_ops; dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &geneve_type); dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; /* MTU range: 68 - (something less than 65535) */ dev->min_mtu = ETH_MIN_MTU; /* The max_mtu calculation does not take account of GENEVE * options, to avoid excluding potentially valid * configurations. This will be further reduced by IPvX hdr size. */ dev->max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - dev->hard_header_len; netif_keep_dst(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; dev->lltx = true; eth_hw_addr_random(dev); } static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_UNSPEC] = { .strict_start_type = IFLA_GENEVE_INNER_PROTO_INHERIT }, [IFLA_GENEVE_ID] = { .type = NLA_U32 }, [IFLA_GENEVE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_GENEVE_REMOTE6] = { .len = sizeof(struct in6_addr) }, [IFLA_GENEVE_TTL] = { .type = NLA_U8 }, [IFLA_GENEVE_TOS] = { .type = NLA_U8 }, [IFLA_GENEVE_LABEL] = { .type = NLA_U32 }, [IFLA_GENEVE_PORT] = { .type = NLA_U16 }, [IFLA_GENEVE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GENEVE_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, [IFLA_GENEVE_TTL_INHERIT] = { .type = NLA_U8 }, [IFLA_GENEVE_DF] = { .type = NLA_U8 }, [IFLA_GENEVE_INNER_PROTO_INHERIT] = { .type = NLA_FLAG }, }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided link layer address is not Ethernet"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided Ethernet address is not unicast"); return -EADDRNOTAVAIL; } } if (!data) { NL_SET_ERR_MSG(extack, "Not enough attributes provided to perform the operation"); return -EINVAL; } if (data[IFLA_GENEVE_ID]) { __u32 vni = nla_get_u32(data[IFLA_GENEVE_ID]); if (vni >= GENEVE_N_VID) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_ID], "Geneve ID must be lower than 16777216"); return -ERANGE; } } if (data[IFLA_GENEVE_DF]) { enum ifla_geneve_df df = nla_get_u8(data[IFLA_GENEVE_DF]); if (df < 0 || df > GENEVE_DF_MAX) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_DF], "Invalid DF attribute"); return -EINVAL; } } return 0; } static struct geneve_dev *geneve_find_dev(struct geneve_net *gn, const struct ip_tunnel_info *info, bool *tun_on_same_port, bool *tun_collect_md) { struct geneve_dev *geneve, *t = NULL; *tun_on_same_port = false; *tun_collect_md = false; list_for_each_entry(geneve, &gn->geneve_list, next) { if (info->key.tp_dst == geneve->cfg.info.key.tp_dst) { *tun_collect_md = geneve->cfg.collect_md; *tun_on_same_port = true; } if (info->key.tun_id == geneve->cfg.info.key.tun_id && info->key.tp_dst == geneve->cfg.info.key.tp_dst && !memcmp(&info->key.u, &geneve->cfg.info.key.u, sizeof(info->key.u))) t = geneve; } return t; } static bool is_tnl_info_zero(const struct ip_tunnel_info *info) { return !(info->key.tun_id || info->key.tos || !ip_tunnel_flags_empty(info->key.tun_flags) || info->key.ttl || info->key.label || info->key.tp_src || memchr_inv(&info->key.u, 0, sizeof(info->key.u))); } static bool geneve_dst_addr_equal(struct ip_tunnel_info *a, struct ip_tunnel_info *b) { if (ip_tunnel_info_af(a) == AF_INET) return a->key.u.ipv4.dst == b->key.u.ipv4.dst; else return ipv6_addr_equal(&a->key.u.ipv6.dst, &b->key.u.ipv6.dst); } static int geneve_configure(struct net *net, struct net_device *dev, struct netlink_ext_ack *extack, const struct geneve_config *cfg) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *t, *geneve = netdev_priv(dev); const struct ip_tunnel_info *info = &cfg->info; bool tun_collect_md, tun_on_same_port; int err, encap_len; if (cfg->collect_md && !is_tnl_info_zero(info)) { NL_SET_ERR_MSG(extack, "Device is externally controlled, so attributes (VNI, Port, and so on) must not be specified"); return -EINVAL; } geneve->net = net; geneve->dev = dev; t = geneve_find_dev(gn, info, &tun_on_same_port, &tun_collect_md); if (t) return -EBUSY; /* make enough headroom for basic scenario */ encap_len = GENEVE_BASE_HLEN + ETH_HLEN; if (!cfg->collect_md && ip_tunnel_info_af(info) == AF_INET) { encap_len += sizeof(struct iphdr); dev->max_mtu -= sizeof(struct iphdr); } else { encap_len += sizeof(struct ipv6hdr); dev->max_mtu -= sizeof(struct ipv6hdr); } dev->needed_headroom = encap_len + ETH_HLEN; if (cfg->collect_md) { if (tun_on_same_port) { NL_SET_ERR_MSG(extack, "There can be only one externally controlled device on a destination port"); return -EPERM; } } else { if (tun_collect_md) { NL_SET_ERR_MSG(extack, "There already exists an externally controlled device on this destination port"); return -EPERM; } } dst_cache_reset(&geneve->cfg.info.dst_cache); memcpy(&geneve->cfg, cfg, sizeof(*cfg)); if (geneve->cfg.inner_proto_inherit) { dev->header_ops = NULL; dev->type = ARPHRD_NONE; dev->hard_header_len = 0; dev->addr_len = 0; dev->flags = IFF_POINTOPOINT | IFF_NOARP; } err = register_netdevice(dev); if (err) return err; list_add(&geneve->next, &gn->geneve_list); return 0; } static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port) { memset(info, 0, sizeof(*info)); info->key.tp_dst = htons(dst_port); } static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack, struct geneve_config *cfg, bool changelink) { struct ip_tunnel_info *info = &cfg->info; int attrtype; if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6]) { NL_SET_ERR_MSG(extack, "Cannot specify both IPv4 and IPv6 Remote addresses"); return -EINVAL; } if (data[IFLA_GENEVE_REMOTE]) { if (changelink && (ip_tunnel_info_af(info) == AF_INET6)) { attrtype = IFLA_GENEVE_REMOTE; goto change_notsup; } info->key.u.ipv4.dst = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); if (ipv4_is_multicast(info->key.u.ipv4.dst)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE], "Remote IPv4 address cannot be Multicast"); return -EINVAL; } } if (data[IFLA_GENEVE_REMOTE6]) { #if IS_ENABLED(CONFIG_IPV6) if (changelink && (ip_tunnel_info_af(info) == AF_INET)) { attrtype = IFLA_GENEVE_REMOTE6; goto change_notsup; } info->mode = IP_TUNNEL_INFO_IPV6; info->key.u.ipv6.dst = nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]); if (ipv6_addr_type(&info->key.u.ipv6.dst) & IPV6_ADDR_LINKLOCAL) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6], "Remote IPv6 address cannot be link-local"); return -EINVAL; } if (ipv6_addr_is_multicast(&info->key.u.ipv6.dst)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6], "Remote IPv6 address cannot be Multicast"); return -EINVAL; } __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); cfg->use_udp6_rx_checksums = true; #else NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; #endif } if (data[IFLA_GENEVE_ID]) { __u32 vni; __u8 tvni[3]; __be64 tunid; vni = nla_get_u32(data[IFLA_GENEVE_ID]); tvni[0] = (vni & 0x00ff0000) >> 16; tvni[1] = (vni & 0x0000ff00) >> 8; tvni[2] = vni & 0x000000ff; tunid = vni_to_tunnel_id(tvni); if (changelink && (tunid != info->key.tun_id)) { attrtype = IFLA_GENEVE_ID; goto change_notsup; } info->key.tun_id = tunid; } if (data[IFLA_GENEVE_TTL_INHERIT]) { if (nla_get_u8(data[IFLA_GENEVE_TTL_INHERIT])) cfg->ttl_inherit = true; else cfg->ttl_inherit = false; } else if (data[IFLA_GENEVE_TTL]) { info->key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]); cfg->ttl_inherit = false; } if (data[IFLA_GENEVE_TOS]) info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]); if (data[IFLA_GENEVE_DF]) cfg->df = nla_get_u8(data[IFLA_GENEVE_DF]); if (data[IFLA_GENEVE_LABEL]) { info->key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) & IPV6_FLOWLABEL_MASK; if (info->key.label && (!(info->mode & IP_TUNNEL_INFO_IPV6))) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_LABEL], "Label attribute only applies for IPv6 Geneve devices"); return -EINVAL; } } if (data[IFLA_GENEVE_PORT]) { if (changelink) { attrtype = IFLA_GENEVE_PORT; goto change_notsup; } info->key.tp_dst = nla_get_be16(data[IFLA_GENEVE_PORT]); } if (data[IFLA_GENEVE_COLLECT_METADATA]) { if (changelink) { attrtype = IFLA_GENEVE_COLLECT_METADATA; goto change_notsup; } cfg->collect_md = true; } if (data[IFLA_GENEVE_UDP_CSUM]) { if (changelink) { attrtype = IFLA_GENEVE_UDP_CSUM; goto change_notsup; } if (nla_get_u8(data[IFLA_GENEVE_UDP_CSUM])) __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); } if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]) { #if IS_ENABLED(CONFIG_IPV6) if (changelink) { attrtype = IFLA_GENEVE_UDP_ZERO_CSUM6_TX; goto change_notsup; } if (nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX])) __clear_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); #else NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; #endif } if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]) { #if IS_ENABLED(CONFIG_IPV6) if (changelink) { attrtype = IFLA_GENEVE_UDP_ZERO_CSUM6_RX; goto change_notsup; } if (nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX])) cfg->use_udp6_rx_checksums = false; #else NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; #endif } if (data[IFLA_GENEVE_INNER_PROTO_INHERIT]) { if (changelink) { attrtype = IFLA_GENEVE_INNER_PROTO_INHERIT; goto change_notsup; } cfg->inner_proto_inherit = true; } return 0; change_notsup: NL_SET_ERR_MSG_ATTR(extack, data[attrtype], "Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, and UDP checksum attributes are not supported"); return -EOPNOTSUPP; } static void geneve_link_config(struct net_device *dev, struct ip_tunnel_info *info, struct nlattr *tb[]) { struct geneve_dev *geneve = netdev_priv(dev); int ldev_mtu = 0; if (tb[IFLA_MTU]) { geneve_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return; } switch (ip_tunnel_info_af(info)) { case AF_INET: { struct flowi4 fl4 = { .daddr = info->key.u.ipv4.dst }; struct rtable *rt = ip_route_output_key(geneve->net, &fl4); if (!IS_ERR(rt) && rt->dst.dev) { ldev_mtu = rt->dst.dev->mtu - GENEVE_IPV4_HLEN; ip_rt_put(rt); } break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct rt6_info *rt; if (!__in6_dev_get(dev)) break; rt = rt6_lookup(geneve->net, &info->key.u.ipv6.dst, NULL, 0, NULL, 0); if (rt && rt->dst.dev) ldev_mtu = rt->dst.dev->mtu - GENEVE_IPV6_HLEN; ip6_rt_put(rt); break; } #endif } if (ldev_mtu <= 0) return; geneve_change_mtu(dev, ldev_mtu - info->options_len); } static int geneve_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct geneve_config cfg = { .df = GENEVE_DF_UNSET, .use_udp6_rx_checksums = false, .ttl_inherit = false, .collect_md = false, }; int err; init_tnl_info(&cfg.info, GENEVE_UDP_PORT); err = geneve_nl2info(tb, data, extack, &cfg, false); if (err) return err; err = geneve_configure(net, dev, extack, &cfg); if (err) return err; geneve_link_config(dev, &cfg.info, tb); return 0; } /* Quiesces the geneve device data path for both TX and RX. * * On transmit geneve checks for non-NULL geneve_sock before it proceeds. * So, if we set that socket to NULL under RCU and wait for synchronize_net() * to complete for the existing set of in-flight packets to be transmitted, * then we would have quiesced the transmit data path. All the future packets * will get dropped until we unquiesce the data path. * * On receive geneve dereference the geneve_sock stashed in the socket. So, * if we set that to NULL under RCU and wait for synchronize_net() to * complete, then we would have quiesced the receive data path. */ static void geneve_quiesce(struct geneve_dev *geneve, struct geneve_sock **gs4, struct geneve_sock **gs6) { *gs4 = rtnl_dereference(geneve->sock4); rcu_assign_pointer(geneve->sock4, NULL); if (*gs4) rcu_assign_sk_user_data((*gs4)->sock->sk, NULL); #if IS_ENABLED(CONFIG_IPV6) *gs6 = rtnl_dereference(geneve->sock6); rcu_assign_pointer(geneve->sock6, NULL); if (*gs6) rcu_assign_sk_user_data((*gs6)->sock->sk, NULL); #else *gs6 = NULL; #endif synchronize_net(); } /* Resumes the geneve device data path for both TX and RX. */ static void geneve_unquiesce(struct geneve_dev *geneve, struct geneve_sock *gs4, struct geneve_sock __maybe_unused *gs6) { rcu_assign_pointer(geneve->sock4, gs4); if (gs4) rcu_assign_sk_user_data(gs4->sock->sk, gs4); #if IS_ENABLED(CONFIG_IPV6) rcu_assign_pointer(geneve->sock6, gs6); if (gs6) rcu_assign_sk_user_data(gs6->sock->sk, gs6); #endif synchronize_net(); } static int geneve_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct geneve_dev *geneve = netdev_priv(dev); struct geneve_sock *gs4, *gs6; struct geneve_config cfg; int err; /* If the geneve device is configured for metadata (or externally * controlled, for example, OVS), then nothing can be changed. */ if (geneve->cfg.collect_md) return -EOPNOTSUPP; /* Start with the existing info. */ memcpy(&cfg, &geneve->cfg, sizeof(cfg)); err = geneve_nl2info(tb, data, extack, &cfg, true); if (err) return err; if (!geneve_dst_addr_equal(&geneve->cfg.info, &cfg.info)) { dst_cache_reset(&cfg.info.dst_cache); geneve_link_config(dev, &cfg.info, tb); } geneve_quiesce(geneve, &gs4, &gs6); memcpy(&geneve->cfg, &cfg, sizeof(cfg)); geneve_unquiesce(geneve, gs4, gs6); return 0; } static void geneve_dellink(struct net_device *dev, struct list_head *head) { struct geneve_dev *geneve = netdev_priv(dev); list_del(&geneve->next); unregister_netdevice_queue(dev, head); } static size_t geneve_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_GENEVE_ID */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_DF */ nla_total_size(sizeof(__be32)) + /* IFLA_GENEVE_LABEL */ nla_total_size(sizeof(__be16)) + /* IFLA_GENEVE_PORT */ nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */ nla_total_size(0) + /* IFLA_GENEVE_INNER_PROTO_INHERIT */ 0; } static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); struct ip_tunnel_info *info = &geneve->cfg.info; bool ttl_inherit = geneve->cfg.ttl_inherit; bool metadata = geneve->cfg.collect_md; __u8 tmp_vni[3]; __u32 vni; tunnel_id_to_vni(info->key.tun_id, tmp_vni); vni = (tmp_vni[0] << 16) | (tmp_vni[1] << 8) | tmp_vni[2]; if (nla_put_u32(skb, IFLA_GENEVE_ID, vni)) goto nla_put_failure; if (!metadata && ip_tunnel_info_af(info) == AF_INET) { if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE, info->key.u.ipv4.dst)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM, test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) } else if (!metadata) { if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6, &info->key.u.ipv6.dst)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))) goto nla_put_failure; #endif } if (nla_put_u8(skb, IFLA_GENEVE_TTL, info->key.ttl) || nla_put_u8(skb, IFLA_GENEVE_TOS, info->key.tos) || nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GENEVE_DF, geneve->cfg.df)) goto nla_put_failure; if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst)) goto nla_put_failure; if (metadata && nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, !geneve->cfg.use_udp6_rx_checksums)) goto nla_put_failure; #endif if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) goto nla_put_failure; if (geneve->cfg.inner_proto_inherit && nla_put_flag(skb, IFLA_GENEVE_INNER_PROTO_INHERIT)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops geneve_link_ops __read_mostly = { .kind = "geneve", .maxtype = IFLA_GENEVE_MAX, .policy = geneve_policy, .priv_size = sizeof(struct geneve_dev), .setup = geneve_setup, .validate = geneve_validate, .newlink = geneve_newlink, .changelink = geneve_changelink, .dellink = geneve_dellink, .get_size = geneve_get_size, .fill_info = geneve_fill_info, }; struct net_device *geneve_dev_create_fb(struct net *net, const char *name, u8 name_assign_type, u16 dst_port) { struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; LIST_HEAD(list_kill); int err; struct geneve_config cfg = { .df = GENEVE_DF_UNSET, .use_udp6_rx_checksums = true, .ttl_inherit = false, .collect_md = true, }; memset(tb, 0, sizeof(tb)); dev = rtnl_create_link(net, name, name_assign_type, &geneve_link_ops, tb, NULL); if (IS_ERR(dev)) return dev; init_tnl_info(&cfg.info, dst_port); err = geneve_configure(net, dev, NULL, &cfg); if (err) { free_netdev(dev); return ERR_PTR(err); } /* openvswitch users expect packet sizes to be unrestricted, * so set the largest MTU we can. */ err = geneve_change_mtu(dev, IP_MAX_MTU); if (err) goto err; err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto err; return dev; err: geneve_dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(geneve_dev_create_fb); static int geneve_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (event == NETDEV_UDP_TUNNEL_PUSH_INFO) geneve_offload_rx_ports(dev, true); else if (event == NETDEV_UDP_TUNNEL_DROP_INFO) geneve_offload_rx_ports(dev, false); return NOTIFY_DONE; } static struct notifier_block geneve_notifier_block __read_mostly = { .notifier_call = geneve_netdevice_event, }; static __net_init int geneve_init_net(struct net *net) { struct geneve_net *gn = net_generic(net, geneve_net_id); INIT_LIST_HEAD(&gn->geneve_list); INIT_LIST_HEAD(&gn->sock_list); return 0; } static void geneve_destroy_tunnels(struct net *net, struct list_head *head) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *geneve, *next; struct net_device *dev, *aux; /* gather any geneve devices that were moved into this ns */ for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &geneve_link_ops) unregister_netdevice_queue(dev, head); /* now gather any other geneve devices that were created in this ns */ list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) { /* If geneve->dev is in the same netns, it was already added * to the list by the previous loop. */ if (!net_eq(dev_net(geneve->dev), net)) unregister_netdevice_queue(geneve->dev, head); } } static void __net_exit geneve_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net *net; list_for_each_entry(net, net_list, exit_list) geneve_destroy_tunnels(net, dev_to_kill); } static void __net_exit geneve_exit_net(struct net *net) { const struct geneve_net *gn = net_generic(net, geneve_net_id); WARN_ON_ONCE(!list_empty(&gn->sock_list)); } static struct pernet_operations geneve_net_ops = { .init = geneve_init_net, .exit_batch_rtnl = geneve_exit_batch_rtnl, .exit = geneve_exit_net, .id = &geneve_net_id, .size = sizeof(struct geneve_net), }; static int __init geneve_init_module(void) { int rc; rc = register_pernet_subsys(&geneve_net_ops); if (rc) goto out1; rc = register_netdevice_notifier(&geneve_notifier_block); if (rc) goto out2; rc = rtnl_link_register(&geneve_link_ops); if (rc) goto out3; return 0; out3: unregister_netdevice_notifier(&geneve_notifier_block); out2: unregister_pernet_subsys(&geneve_net_ops); out1: return rc; } late_initcall(geneve_init_module); static void __exit geneve_cleanup_module(void) { rtnl_link_unregister(&geneve_link_ops); unregister_netdevice_notifier(&geneve_notifier_block); unregister_pernet_subsys(&geneve_net_ops); } module_exit(geneve_cleanup_module); MODULE_LICENSE("GPL"); MODULE_VERSION(GENEVE_NETDEV_VER); MODULE_AUTHOR("John W. Linville <linville@tuxdriver.com>"); MODULE_DESCRIPTION("Interface driver for GENEVE encapsulated traffic"); MODULE_ALIAS_RTNL_LINK("geneve"); |
| 2 1 1 1 1 1 1 2 2 3 2 6 1 5 3 1 2 1 1 1 3 3 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 | // SPDX-License-Identifier: GPL-2.0-only /* * VFIO-KVM bridge pseudo device * * Copyright (C) 2013 Red Hat, Inc. All rights reserved. * Author: Alex Williamson <alex.williamson@redhat.com> */ #include <linux/errno.h> #include <linux/file.h> #include <linux/kvm_host.h> #include <linux/list.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/vfio.h> #include "vfio.h" #ifdef CONFIG_SPAPR_TCE_IOMMU #include <asm/kvm_ppc.h> #endif struct kvm_vfio_file { struct list_head node; struct file *file; #ifdef CONFIG_SPAPR_TCE_IOMMU struct iommu_group *iommu_group; #endif }; struct kvm_vfio { struct list_head file_list; struct mutex lock; bool noncoherent; }; static void kvm_vfio_file_set_kvm(struct file *file, struct kvm *kvm) { void (*fn)(struct file *file, struct kvm *kvm); fn = symbol_get(vfio_file_set_kvm); if (!fn) return; fn(file, kvm); symbol_put(vfio_file_set_kvm); } static bool kvm_vfio_file_enforced_coherent(struct file *file) { bool (*fn)(struct file *file); bool ret; fn = symbol_get(vfio_file_enforced_coherent); if (!fn) return false; ret = fn(file); symbol_put(vfio_file_enforced_coherent); return ret; } static bool kvm_vfio_file_is_valid(struct file *file) { bool (*fn)(struct file *file); bool ret; fn = symbol_get(vfio_file_is_valid); if (!fn) return false; ret = fn(file); symbol_put(vfio_file_is_valid); return ret; } #ifdef CONFIG_SPAPR_TCE_IOMMU static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) { struct iommu_group *(*fn)(struct file *file); struct iommu_group *ret; fn = symbol_get(vfio_file_iommu_group); if (!fn) return NULL; ret = fn(file); symbol_put(vfio_file_iommu_group); return ret; } static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, struct kvm_vfio_file *kvf) { if (WARN_ON_ONCE(!kvf->iommu_group)) return; kvm_spapr_tce_release_iommu_group(kvm, kvf->iommu_group); iommu_group_put(kvf->iommu_group); kvf->iommu_group = NULL; } #endif /* * Groups/devices can use the same or different IOMMU domains. If the same * then adding a new group/device may change the coherency of groups/devices * we've previously been told about. We don't want to care about any of * that so we retest each group/device and bail as soon as we find one that's * noncoherent. This means we only ever [un]register_noncoherent_dma once * for the whole device. */ static void kvm_vfio_update_coherency(struct kvm_device *dev) { struct kvm_vfio *kv = dev->private; bool noncoherent = false; struct kvm_vfio_file *kvf; list_for_each_entry(kvf, &kv->file_list, node) { if (!kvm_vfio_file_enforced_coherent(kvf->file)) { noncoherent = true; break; } } if (noncoherent != kv->noncoherent) { kv->noncoherent = noncoherent; if (kv->noncoherent) kvm_arch_register_noncoherent_dma(dev->kvm); else kvm_arch_unregister_noncoherent_dma(dev->kvm); } } static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd) { struct kvm_vfio *kv = dev->private; struct kvm_vfio_file *kvf; struct file *filp; int ret = 0; filp = fget(fd); if (!filp) return -EBADF; /* Ensure the FD is a vfio FD. */ if (!kvm_vfio_file_is_valid(filp)) { ret = -EINVAL; goto out_fput; } mutex_lock(&kv->lock); list_for_each_entry(kvf, &kv->file_list, node) { if (kvf->file == filp) { ret = -EEXIST; goto out_unlock; } } kvf = kzalloc(sizeof(*kvf), GFP_KERNEL_ACCOUNT); if (!kvf) { ret = -ENOMEM; goto out_unlock; } kvf->file = get_file(filp); list_add_tail(&kvf->node, &kv->file_list); kvm_arch_start_assignment(dev->kvm); kvm_vfio_file_set_kvm(kvf->file, dev->kvm); kvm_vfio_update_coherency(dev); out_unlock: mutex_unlock(&kv->lock); out_fput: fput(filp); return ret; } static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd) { struct kvm_vfio *kv = dev->private; struct kvm_vfio_file *kvf; CLASS(fd, f)(fd); int ret; if (fd_empty(f)) return -EBADF; ret = -ENOENT; mutex_lock(&kv->lock); list_for_each_entry(kvf, &kv->file_list, node) { if (kvf->file != fd_file(f)) continue; list_del(&kvf->node); kvm_arch_end_assignment(dev->kvm); #ifdef CONFIG_SPAPR_TCE_IOMMU kvm_spapr_tce_release_vfio_group(dev->kvm, kvf); #endif kvm_vfio_file_set_kvm(kvf->file, NULL); fput(kvf->file); kfree(kvf); ret = 0; break; } kvm_vfio_update_coherency(dev); mutex_unlock(&kv->lock); return ret; } #ifdef CONFIG_SPAPR_TCE_IOMMU static int kvm_vfio_file_set_spapr_tce(struct kvm_device *dev, void __user *arg) { struct kvm_vfio_spapr_tce param; struct kvm_vfio *kv = dev->private; struct kvm_vfio_file *kvf; int ret; if (copy_from_user(¶m, arg, sizeof(struct kvm_vfio_spapr_tce))) return -EFAULT; CLASS(fd, f)(param.groupfd); if (fd_empty(f)) return -EBADF; ret = -ENOENT; mutex_lock(&kv->lock); list_for_each_entry(kvf, &kv->file_list, node) { if (kvf->file != fd_file(f)) continue; if (!kvf->iommu_group) { kvf->iommu_group = kvm_vfio_file_iommu_group(kvf->file); if (WARN_ON_ONCE(!kvf->iommu_group)) { ret = -EIO; goto err_fdput; } } ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd, kvf->iommu_group); break; } err_fdput: mutex_unlock(&kv->lock); return ret; } #endif static int kvm_vfio_set_file(struct kvm_device *dev, long attr, void __user *arg) { int32_t __user *argp = arg; int32_t fd; switch (attr) { case KVM_DEV_VFIO_FILE_ADD: if (get_user(fd, argp)) return -EFAULT; return kvm_vfio_file_add(dev, fd); case KVM_DEV_VFIO_FILE_DEL: if (get_user(fd, argp)) return -EFAULT; return kvm_vfio_file_del(dev, fd); #ifdef CONFIG_SPAPR_TCE_IOMMU case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: return kvm_vfio_file_set_spapr_tce(dev, arg); #endif } return -ENXIO; } static int kvm_vfio_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_VFIO_FILE: return kvm_vfio_set_file(dev, attr->attr, u64_to_user_ptr(attr->addr)); } return -ENXIO; } static int kvm_vfio_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_VFIO_FILE: switch (attr->attr) { case KVM_DEV_VFIO_FILE_ADD: case KVM_DEV_VFIO_FILE_DEL: #ifdef CONFIG_SPAPR_TCE_IOMMU case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: #endif return 0; } break; } return -ENXIO; } static void kvm_vfio_release(struct kvm_device *dev) { struct kvm_vfio *kv = dev->private; struct kvm_vfio_file *kvf, *tmp; list_for_each_entry_safe(kvf, tmp, &kv->file_list, node) { #ifdef CONFIG_SPAPR_TCE_IOMMU kvm_spapr_tce_release_vfio_group(dev->kvm, kvf); #endif kvm_vfio_file_set_kvm(kvf->file, NULL); fput(kvf->file); list_del(&kvf->node); kfree(kvf); kvm_arch_end_assignment(dev->kvm); } kvm_vfio_update_coherency(dev); kfree(kv); kfree(dev); /* alloc by kvm_ioctl_create_device, free by .release */ } static int kvm_vfio_create(struct kvm_device *dev, u32 type); static const struct kvm_device_ops kvm_vfio_ops = { .name = "kvm-vfio", .create = kvm_vfio_create, .release = kvm_vfio_release, .set_attr = kvm_vfio_set_attr, .has_attr = kvm_vfio_has_attr, }; static int kvm_vfio_create(struct kvm_device *dev, u32 type) { struct kvm_device *tmp; struct kvm_vfio *kv; lockdep_assert_held(&dev->kvm->lock); /* Only one VFIO "device" per VM */ list_for_each_entry(tmp, &dev->kvm->devices, vm_node) if (tmp->ops == &kvm_vfio_ops) return -EBUSY; kv = kzalloc(sizeof(*kv), GFP_KERNEL_ACCOUNT); if (!kv) return -ENOMEM; INIT_LIST_HEAD(&kv->file_list); mutex_init(&kv->lock); dev->private = kv; return 0; } int kvm_vfio_ops_init(void) { return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO); } void kvm_vfio_ops_exit(void) { kvm_unregister_device_ops(KVM_DEV_TYPE_VFIO); } |
| 282 282 1 1 282 251 35 4 282 281 282 282 281 1 1 215 15 229 1 228 264 264 229 228 229 264 264 264 264 253 13 262 264 264 264 31 13 13 31 31 13 253 253 253 251 251 251 251 251 251 251 251 251 251 251 4 4 4 4 4 4 4 282 281 5 5 4 4 1 1 1 1 1 1 1 1 3 3 3 268 13 1 12 3 4 3 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Routines having to do with the 'struct sk_buff' memory handlers. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Florian La Roche <rzsfl@rz.uni-sb.de> * * Fixes: * Alan Cox : Fixed the worst of the load * balancer bugs. * Dave Platt : Interrupt stacking fix. * Richard Kooijman : Timestamp fixes. * Alan Cox : Changed buffer format. * Alan Cox : destructor hook for AF_UNIX etc. * Linus Torvalds : Better skb_clone. * Alan Cox : Added skb_copy. * Alan Cox : Added all the changed routines Linus * only put in the headers * Ray VanTassle : Fixed --skb->lock in free * Alan Cox : skb_copy copy arp field * Andi Kleen : slabified it. * Robert Olsson : Removed skb_head_pool * * NOTE: * The __skb_ routines should be called with interrupts * disabled, or you better be *real* sure that the operation is atomic * with respect to whatever list is being frobbed (e.g. via lock_sock() * or via disabling bottom half handlers, etc). */ /* * The functions in this file will not compile correctly with gcc 2.4.x */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/slab.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/sctp.h> #include <linux/netdevice.h> #ifdef CONFIG_NET_CLS_ACT #include <net/pkt_sched.h> #endif #include <linux/string.h> #include <linux/skbuff.h> #include <linux/skbuff_ref.h> #include <linux/splice.h> #include <linux/cache.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/scatterlist.h> #include <linux/errqueue.h> #include <linux/prefetch.h> #include <linux/bitfield.h> #include <linux/if_vlan.h> #include <linux/mpls.h> #include <linux/kcov.h> #include <linux/iov_iter.h> #include <net/protocol.h> #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> #include <net/gso.h> #include <net/hotdata.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/mpls.h> #include <net/mptcp.h> #include <net/mctp.h> #include <net/page_pool/helpers.h> #include <net/dropreason.h> #include <linux/uaccess.h> #include <trace/events/skb.h> #include <linux/highmem.h> #include <linux/capability.h> #include <linux/user_namespace.h> #include <linux/indirect_call_wrapper.h> #include <linux/textsearch.h> #include "dev.h" #include "netmem_priv.h" #include "sock_destructor.h" #ifdef CONFIG_SKB_EXTENSIONS static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique * size, and we can differentiate heads from skb_small_head_cache * vs system slabs by looking at their size (skb_end_offset()). */ #define SKB_SMALL_HEAD_CACHE_SIZE \ (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \ (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \ SKB_SMALL_HEAD_SIZE) #define SKB_SMALL_HEAD_HEADROOM \ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) /* kcm_write_msgs() relies on casting paged frags to bio_vec to use * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the * netmem is a page. */ static_assert(offsetof(struct bio_vec, bv_page) == offsetof(skb_frag_t, netmem)); static_assert(sizeof_field(struct bio_vec, bv_page) == sizeof_field(skb_frag_t, netmem)); static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len)); static_assert(sizeof_field(struct bio_vec, bv_len) == sizeof_field(skb_frag_t, len)); static_assert(offsetof(struct bio_vec, bv_offset) == offsetof(skb_frag_t, offset)); static_assert(sizeof_field(struct bio_vec, bv_offset) == sizeof_field(skb_frag_t, offset)); #undef FN #define FN(reason) [SKB_DROP_REASON_##reason] = #reason, static const char * const drop_reasons[] = { [SKB_CONSUMED] = "CONSUMED", DEFINE_DROP_REASON(FN, FN) }; static const struct drop_reason_list drop_reasons_core = { .reasons = drop_reasons, .n_reasons = ARRAY_SIZE(drop_reasons), }; const struct drop_reason_list __rcu * drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = { [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core), }; EXPORT_SYMBOL(drop_reasons_by_subsys); /** * drop_reasons_register_subsys - register another drop reason subsystem * @subsys: the subsystem to register, must not be the core * @list: the list of drop reasons within the subsystem, must point to * a statically initialized list */ void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys, const struct drop_reason_list *list) { if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || subsys >= ARRAY_SIZE(drop_reasons_by_subsys), "invalid subsystem %d\n", subsys)) return; /* must point to statically allocated memory, so INIT is OK */ RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list); } EXPORT_SYMBOL_GPL(drop_reasons_register_subsys); /** * drop_reasons_unregister_subsys - unregister a drop reason subsystem * @subsys: the subsystem to remove, must not be the core * * Note: This will synchronize_rcu() to ensure no users when it returns. */ void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys) { if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || subsys >= ARRAY_SIZE(drop_reasons_by_subsys), "invalid subsystem %d\n", subsys)) return; RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL); synchronize_rcu(); } EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys); /** * skb_panic - private function for out-of-line support * @skb: buffer * @sz: size * @addr: address * @msg: skb_over_panic or skb_under_panic * * Out-of-line support for skb_put() and skb_push(). * Called via the wrapper skb_over_panic() or skb_under_panic(). * Keep out of line to prevent kernel bloat. * __builtin_return_address is not used because it is not always reliable. */ static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, const char msg[]) { pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", msg, addr, skb->len, sz, skb->head, skb->data, (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? skb->dev->name : "<NULL>"); BUG(); } static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) { skb_panic(skb, sz, addr, __func__); } static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) { skb_panic(skb, sz, addr, __func__); } #define NAPI_SKB_CACHE_SIZE 64 #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) #if PAGE_SIZE == SZ_4K #define NAPI_HAS_SMALL_PAGE_FRAG 1 #define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) /* specialized page frag allocator using a single order 0 page * and slicing it into 1K sized fragment. Constrained to systems * with a very limited amount of 1K fragments fitting a single * page - to avoid excessive truesize underestimation */ struct page_frag_1k { void *va; u16 offset; bool pfmemalloc; }; static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) { struct page *page; int offset; offset = nc->offset - SZ_1K; if (likely(offset >= 0)) goto use_frag; page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); if (!page) return NULL; nc->va = page_address(page); nc->pfmemalloc = page_is_pfmemalloc(page); offset = PAGE_SIZE - SZ_1K; page_ref_add(page, offset / SZ_1K); use_frag: nc->offset = offset; return nc->va + offset; } #else /* the small page is actually unused in this build; add dummy helpers * to please the compiler and avoid later preprocessor's conditionals */ #define NAPI_HAS_SMALL_PAGE_FRAG 0 #define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false struct page_frag_1k { }; static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) { return NULL; } #endif struct napi_alloc_cache { local_lock_t bh_lock; struct page_frag_cache page; struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; /* Double check that napi_get_frags() allocates skbs with * skb->head being backed by slab, not a page fragment. * This is to make sure bug fixed in 3226b158e67c * ("net: avoid 32 x truesize under-estimation for tiny skbs") * does not accidentally come back. */ void napi_get_frags_check(struct napi_struct *napi) { struct sk_buff *skb; local_bh_disable(); skb = napi_get_frags(napi); WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); napi_free_frags(napi); local_bh_enable(); } void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); void *data; fragsz = SKB_DATA_ALIGN(fragsz); local_lock_nested_bh(&napi_alloc_cache.bh_lock); data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC | __GFP_NOWARN, align_mask); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return data; } EXPORT_SYMBOL(__napi_alloc_frag_align); void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { void *data; if (in_hardirq() || irqs_disabled()) { struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); fragsz = SKB_DATA_ALIGN(fragsz); data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC | __GFP_NOWARN, align_mask); } else { local_bh_disable(); data = __napi_alloc_frag_align(fragsz, align_mask); local_bh_enable(); } return data; } EXPORT_SYMBOL(__netdev_alloc_frag_align); static struct sk_buff *napi_skb_cache_get(void) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; local_lock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!nc->skb_count)) { nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_NOWARN, NAPI_SKB_CACHE_BULK, nc->skb_cache); if (unlikely(!nc->skb_count)) { local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return NULL; } } skb = nc->skb_cache[--nc->skb_count]; local_unlock_nested_bh(&napi_alloc_cache.bh_lock); kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache)); return skb; } static inline void __finalize_skb_around(struct sk_buff *skb, void *data, unsigned int size) { struct skb_shared_info *shinfo; size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); /* Assumes caller memset cleared SKB */ skb->truesize = SKB_TRUESIZE(size); refcount_set(&skb->users, 1); skb->head = data; skb->data = data; skb_reset_tail_pointer(skb); skb_set_end_offset(skb, size); skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; skb->alloc_cpu = raw_smp_processor_id(); /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); skb_set_kcov_handle(skb, kcov_common_handle()); } static inline void *__slab_build_skb(struct sk_buff *skb, void *data, unsigned int *size) { void *resized; /* Must find the allocation size (and grow it to match). */ *size = ksize(data); /* krealloc() will immediately return "data" when * "ksize(data)" is requested: it is the existing upper * bounds. As a result, GFP_ATOMIC will be ignored. Note * that this "new" pointer needs to be passed back to the * caller for use so the __alloc_size hinting will be * tracked correctly. */ resized = krealloc(data, *size, GFP_ATOMIC); WARN_ON_ONCE(resized != data); return resized; } /* build_skb() variant which can operate on slab buffers. * Note that this should be used sparingly as slab buffers * cannot be combined efficiently by GRO! */ struct sk_buff *slab_build_skb(void *data) { struct sk_buff *skb; unsigned int size; skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); data = __slab_build_skb(skb, data, &size); __finalize_skb_around(skb, data, size); return skb; } EXPORT_SYMBOL(slab_build_skb); /* Caller must provide SKB that is memset cleared */ static void __build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size) { unsigned int size = frag_size; /* frag_size == 0 is considered deprecated now. Callers * using slab buffer should use slab_build_skb() instead. */ if (WARN_ONCE(size == 0, "Use slab_build_skb() instead")) data = __slab_build_skb(skb, data, &size); __finalize_skb_around(skb, data, size); } /** * __build_skb - build a network buffer * @data: data buffer provided by caller * @frag_size: size of data (must not be 0) * * Allocate a new &sk_buff. Caller provides space holding head and * skb_shared_info. @data must have been allocated from the page * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc() * allocation is deprecated, and callers should use slab_build_skb() * instead.) * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : * Before IO, driver allocates only data buffer where NIC put incoming frame * Driver should add room at head (NET_SKB_PAD) and * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) * After IO, driver calls build_skb(), to allocate sk_buff and populate it * before giving packet to stack. * RX rings only contains data buffers, not full skbs. */ struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); __build_skb_around(skb, data, frag_size); return skb; } /* build_skb() is wrapper over __build_skb(), that specifically * takes care of skb->head and skb->pfmemalloc */ struct sk_buff *build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb = __build_skb(data, frag_size); if (likely(skb && frag_size)) { skb->head_frag = 1; skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } EXPORT_SYMBOL(build_skb); /** * build_skb_around - build a network buffer around provided skb * @skb: sk_buff provide by caller, must be memset cleared * @data: data buffer provided by caller * @frag_size: size of data */ struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size) { if (unlikely(!skb)) return NULL; __build_skb_around(skb, data, frag_size); if (frag_size) { skb->head_frag = 1; skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } EXPORT_SYMBOL(build_skb_around); /** * __napi_build_skb - build a network buffer * @data: data buffer provided by caller * @frag_size: size of data * * Version of __build_skb() that uses NAPI percpu caches to obtain * skbuff_head instead of inplace allocation. * * Returns a new &sk_buff on success, %NULL on allocation failure. */ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; skb = napi_skb_cache_get(); if (unlikely(!skb)) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); __build_skb_around(skb, data, frag_size); return skb; } /** * napi_build_skb - build a network buffer * @data: data buffer provided by caller * @frag_size: size of data * * Version of __napi_build_skb() that takes care of skb->head_frag * and skb->pfmemalloc when the data is a page or page fragment. * * Returns a new &sk_buff on success, %NULL on allocation failure. */ struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb = __napi_build_skb(data, frag_size); if (likely(skb) && frag_size) { skb->head_frag = 1; skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } EXPORT_SYMBOL(napi_build_skb); /* * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells * the caller if emergency pfmemalloc reserves are being used. If it is and * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves * may be used. Otherwise, the packet data may be discarded until enough * memory is free */ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, bool *pfmemalloc) { bool ret_pfmemalloc = false; size_t obj_size; void *obj; obj_size = SKB_HEAD_ALIGN(*size); if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && !(flags & KMALLOC_NOT_NORMAL_BITS)) { obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); *size = SKB_SMALL_HEAD_CACHE_SIZE; if (obj || !(gfp_pfmemalloc_allowed(flags))) goto out; /* Try again but now we are using pfmemalloc reserves */ ret_pfmemalloc = true; obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node); goto out; } obj_size = kmalloc_size_roundup(obj_size); /* The following cast might truncate high-order bits of obj_size, this * is harmless because kmalloc(obj_size >= 2^32) will fail anyway. */ *size = (unsigned int)obj_size; /* * Try a regular allocation, when that fails and we're not entitled * to the reserves, fail. */ obj = kmalloc_node_track_caller(obj_size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); if (obj || !(gfp_pfmemalloc_allowed(flags))) goto out; /* Try again but now we are using pfmemalloc reserves */ ret_pfmemalloc = true; obj = kmalloc_node_track_caller(obj_size, flags, node); out: if (pfmemalloc) *pfmemalloc = ret_pfmemalloc; return obj; } /* Allocate a new skbuff. We do this ourselves so we can fill in a few * 'private' fields and also do memory statistics to find all the * [BEEP] leaks. * */ /** * __alloc_skb - allocate a network buffer * @size: size to allocate * @gfp_mask: allocation mask * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache * instead of head cache and allocate a cloned (child) skb. * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for * allocations in case the data is required for writeback * @node: numa node to allocate memory on * * Allocate a new &sk_buff. The returned buffer has no headroom and a * tail room of at least size bytes. The object has a reference count * of one. The return is the buffer. On a failure the return is %NULL. * * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC. */ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int flags, int node) { struct kmem_cache *cache; struct sk_buff *skb; bool pfmemalloc; u8 *data; cache = (flags & SKB_ALLOC_FCLONE) ? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache; if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) gfp_mask |= __GFP_MEMALLOC; /* Get the HEAD */ if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && likely(node == NUMA_NO_NODE || node == numa_mem_id())) skb = napi_skb_cache_get(); else skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); if (unlikely(!skb)) return NULL; prefetchw(skb); /* We do our best to align skb_shared_info on a separate cache * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives * aligned memory blocks, unless SLUB/SLAB debug is enabled. * Both skb->head and skb_shared_info are cache line aligned. */ data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); if (unlikely(!data)) goto nodata; /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ prefetchw(data + SKB_WITH_OVERHEAD(size)); /* * Only clear those fields we need to clear, not those that we will * actually initialise below. Hence, don't put any more fields after * the tail pointer in struct sk_buff! */ memset(skb, 0, offsetof(struct sk_buff, tail)); __build_skb_around(skb, data, size); skb->pfmemalloc = pfmemalloc; if (flags & SKB_ALLOC_FCLONE) { struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); skb->fclone = SKB_FCLONE_ORIG; refcount_set(&fclones->fclone_ref, 1); } return skb; nodata: kmem_cache_free(cache, skb); return NULL; } EXPORT_SYMBOL(__alloc_skb); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on * @len: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has NET_SKB_PAD headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory. */ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, gfp_t gfp_mask) { struct page_frag_cache *nc; struct sk_buff *skb; bool pfmemalloc; void *data; len += NET_SKB_PAD; /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. */ if (len <= SKB_WITH_OVERHEAD(1024) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) goto skb_fail; goto skb_success; } len = SKB_HEAD_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; if (in_hardirq() || irqs_disabled()) { nc = this_cpu_ptr(&netdev_alloc_cache); data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = page_frag_cache_is_pfmemalloc(nc); } else { local_bh_disable(); local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache.page); data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = page_frag_cache_is_pfmemalloc(nc); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); local_bh_enable(); } if (unlikely(!data)) return NULL; skb = __build_skb(data, len); if (unlikely(!skb)) { skb_free_frag(data); return NULL; } if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; skb_success: skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; skb_fail: return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); /** * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for * @len: length to allocate * * Allocate a new sk_buff for use in NAPI receive. This buffer will * attempt to allocate the head from a special reserved region used * only for NAPI Rx allocation. By doing this we can save several * CPU cycles by avoiding having to disable and re-enable IRQs. * * %NULL is returned if there is no free memory. */ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) { gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; struct napi_alloc_cache *nc; struct sk_buff *skb; bool pfmemalloc; void *data; DEBUG_NET_WARN_ON_ONCE(!in_softirq()); len += NET_SKB_PAD + NET_IP_ALIGN; /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. * When the small frag allocator is available, prefer it over kmalloc * for small fragments */ if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, NUMA_NO_NODE); if (!skb) goto skb_fail; goto skb_success; } if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { /* we are artificially inflating the allocation size, but * that is not as bad as it may look like, as: * - 'len' less than GRO_MAX_HEAD makes little sense * - On most systems, larger 'len' values lead to fragment * size above 512 bytes * - kmalloc would use the kmalloc-1k slab for such values * - Builds with smaller GRO_MAX_HEAD will very likely do * little networking, as that implies no WiFi and no * tunnels support, and 32 bits arches. */ len = SZ_1K; data = page_frag_alloc_1k(&nc->page_small, gfp_mask); pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); } else { len = SKB_HEAD_ALIGN(len); data = page_frag_alloc(&nc->page, len, gfp_mask); pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); } local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) return NULL; skb = __napi_build_skb(data, len); if (unlikely(!skb)) { skb_free_frag(data); return NULL; } if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; skb_success: skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); skb->dev = napi->dev; skb_fail: return skb; } EXPORT_SYMBOL(napi_alloc_skb); void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size, unsigned int truesize) { DEBUG_NET_WARN_ON_ONCE(size > truesize); skb_fill_netmem_desc(skb, i, netmem, off, size); skb->len += size; skb->data_len += size; skb->truesize += truesize; } EXPORT_SYMBOL(skb_add_rx_frag_netmem); void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; DEBUG_NET_WARN_ON_ONCE(size > truesize); skb_frag_size_add(frag, size); skb->len += size; skb->data_len += size; skb->truesize += truesize; } EXPORT_SYMBOL(skb_coalesce_rx_frag); static void skb_drop_list(struct sk_buff **listp) { kfree_skb_list(*listp); *listp = NULL; } static inline void skb_drop_fraglist(struct sk_buff *skb) { skb_drop_list(&skb_shinfo(skb)->frag_list); } static void skb_clone_fraglist(struct sk_buff *skb) { struct sk_buff *list; skb_walk_frags(skb, list) skb_get(list); } static bool is_pp_netmem(netmem_ref netmem) { return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE; } int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom) { #if IS_ENABLED(CONFIG_PAGE_POOL) u32 size, truesize, len, max_head_size, off; struct sk_buff *skb = *pskb, *nskb; int err, i, head_off; void *data; /* XDP does not support fraglist so we need to linearize * the skb. */ if (skb_has_frag_list(skb)) return -EOPNOTSUPP; max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom); if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE) return -ENOMEM; size = min_t(u32, skb->len, max_head_size); truesize = SKB_HEAD_ALIGN(size) + headroom; data = page_pool_dev_alloc_va(pool, &truesize); if (!data) return -ENOMEM; nskb = napi_build_skb(data, truesize); if (!nskb) { page_pool_free_va(pool, data, true); return -ENOMEM; } skb_reserve(nskb, headroom); skb_copy_header(nskb, skb); skb_mark_for_recycle(nskb); err = skb_copy_bits(skb, 0, nskb->data, size); if (err) { consume_skb(nskb); return err; } skb_put(nskb, size); head_off = skb_headroom(nskb) - skb_headroom(skb); skb_headers_offset_update(nskb, head_off); off = size; len = skb->len - off; for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { struct page *page; u32 page_off; size = min_t(u32, len, PAGE_SIZE); truesize = size; page = page_pool_dev_alloc(pool, &page_off, &truesize); if (!page) { consume_skb(nskb); return -ENOMEM; } skb_add_rx_frag(nskb, i, page, page_off, size, truesize); err = skb_copy_bits(skb, off, page_address(page) + page_off, size); if (err) { consume_skb(nskb); return err; } len -= size; off += size; } consume_skb(skb); *pskb = nskb; return 0; #else return -EOPNOTSUPP; #endif } EXPORT_SYMBOL(skb_pp_cow_data); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, const struct bpf_prog *prog) { if (!prog->aux->xdp_has_frags) return -EINVAL; return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM); } EXPORT_SYMBOL(skb_cow_data_for_xdp); #if IS_ENABLED(CONFIG_PAGE_POOL) bool napi_pp_put_page(netmem_ref netmem) { netmem = netmem_compound_head(netmem); /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation * in order to preserve any existing bits, such as bit 0 for the * head page of compound page and bit 1 for pfmemalloc page, so * mask those bits for freeing side when doing below checking, * and page_is_pfmemalloc() is checked in __page_pool_put_page() * to avoid recycling the pfmemalloc page. */ if (unlikely(!is_pp_netmem(netmem))) return false; page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); return true; } EXPORT_SYMBOL(napi_pp_put_page); #endif static bool skb_pp_recycle(struct sk_buff *skb, void *data) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; return napi_pp_put_page(page_to_netmem(virt_to_page(data))); } /** * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb * @skb: page pool aware skb * * Increase the fragment reference count (pp_ref_count) of a skb. This is * intended to gain fragment references only for page pool aware skbs, * i.e. when skb->pp_recycle is true, and not for fragments in a * non-pp-recycling skb. It has a fallback to increase references on normal * pages, as page pool aware skbs may also have normal page fragments. */ static int skb_pp_frag_ref(struct sk_buff *skb) { struct skb_shared_info *shinfo; netmem_ref head_netmem; int i; if (!skb->pp_recycle) return -EINVAL; shinfo = skb_shinfo(skb); for (i = 0; i < shinfo->nr_frags; i++) { head_netmem = netmem_compound_head(shinfo->frags[i].netmem); if (likely(is_pp_netmem(head_netmem))) page_pool_ref_netmem(head_netmem); else page_ref_inc(netmem_to_page(head_netmem)); } return 0; } static void skb_kfree_head(void *head, unsigned int end_offset) { if (end_offset == SKB_SMALL_HEAD_HEADROOM) kmem_cache_free(net_hotdata.skb_small_head_cache, head); else kfree(head); } static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; if (skb->head_frag) { if (skb_pp_recycle(skb, head)) return; skb_free_frag(head); } else { skb_kfree_head(head, skb_end_offset(skb)); } } static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; if (!skb_data_unref(skb, shinfo)) goto exit; if (skb_zcopy(skb)) { bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; skb_zcopy_clear(skb, true); if (skip_unref) goto free_head; } for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); free_head: if (shinfo->frag_list) kfree_skb_list_reason(shinfo->frag_list, reason); skb_free_head(skb); exit: /* When we clone an SKB we copy the reycling bit. The pp_recycle * bit is only set on the head though, so in order to avoid races * while trying to recycle fragments on __skb_frag_unref() we need * to make one SKB responsible for triggering the recycle path. * So disable the recycling bit if an SKB is cloned and we have * additional references to the fragmented part of the SKB. * Eventually the last SKB will have the recycling bit set and it's * dataref set to 0, which will trigger the recycling */ skb->pp_recycle = 0; } /* * Free an skbuff by memory without cleaning the state. */ static void kfree_skbmem(struct sk_buff *skb) { struct sk_buff_fclones *fclones; switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(net_hotdata.skbuff_cache, skb); return; case SKB_FCLONE_ORIG: fclones = container_of(skb, struct sk_buff_fclones, skb1); /* We usually free the clone (TX completion) before original skb * This test would have no chance to be true for the clone, * while here, branch prediction will be good. */ if (refcount_read(&fclones->fclone_ref) == 1) goto fastpath; break; default: /* SKB_FCLONE_CLONE */ fclones = container_of(skb, struct sk_buff_fclones, skb2); break; } if (!refcount_dec_and_test(&fclones->fclone_ref)) return; fastpath: kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones); } void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); if (skb->destructor) { DEBUG_NET_WARN_ON_ONCE(in_hardirq()); skb->destructor(skb); } #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb_nfct(skb)); #endif skb_ext_put(skb); } /* Free everything but the sk_buff shell. */ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_head_state(skb); if (likely(skb->head)) skb_release_data(skb, reason); } /** * __kfree_skb - private function * @skb: buffer * * Free an sk_buff. Release anything attached to the buffer. * Clean the state. This is an internal helper function. Users should * always call kfree_skb */ void __kfree_skb(struct sk_buff *skb) { skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); static __always_inline bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { if (unlikely(!skb_unref(skb))) return false; DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET || u32_get_bits(reason, SKB_DROP_REASON_SUBSYS_MASK) >= SKB_DROP_REASON_SUBSYS_NUM); if (reason == SKB_CONSUMED) trace_consume_skb(skb, __builtin_return_address(0)); else trace_kfree_skb(skb, __builtin_return_address(0), reason, sk); return true; } /** * sk_skb_reason_drop - free an sk_buff with special reason * @sk: the socket to receive @skb, or NULL if not applicable * @skb: buffer to free * @reason: reason why this skb is dropped * * Drop a reference to the buffer and free it if the usage count has hit * zero. Meanwhile, pass the receiving socket and drop reason to * 'kfree_skb' tracepoint. */ void __fix_address sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { if (__sk_skb_reason_drop(sk, skb, reason)) __kfree_skb(skb); } EXPORT_SYMBOL(sk_skb_reason_drop); #define KFREE_SKB_BULK_SIZE 16 struct skb_free_array { unsigned int skb_count; void *skb_array[KFREE_SKB_BULK_SIZE]; }; static void kfree_skb_add_bulk(struct sk_buff *skb, struct skb_free_array *sa, enum skb_drop_reason reason) { /* if SKB is a clone, don't handle this case */ if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { __kfree_skb(skb); return; } skb_release_all(skb, reason); sa->skb_array[sa->skb_count++] = skb; if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE, sa->skb_array); sa->skb_count = 0; } } void __fix_address kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) { struct skb_free_array sa; sa.skb_count = 0; while (segs) { struct sk_buff *next = segs->next; if (__sk_skb_reason_drop(NULL, segs, reason)) { skb_poison_list(segs); kfree_skb_add_bulk(segs, &sa, reason); } segs = next; } if (sa.skb_count) kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array); } EXPORT_SYMBOL(kfree_skb_list_reason); /* Dump skb information and contents. * * Must only be called from net_ratelimit()-ed paths. * * Dumps whole packets if full_pkt, only headers otherwise. */ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) { struct skb_shared_info *sh = skb_shinfo(skb); struct net_device *dev = skb->dev; struct sock *sk = skb->sk; struct sk_buff *list_skb; bool has_mac, has_trans; int headroom, tailroom; int i, len, seg_len; if (full_pkt) len = skb->len; else len = min_t(int, skb->len, MAX_HEADER + 128); headroom = skb_headroom(skb); tailroom = skb_tailroom(skb); has_mac = skb_mac_header_was_set(skb); has_trans = skb_transport_header_was_set(skb); printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n" "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n" "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n" "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n", level, skb->len, headroom, skb_headlen(skb), tailroom, has_mac ? skb->mac_header : -1, has_mac ? skb_mac_header_len(skb) : -1, skb->mac_len, skb->network_header, has_trans ? skb_network_header_len(skb) : -1, has_trans ? skb->transport_header : -1, sh->tx_flags, sh->nr_frags, sh->gso_size, sh->gso_type, sh->gso_segs, skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed, skb->csum_complete_sw, skb->csum_valid, skb->csum_level, skb->hash, skb->sw_hash, skb->l4_hash, ntohs(skb->protocol), skb->pkt_type, skb->skb_iif, skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all, skb->encapsulation, skb->inner_protocol, skb->inner_mac_header, skb->inner_network_header, skb->inner_transport_header); if (dev) printk("%sdev name=%s feat=%pNF\n", level, dev->name, &dev->features); if (sk) printk("%ssk family=%hu type=%u proto=%u\n", level, sk->sk_family, sk->sk_type, sk->sk_protocol); if (full_pkt && headroom) print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, 16, 1, skb->head, headroom, false); seg_len = min_t(int, skb_headlen(skb), len); if (seg_len) print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, 16, 1, skb->data, seg_len, false); len -= seg_len; if (full_pkt && tailroom) print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, 16, 1, skb_tail_pointer(skb), tailroom, false); for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; u32 p_off, p_len, copied; struct page *p; u8 *vaddr; if (skb_frag_is_net_iov(frag)) { printk("%sskb frag %d: not readable\n", level, i); len -= skb_frag_size(frag); if (!len) break; continue; } skb_frag_foreach_page(frag, skb_frag_off(frag), skb_frag_size(frag), p, p_off, p_len, copied) { seg_len = min_t(int, p_len, len); vaddr = kmap_atomic(p); print_hex_dump(level, "skb frag: ", DUMP_PREFIX_OFFSET, 16, 1, vaddr + p_off, seg_len, false); kunmap_atomic(vaddr); len -= seg_len; if (!len) break; } } if (full_pkt && skb_has_frag_list(skb)) { printk("skb fraglist:\n"); skb_walk_frags(skb, list_skb) skb_dump(level, list_skb, true); } } EXPORT_SYMBOL(skb_dump); /** * skb_tx_error - report an sk_buff xmit error * @skb: buffer that triggered an error * * Report xmit error if a device callback is tracking this skb. * skb must be freed afterwards. */ void skb_tx_error(struct sk_buff *skb) { if (skb) { skb_zcopy_downgrade_managed(skb); skb_zcopy_clear(skb, true); } } EXPORT_SYMBOL(skb_tx_error); #ifdef CONFIG_TRACEPOINTS /** * consume_skb - free an skbuff * @skb: buffer to free * * Drop a ref to the buffer and free it if the usage count has hit zero * Functions identically to kfree_skb, but kfree_skb assumes that the frame * is being dropped after a failure and notes that */ void consume_skb(struct sk_buff *skb) { if (!skb_unref(skb)) return; trace_consume_skb(skb, __builtin_return_address(0)); __kfree_skb(skb); } EXPORT_SYMBOL(consume_skb); #endif /** * __consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * * Alike consume_skb(), but this variant assumes that this is the last * skb reference and all the head states have been already dropped */ void __consume_stateless_skb(struct sk_buff *skb) { trace_consume_skb(skb, __builtin_return_address(0)); skb_release_data(skb, SKB_CONSUMED); kfree_skbmem(skb); } static void napi_skb_cache_put(struct sk_buff *skb) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); u32 i; if (!kasan_mempool_poison_object(skb)) return; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) kasan_mempool_unpoison_object(nc->skb_cache[i], kmem_cache_size(net_hotdata.skbuff_cache)); kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF, nc->skb_cache + NAPI_SKB_CACHE_HALF); nc->skb_count = NAPI_SKB_CACHE_HALF; } local_unlock_nested_bh(&napi_alloc_cache.bh_lock); } void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_all(skb, reason); napi_skb_cache_put(skb); } void napi_skb_free_stolen_head(struct sk_buff *skb) { if (unlikely(skb->slow_gro)) { nf_reset_ct(skb); skb_dst_drop(skb); skb_ext_put(skb); skb_orphan(skb); skb->slow_gro = 0; } napi_skb_cache_put(skb); } void napi_consume_skb(struct sk_buff *skb, int budget) { /* Zero budget indicate non-NAPI context called us, like netpoll */ if (unlikely(!budget)) { dev_consume_skb_any(skb); return; } DEBUG_NET_WARN_ON_ONCE(!in_softirq()); if (!skb_unref(skb)) return; /* if reaching here SKB is ready to free */ trace_consume_skb(skb, __builtin_return_address(0)); /* if SKB is a clone, don't handle this case */ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { __kfree_skb(skb); return; } skb_release_all(skb, SKB_CONSUMED); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); /* Make sure a field is contained by headers group */ #define CHECK_SKB_FIELD(field) \ BUILD_BUG_ON(offsetof(struct sk_buff, field) != \ offsetof(struct sk_buff, headers.field)); \ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { new->tstamp = old->tstamp; /* We do not copy old->sk */ new->dev = old->dev; memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); __skb_ext_copy(new, old); __nf_copy(new, old, false); /* Note : this field could be in the headers group. * It is not yet because we do not want to have a 16 bit hole */ new->queue_mapping = old->queue_mapping; memcpy(&new->headers, &old->headers, sizeof(new->headers)); CHECK_SKB_FIELD(protocol); CHECK_SKB_FIELD(csum); CHECK_SKB_FIELD(hash); CHECK_SKB_FIELD(priority); CHECK_SKB_FIELD(skb_iif); CHECK_SKB_FIELD(vlan_proto); CHECK_SKB_FIELD(vlan_tci); CHECK_SKB_FIELD(transport_header); CHECK_SKB_FIELD(network_header); CHECK_SKB_FIELD(mac_header); CHECK_SKB_FIELD(inner_protocol); CHECK_SKB_FIELD(inner_transport_header); CHECK_SKB_FIELD(inner_network_header); CHECK_SKB_FIELD(inner_mac_header); CHECK_SKB_FIELD(mark); #ifdef CONFIG_NETWORK_SECMARK CHECK_SKB_FIELD(secmark); #endif #ifdef CONFIG_NET_RX_BUSY_POLL CHECK_SKB_FIELD(napi_id); #endif CHECK_SKB_FIELD(alloc_cpu); #ifdef CONFIG_XPS CHECK_SKB_FIELD(sender_cpu); #endif #ifdef CONFIG_NET_SCHED CHECK_SKB_FIELD(tc_index); #endif } /* * You should not add any new code to this function. Add it to * __copy_skb_header above instead. */ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) { #define C(x) n->x = skb->x n->next = n->prev = NULL; n->sk = NULL; __copy_skb_header(n, skb); C(len); C(data_len); C(mac_len); n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; n->peeked = 0; C(pfmemalloc); C(pp_recycle); n->destructor = NULL; C(tail); C(end); C(head); C(head_frag); C(data); C(truesize); refcount_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); skb->cloned = 1; return n; #undef C } /** * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg * @first: first sk_buff of the msg */ struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) { struct sk_buff *n; n = alloc_skb(0, GFP_ATOMIC); if (!n) return NULL; n->len = first->len; n->data_len = first->len; n->truesize = first->truesize; skb_shinfo(n)->frag_list = first; __copy_skb_header(n, first); n->destructor = NULL; return n; } EXPORT_SYMBOL_GPL(alloc_skb_for_msg); /** * skb_morph - morph one skb into another * @dst: the skb to receive the contents * @src: the skb to supply the contents * * This is identical to skb_clone except that the target skb is * supplied by the user. * * The target skb is returned upon exit. */ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { skb_release_all(dst, SKB_CONSUMED); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); int mm_account_pinned_pages(struct mmpin *mmp, size_t size) { unsigned long max_pg, num_pg, new_pg, old_pg, rlim; struct user_struct *user; if (capable(CAP_IPC_LOCK) || !size) return 0; rlim = rlimit(RLIMIT_MEMLOCK); if (rlim == RLIM_INFINITY) return 0; num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ max_pg = rlim >> PAGE_SHIFT; user = mmp->user ? : current_user(); old_pg = atomic_long_read(&user->locked_vm); do { new_pg = old_pg + num_pg; if (new_pg > max_pg) return -ENOBUFS; } while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg)); if (!mmp->user) { mmp->user = get_uid(user); mmp->num_pg = num_pg; } else { mmp->num_pg += num_pg; } return 0; } EXPORT_SYMBOL_GPL(mm_account_pinned_pages); void mm_unaccount_pinned_pages(struct mmpin *mmp) { if (mmp->user) { atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); free_uid(mmp->user); } } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) { struct ubuf_info_msgzc *uarg; struct sk_buff *skb; WARN_ON_ONCE(!in_task()); skb = sock_omalloc(sk, 0, GFP_KERNEL); if (!skb) return NULL; BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); uarg = (void *)skb->cb; uarg->mmp.user = NULL; if (mm_account_pinned_pages(&uarg->mmp, size)) { kfree_skb(skb); return NULL; } uarg->ubuf.ops = &msg_zerocopy_ubuf_ops; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; refcount_set(&uarg->ubuf.refcnt, 1); sock_hold(sk); return &uarg->ubuf; } static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) { return container_of((void *)uarg, struct sk_buff, cb); } struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg) { if (uarg) { struct ubuf_info_msgzc *uarg_zc; const u32 byte_limit = 1 << 19; /* limit to a few TSO */ u32 bytelen, next; /* there might be non MSG_ZEROCOPY users */ if (uarg->ops != &msg_zerocopy_ubuf_ops) return NULL; /* realloc only when socket is locked (TCP, UDP cork), * so uarg->len and sk_zckey access is serialized */ if (!sock_owned_by_user(sk)) { WARN_ON_ONCE(1); return NULL; } uarg_zc = uarg_to_msgzc(uarg); bytelen = uarg_zc->bytelen + size; if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) { /* TCP can create new skb to attach new uarg */ if (sk->sk_type == SOCK_STREAM) goto new_alloc; return NULL; } next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { if (mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; uarg_zc->len++; uarg_zc->bytelen = bytelen; atomic_set(&sk->sk_zckey, ++next); /* no extra ref when appending to datagram (MSG_MORE) */ if (sk->sk_type == SOCK_STREAM) net_zcopy_get(uarg); return uarg; } } new_alloc: return msg_zerocopy_alloc(sk, size); } EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) { struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); u32 old_lo, old_hi; u64 sum_len; old_lo = serr->ee.ee_info; old_hi = serr->ee.ee_data; sum_len = old_hi - old_lo + 1ULL + len; if (sum_len >= (1ULL << 32)) return false; if (lo != old_hi + 1) return false; serr->ee.ee_data += len; return true; } static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg) { struct sk_buff *tail, *skb = skb_from_uarg(uarg); struct sock_exterr_skb *serr; struct sock *sk = skb->sk; struct sk_buff_head *q; unsigned long flags; bool is_zerocopy; u32 lo, hi; u16 len; mm_unaccount_pinned_pages(&uarg->mmp); /* if !len, there was only 1 call, and it was aborted * so do not queue a completion notification */ if (!uarg->len || sock_flag(sk, SOCK_DEAD)) goto release; len = uarg->len; lo = uarg->id; hi = uarg->id + len - 1; is_zerocopy = uarg->zerocopy; serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = 0; serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; serr->ee.ee_data = hi; serr->ee.ee_info = lo; if (!is_zerocopy) serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; q = &sk->sk_error_queue; spin_lock_irqsave(&q->lock, flags); tail = skb_peek_tail(q); if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || !skb_zerocopy_notify_extend(tail, lo, len)) { __skb_queue_tail(q, skb); skb = NULL; } spin_unlock_irqrestore(&q->lock, flags); sk_error_report(sk); release: consume_skb(skb); sock_put(sk); } static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, bool success) { struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); uarg_zc->zerocopy = uarg_zc->zerocopy & success; if (refcount_dec_and_test(&uarg->refcnt)) __msg_zerocopy_callback(uarg_zc); } void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk; atomic_dec(&sk->sk_zckey); uarg_to_msgzc(uarg)->len--; if (have_uref) msg_zerocopy_complete(NULL, uarg, true); } EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { .complete = msg_zerocopy_complete, }; EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) { int err, orig_len = skb->len; if (uarg->ops->link_skb) { err = uarg->ops->link_skb(skb, uarg); if (err) return err; } else { struct ubuf_info *orig_uarg = skb_zcopy(skb); /* An skb can only point to one uarg. This edge case happens * when TCP appends to an skb, but zerocopy_realloc triggered * a new alloc. */ if (orig_uarg && uarg != orig_uarg) return -EEXIST; } err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; /* Streams do not free skb on error. Reset to prev state. */ iov_iter_revert(&msg->msg_iter, skb->len - orig_len); skb->sk = sk; ___pskb_trim(skb, orig_len); skb->sk = save_sk; return err; } skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); void __skb_zcopy_downgrade_managed(struct sk_buff *skb) { int i; skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); } EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, gfp_t gfp_mask) { if (skb_zcopy(orig)) { if (skb_zcopy(nskb)) { /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ if (!gfp_mask) { WARN_ON_ONCE(1); return -ENOMEM; } if (skb_uarg(nskb) == skb_uarg(orig)) return 0; if (skb_copy_ubufs(nskb, GFP_ATOMIC)) return -EIO; } skb_zcopy_set(nskb, skb_uarg(orig), NULL); } return 0; } /** * skb_copy_ubufs - copy userspace skb frags buffers to kernel * @skb: the skb to modify * @gfp_mask: allocation priority * * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. * It will copy all frags into kernel and drop the reference * to userspace pages. * * If this function is called from an interrupt gfp_mask() must be * %GFP_ATOMIC. * * Returns 0 on success or a negative error code on failure * to allocate kernel memory to copy to. */ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; int i, order, psize, new_frags; u32 d_off; if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL; if (!skb_frags_readable(skb)) return -EFAULT; if (!num_frags) goto release; /* We might have to allocate high order pages, so compute what minimum * page order is needed. */ order = 0; while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb)) order++; psize = (PAGE_SIZE << order); new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order); for (i = 0; i < new_frags; i++) { page = alloc_pages(gfp_mask | __GFP_COMP, order); if (!page) { while (head) { struct page *next = (struct page *)page_private(head); put_page(head); head = next; } return -ENOMEM; } set_page_private(page, (unsigned long)head); head = page; } page = head; d_off = 0; for (i = 0; i < num_frags; i++) { skb_frag_t *f = &skb_shinfo(skb)->frags[i]; u32 p_off, p_len, copied; struct page *p; u8 *vaddr; skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), p, p_off, p_len, copied) { u32 copy, done = 0; vaddr = kmap_atomic(p); while (done < p_len) { if (d_off == psize) { d_off = 0; page = (struct page *)page_private(page); } copy = min_t(u32, psize - d_off, p_len - done); memcpy(page_address(page) + d_off, vaddr + p_off + done, copy); done += copy; d_off += copy; } kunmap_atomic(vaddr); } } /* skb frags release userspace buffers */ for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); /* skb frags point to kernel buffers */ for (i = 0; i < new_frags - 1; i++) { __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize); head = (struct page *)page_private(head); } __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0, d_off); skb_shinfo(skb)->nr_frags = new_frags; release: skb_zcopy_clear(skb, false); return 0; } EXPORT_SYMBOL_GPL(skb_copy_ubufs); /** * skb_clone - duplicate an sk_buff * @skb: buffer to clone * @gfp_mask: allocation priority * * Duplicate an &sk_buff. The new one is not owned by a socket. Both * copies share the same packet data but not structure. The new * buffer has a reference count of 1. If the allocation fails the * function returns %NULL otherwise the new buffer is returned. * * If this function is called from an interrupt gfp_mask() must be * %GFP_ATOMIC. */ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff_fclones *fclones = container_of(skb, struct sk_buff_fclones, skb1); struct sk_buff *n; if (skb_orphan_frags(skb, gfp_mask)) return NULL; if (skb->fclone == SKB_FCLONE_ORIG && refcount_read(&fclones->fclone_ref) == 1) { n = &fclones->skb2; refcount_set(&fclones->fclone_ref, 2); n->fclone = SKB_FCLONE_CLONE; } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask); if (!n) return NULL; n->fclone = SKB_FCLONE_UNAVAILABLE; } return __skb_clone(n, skb); } EXPORT_SYMBOL(skb_clone); void skb_headers_offset_update(struct sk_buff *skb, int off) { /* Only adjust this if it actually is csum_start rather than csum */ if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start += off; /* {transport,network,mac}_header and tail are relative to skb->head */ skb->transport_header += off; skb->network_header += off; if (skb_mac_header_was_set(skb)) skb->mac_header += off; skb->inner_transport_header += off; skb->inner_network_header += off; skb->inner_mac_header += off; } EXPORT_SYMBOL(skb_headers_offset_update); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) { __copy_skb_header(new, old); skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } EXPORT_SYMBOL(skb_copy_header); static inline int skb_alloc_rx_flag(const struct sk_buff *skb) { if (skb_pfmemalloc(skb)) return SKB_ALLOC_RX; return 0; } /** * skb_copy - create private copy of an sk_buff * @skb: buffer to copy * @gfp_mask: allocation priority * * Make a copy of both an &sk_buff and its data. This is used when the * caller wishes to modify the data and needs a private copy of the * data to alter. Returns %NULL on failure or the pointer to the buffer * on success. The returned buffer has a reference count of 1. * * As by-product this function converts non-linear &sk_buff to linear * one, so that &sk_buff becomes completely private and caller is allowed * to modify all the data of returned buffer. This means that this * function is not recommended for use in circumstances when only * header is going to be modified. Use pskb_copy() instead. */ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *n; unsigned int size; int headerlen; if (!skb_frags_readable(skb)) return NULL; if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) return NULL; headerlen = skb_headroom(skb); size = skb_end_offset(skb) + skb->data_len; n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; /* Set the data pointer */ skb_reserve(n, headerlen); /* Set the tail pointer and length */ skb_put(n, skb->len); BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); skb_copy_header(n, skb); return n; } EXPORT_SYMBOL(skb_copy); /** * __pskb_copy_fclone - create copy of an sk_buff with private head. * @skb: buffer to copy * @headroom: headroom of new skb * @gfp_mask: allocation priority * @fclone: if true allocate the copy of the skb from the fclone * cache instead of the head cache; it is recommended to set this * to true for the cases where the copy will likely be cloned * * Make a copy of both an &sk_buff and part of its data, located * in header. Fragmented data remain shared. This is used when * the caller wishes to modify only header of &sk_buff and needs * private copy of the header to alter. Returns %NULL on failure * or the pointer to the buffer on success. * The returned buffer has a reference count of 1. */ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, gfp_t gfp_mask, bool fclone) { unsigned int size = skb_headlen(skb) + headroom; int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); if (!n) goto out; /* Set the data pointer */ skb_reserve(n, headroom); /* Set the tail pointer and length */ skb_put(n, skb_headlen(skb)); /* Copy the bytes */ skb_copy_from_linear_data(skb, n->data, n->len); n->truesize += skb->data_len; n->data_len = skb->data_len; n->len = skb->len; if (skb_shinfo(skb)->nr_frags) { int i; if (skb_orphan_frags(skb, gfp_mask) || skb_zerocopy_clone(n, skb, gfp_mask)) { kfree_skb(n); n = NULL; goto out; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; skb_frag_ref(skb, i); } skb_shinfo(n)->nr_frags = i; } if (skb_has_frag_list(skb)) { skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; skb_clone_fraglist(n); } skb_copy_header(n, skb); out: return n; } EXPORT_SYMBOL(__pskb_copy_fclone); /** * pskb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate * @nhead: room to add at head * @ntail: room to add at tail * @gfp_mask: allocation priority * * Expands (or creates identical copy, if @nhead and @ntail are zero) * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have * reference count of 1. Returns zero in the case of success or error, * if expansion failed. In the last case, &sk_buff is not changed. * * All the pointers pointing into skb header may change and must be * reloaded after call to this function. */ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask) { unsigned int osize = skb_end_offset(skb); unsigned int size = osize + nhead + ntail; long off; u8 *data; int i; BUG_ON(nhead < 0); BUG_ON(skb_shared(skb)); skb_zcopy_downgrade_managed(skb); if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) goto nodata; size = SKB_WITH_OVERHEAD(size); /* Copy only real data... and, alas, header. This should be * optimized for the cases when header is void. */ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); /* * if shinfo is shared we must drop the old head gracefully, but if it * is not we can just drop the old head and let the existing refcount * be since all we did is relocate the values */ if (skb_cloned(skb)) { if (skb_orphan_frags(skb, gfp_mask)) goto nofrags; if (skb_zcopy(skb)) refcount_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); skb_release_data(skb, SKB_CONSUMED); } else { skb_free_head(skb); } off = (data + nhead) - skb->head; skb->head = data; skb->head_frag = 0; skb->data += off; skb_set_end_offset(skb, size); #ifdef NET_SKBUFF_DATA_USES_OFFSET off = nhead; #endif skb->tail += off; skb_headers_offset_update(skb, nhead); skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); skb_metadata_clear(skb); /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). */ if (!skb->sk || skb->destructor == sock_edemux) skb->truesize += size - osize; return 0; nofrags: skb_kfree_head(data, size); nodata: return -ENOMEM; } EXPORT_SYMBOL(pskb_expand_head); /* Make private copy of skb with writable head and some headroom */ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) { struct sk_buff *skb2; int delta = headroom - skb_headroom(skb); if (delta <= 0) skb2 = pskb_copy(skb, GFP_ATOMIC); else { skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { kfree_skb(skb2); skb2 = NULL; } } return skb2; } EXPORT_SYMBOL(skb_realloc_headroom); /* Note: We plan to rework this in linux-6.4 */ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { unsigned int saved_end_offset, saved_truesize; struct skb_shared_info *shinfo; int res; saved_end_offset = skb_end_offset(skb); saved_truesize = skb->truesize; res = pskb_expand_head(skb, 0, 0, pri); if (res) return res; skb->truesize = saved_truesize; if (likely(skb_end_offset(skb) == saved_end_offset)) return 0; /* We can not change skb->end if the original or new value * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head(). */ if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM || skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) { /* We think this path should not be taken. * Add a temporary trace to warn us just in case. */ pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n", saved_end_offset, skb_end_offset(skb)); WARN_ON_ONCE(1); return 0; } shinfo = skb_shinfo(skb); /* We are about to change back skb->end, * we need to move skb_shinfo() to its new location. */ memmove(skb->head + saved_end_offset, shinfo, offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); skb_set_end_offset(skb, saved_end_offset); return 0; } /** * skb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate * @headroom: needed headroom * * Unlike skb_realloc_headroom, this one does not allocate a new skb * if possible; copies skb->sk to new skb as needed * and frees original skb in case of failures. * * It expect increased headroom and generates warning otherwise. */ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) { int delta = headroom - skb_headroom(skb); int osize = skb_end_offset(skb); struct sock *sk = skb->sk; if (WARN_ONCE(delta <= 0, "%s is expecting an increase in the headroom", __func__)) return skb; delta = SKB_DATA_ALIGN(delta); /* pskb_expand_head() might crash, if skb is shared. */ if (skb_shared(skb) || !is_skb_wmem(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) goto fail; if (sk) skb_set_owner_w(nskb, sk); consume_skb(skb); skb = nskb; } if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) goto fail; if (sk && is_skb_wmem(skb)) { delta = skb_end_offset(skb) - osize; refcount_add(delta, &sk->sk_wmem_alloc); skb->truesize += delta; } return skb; fail: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(skb_expand_head); /** * skb_copy_expand - copy and expand sk_buff * @skb: buffer to copy * @newheadroom: new free bytes at head * @newtailroom: new free bytes at tail * @gfp_mask: allocation priority * * Make a copy of both an &sk_buff and its data and while doing so * allocate additional space. * * This is used when the caller wishes to modify the data and needs a * private copy of the data to alter as well as more space for new fields. * Returns %NULL on failure or the pointer to the buffer * on success. The returned buffer has a reference count of 1. * * You must pass %GFP_ATOMIC as the allocation priority if this function * is called from an interrupt. */ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t gfp_mask) { /* * Allocate the copy buffer */ int head_copy_len, head_copy_off; struct sk_buff *n; int oldheadroom; if (!skb_frags_readable(skb)) return NULL; if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) return NULL; oldheadroom = skb_headroom(skb); n = __alloc_skb(newheadroom + skb->len + newtailroom, gfp_mask, skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; skb_reserve(n, newheadroom); /* Set the tail pointer and length */ skb_put(n, skb->len); head_copy_len = oldheadroom; head_copy_off = 0; if (newheadroom <= head_copy_len) head_copy_len = newheadroom; else head_copy_off = newheadroom - head_copy_len; /* Copy the linear header and data. */ BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, skb->len + head_copy_len)); skb_copy_header(n, skb); skb_headers_offset_update(n, newheadroom - oldheadroom); return n; } EXPORT_SYMBOL(skb_copy_expand); /** * __skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad * @free_on_error: free buffer on error * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * * May return error in out of memory cases. The skb is freed on error * if @free_on_error is true. */ int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) { int err; int ntail; /* If the skbuff is non linear tailroom is always zero.. */ if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { memset(skb->data+skb->len, 0, pad); return 0; } ntail = skb->data_len + pad - (skb->end - skb->tail); if (likely(skb_cloned(skb) || ntail > 0)) { err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); if (unlikely(err)) goto free_skb; } /* FIXME: The use of this function with non-linear skb's really needs * to be audited. */ err = skb_linearize(skb); if (unlikely(err)) goto free_skb; memset(skb->data + skb->len, 0, pad); return 0; free_skb: if (free_on_error) kfree_skb(skb); return err; } EXPORT_SYMBOL(__skb_pad); /** * pskb_put - add data to the tail of a potentially fragmented buffer * @skb: start of the buffer to use * @tail: tail fragment of the buffer to use * @len: amount of data to add * * This function extends the used data area of the potentially * fragmented buffer. @tail must be the last fragment of @skb -- or * @skb itself. If this would exceed the total buffer size the kernel * will panic. A pointer to the first byte of the extra data is * returned. */ void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) { if (tail != skb) { skb->data_len += len; skb->len += len; } return skb_put(tail, len); } EXPORT_SYMBOL_GPL(pskb_put); /** * skb_put - add data to a buffer * @skb: buffer to use * @len: amount of data to add * * This function extends the used data area of the buffer. If this would * exceed the total buffer size the kernel will panic. A pointer to the * first byte of the extra data is returned. */ void *skb_put(struct sk_buff *skb, unsigned int len) { void *tmp = skb_tail_pointer(skb); SKB_LINEAR_ASSERT(skb); skb->tail += len; skb->len += len; if (unlikely(skb->tail > skb->end)) skb_over_panic(skb, len, __builtin_return_address(0)); return tmp; } EXPORT_SYMBOL(skb_put); /** * skb_push - add data to the start of a buffer * @skb: buffer to use * @len: amount of data to add * * This function extends the used data area of the buffer at the buffer * start. If this would exceed the total buffer headroom the kernel will * panic. A pointer to the first byte of the extra data is returned. */ void *skb_push(struct sk_buff *skb, unsigned int len) { skb->data -= len; skb->len += len; if (unlikely(skb->data < skb->head)) skb_under_panic(skb, len, __builtin_return_address(0)); return skb->data; } EXPORT_SYMBOL(skb_push); /** * skb_pull - remove data from the start of a buffer * @skb: buffer to use * @len: amount of data to remove * * This function removes data from the start of a buffer, returning * the memory to the headroom. A pointer to the next data in the buffer * is returned. Once the data has been pulled future pushes will overwrite * the old data. */ void *skb_pull(struct sk_buff *skb, unsigned int len) { return skb_pull_inline(skb, len); } EXPORT_SYMBOL(skb_pull); /** * skb_pull_data - remove data from the start of a buffer returning its * original position. * @skb: buffer to use * @len: amount of data to remove * * This function removes data from the start of a buffer, returning * the memory to the headroom. A pointer to the original data in the buffer * is returned after checking if there is enough data to pull. Once the * data has been pulled future pushes will overwrite the old data. */ void *skb_pull_data(struct sk_buff *skb, size_t len) { void *data = skb->data; if (skb->len < len) return NULL; skb_pull(skb, len); return data; } EXPORT_SYMBOL(skb_pull_data); /** * skb_trim - remove end from a buffer * @skb: buffer to alter * @len: new length * * Cut the length of a buffer down by removing data from the tail. If * the buffer is already under the length specified it is not modified. * The skb must be linear. */ void skb_trim(struct sk_buff *skb, unsigned int len) { if (skb->len > len) __skb_trim(skb, len); } EXPORT_SYMBOL(skb_trim); /* Trims skb to length len. It can change skb pointers. */ int ___pskb_trim(struct sk_buff *skb, unsigned int len) { struct sk_buff **fragp; struct sk_buff *frag; int offset = skb_headlen(skb); int nfrags = skb_shinfo(skb)->nr_frags; int i; int err; if (skb_cloned(skb) && unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) return err; i = 0; if (offset >= len) goto drop_pages; for (; i < nfrags; i++) { int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); if (end < len) { offset = end; continue; } skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); drop_pages: skb_shinfo(skb)->nr_frags = i; for (; i < nfrags; i++) skb_frag_unref(skb, i); if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); goto done; } for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); fragp = &frag->next) { int end = offset + frag->len; if (skb_shared(frag)) { struct sk_buff *nfrag; nfrag = skb_clone(frag, GFP_ATOMIC); if (unlikely(!nfrag)) return -ENOMEM; nfrag->next = frag->next; consume_skb(frag); frag = nfrag; *fragp = frag; } if (end < len) { offset = end; continue; } if (end > len && unlikely((err = pskb_trim(frag, len - offset)))) return err; if (frag->next) skb_drop_list(&frag->next); break; } done: if (len > skb_headlen(skb)) { skb->data_len -= skb->len - len; skb->len = len; } else { skb->len = len; skb->data_len = 0; skb_set_tail_pointer(skb, len); } if (!skb->sk || skb->destructor == sock_edemux) skb_condense(skb); return 0; } EXPORT_SYMBOL(___pskb_trim); /* Note : use pskb_trim_rcsum() instead of calling this directly */ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) { int delta = skb->len - len; skb->csum = csum_block_sub(skb->csum, skb_checksum(skb, len, delta, 0), len); } else if (skb->ip_summed == CHECKSUM_PARTIAL) { int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; int offset = skb_checksum_start_offset(skb) + skb->csum_offset; if (offset + sizeof(__sum16) > hdlen) return -EINVAL; } return __pskb_trim(skb, len); } EXPORT_SYMBOL(pskb_trim_rcsum_slow); /** * __pskb_pull_tail - advance tail of skb header * @skb: buffer to reallocate * @delta: number of bytes to advance tail * * The function makes a sense only on a fragmented &sk_buff, * it expands header moving its tail forward and copying necessary * data from fragmented part. * * &sk_buff MUST have reference count of 1. * * Returns %NULL (and &sk_buff does not change) if pull failed * or value of new tail of skb in the case of success. * * All the pointers pointing into skb header may change and must be * reloaded after call to this function. */ /* Moves tail of skb head forward, copying data from fragmented part, * when it is necessary. * 1. It may fail due to malloc failure. * 2. It may change skb pointers. * * It is pretty complicated. Luckily, it is called only in exceptional cases. */ void *__pskb_pull_tail(struct sk_buff *skb, int delta) { /* If skb has not enough free space at tail, get new one * plus 128 bytes for future expansions. If we have enough * room at tail, reallocate without expansion only if skb is cloned. */ int i, k, eat = (skb->tail + delta) - skb->end; if (!skb_frags_readable(skb)) return NULL; if (eat > 0 || skb_cloned(skb)) { if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, GFP_ATOMIC)) return NULL; } BUG_ON(skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)); /* Optimization: no fragments, no reasons to preestimate * size of pulled pages. Superb. */ if (!skb_has_frag_list(skb)) goto pull_pages; /* Estimate size of pulled pages. */ eat = delta; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (size >= eat) goto pull_pages; eat -= size; } /* If we need update frag list, we are in troubles. * Certainly, it is possible to add an offset to skb data, * but taking into account that pulling is expected to * be very rare operation, it is worth to fight against * further bloating skb head and crucify ourselves here instead. * Pure masohism, indeed. 8)8) */ if (eat) { struct sk_buff *list = skb_shinfo(skb)->frag_list; struct sk_buff *clone = NULL; struct sk_buff *insp = NULL; do { if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; list = list->next; insp = list; } else { /* Eaten partially. */ if (skb_is_gso(skb) && !list->head_frag && skb_headlen(list)) skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; if (skb_shared(list)) { /* Sucks! We need to fork list. :-( */ clone = skb_clone(list, GFP_ATOMIC); if (!clone) return NULL; insp = list->next; list = clone; } else { /* This may be pulled without * problems. */ insp = list; } if (!pskb_pull(list, eat)) { kfree_skb(clone); return NULL; } break; } } while (eat); /* Free pulled out fragments. */ while ((list = skb_shinfo(skb)->frag_list) != insp) { skb_shinfo(skb)->frag_list = list->next; consume_skb(list); } /* And insert new clone at head. */ if (clone) { clone->next = list; skb_shinfo(skb)->frag_list = clone; } } /* Success! Now we may commit changes to skb data. */ pull_pages: eat = delta; k = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (size <= eat) { skb_frag_unref(skb, i); eat -= size; } else { skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; *frag = skb_shinfo(skb)->frags[i]; if (eat) { skb_frag_off_add(frag, eat); skb_frag_size_sub(frag, eat); if (!i) goto end; eat = 0; } k++; } } skb_shinfo(skb)->nr_frags = k; end: skb->tail += delta; skb->data_len -= delta; if (!skb->data_len) skb_zcopy_clear(skb, false); return skb_tail_pointer(skb); } EXPORT_SYMBOL(__pskb_pull_tail); /** * skb_copy_bits - copy bits from skb to kernel buffer * @skb: source skb * @offset: offset in source * @to: destination buffer * @len: number of bytes to copy * * Copy the specified number of bytes from the source skb to the * destination buffer. * * CAUTION ! : * If its prototype is ever changed, * check arch/{*}/net/{*}.S files, * since it is called from BPF assembly code. */ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) { int start = skb_headlen(skb); struct sk_buff *frag_iter; int i, copy; if (offset > (int)skb->len - len) goto fault; /* Copy header. */ if ((copy = start - offset) > 0) { if (copy > len) copy = len; skb_copy_from_linear_data_offset(skb, offset, to, copy); if ((len -= copy) == 0) return 0; offset += copy; to += copy; } if (!skb_frags_readable(skb)) goto fault; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *f = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(f); if ((copy = end - offset) > 0) { u32 p_off, p_len, copied; struct page *p; u8 *vaddr; if (copy > len) copy = len; skb_frag_foreach_page(f, skb_frag_off(f) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); memcpy(to + copied, vaddr + p_off, p_len); kunmap_atomic(vaddr); } if ((len -= copy) == 0) return 0; offset += copy; to += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (skb_copy_bits(frag_iter, offset - start, to, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; to += copy; } start = end; } if (!len) return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_copy_bits); /* * Callback from splice_to_pipe(), if we need to release some pages * at the end of the spd in case we error'ed out in filling the pipe. */ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) { put_page(spd->pages[i]); } static struct page *linear_to_page(struct page *page, unsigned int *len, unsigned int *offset, struct sock *sk) { struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) return NULL; *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); memcpy(page_address(pfrag->page) + pfrag->offset, page_address(page) + *offset, *len); *offset = pfrag->offset; pfrag->offset += *len; return pfrag->page; } static bool spd_can_coalesce(const struct splice_pipe_desc *spd, struct page *page, unsigned int offset) { return spd->nr_pages && spd->pages[spd->nr_pages - 1] == page && (spd->partial[spd->nr_pages - 1].offset + spd->partial[spd->nr_pages - 1].len == offset); } /* * Fill page/offset/length into spd, if it can hold more pages. */ static bool spd_fill_page(struct splice_pipe_desc *spd, struct pipe_inode_info *pipe, struct page *page, unsigned int *len, unsigned int offset, bool linear, struct sock *sk) { if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) return true; if (linear) { page = linear_to_page(page, len, &offset, sk); if (!page) return true; } if (spd_can_coalesce(spd, page, offset)) { spd->partial[spd->nr_pages - 1].len += *len; return false; } get_page(page); spd->pages[spd->nr_pages] = page; spd->partial[spd->nr_pages].len = *len; spd->partial[spd->nr_pages].offset = offset; spd->nr_pages++; return false; } static bool __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct splice_pipe_desc *spd, bool linear, struct sock *sk, struct pipe_inode_info *pipe) { if (!*len) return true; /* skip this segment if already processed */ if (*off >= plen) { *off -= plen; return false; } /* ignore any bits we already processed */ poff += *off; plen -= *off; *off = 0; do { unsigned int flen = min(*len, plen); if (spd_fill_page(spd, pipe, page, &flen, poff, linear, sk)) return true; poff += flen; plen -= flen; *len -= flen; } while (*len && plen); return false; } /* * Map linear and fragment data from the skb to spd. It reports true if the * pipe is full or if we already spliced the requested length. */ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, unsigned int *offset, unsigned int *len, struct splice_pipe_desc *spd, struct sock *sk) { int seg; struct sk_buff *iter; /* map the linear part : * If skb->head_frag is set, this 'linear' part is backed by a * fragment, and if the head is not shared with any clones then * we can avoid a copy since we own the head portion of this page. */ if (__splice_segment(virt_to_page(skb->data), (unsigned long) skb->data & (PAGE_SIZE - 1), skb_headlen(skb), offset, len, spd, skb_head_is_locked(skb), sk, pipe)) return true; /* * then map the fragments */ if (!skb_frags_readable(skb)) return false; for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (WARN_ON_ONCE(!skb_frag_page(f))) return false; if (__splice_segment(skb_frag_page(f), skb_frag_off(f), skb_frag_size(f), offset, len, spd, false, sk, pipe)) return true; } skb_walk_frags(skb, iter) { if (*offset >= iter->len) { *offset -= iter->len; continue; } /* __skb_splice_bits() only fails if the output has no room * left, so no point in going over the frag_list for the error * case. */ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) return true; } return false; } /* * Map data from the skb to a pipe. Should handle both the linear part, * the fragments, and the frag list. */ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, unsigned int flags) { struct partial_page partial[MAX_SKB_FRAGS]; struct page *pages[MAX_SKB_FRAGS]; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, .nr_pages_max = MAX_SKB_FRAGS, .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; int ret = 0; __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); if (spd.nr_pages) ret = splice_to_pipe(pipe, &spd); return ret; } EXPORT_SYMBOL_GPL(skb_splice_bits); static int sendmsg_locked(struct sock *sk, struct msghdr *msg) { struct socket *sock = sk->sk_socket; size_t size = msg_data_left(msg); if (!sock) return -EINVAL; if (!sock->ops->sendmsg_locked) return sock_no_sendmsg_locked(sk, msg, size); return sock->ops->sendmsg_locked(sk, msg, size); } static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) { struct socket *sock = sk->sk_socket; if (!sock) return -EINVAL; return sock_sendmsg(sock, msg); } typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len, sendmsg_func sendmsg) { unsigned int orig_len = len; struct sk_buff *head = skb; unsigned short fragidx; int slen, ret; do_frag_list: /* Deal with head data */ while (offset < skb_headlen(skb) && len) { struct kvec kv; struct msghdr msg; slen = min_t(int, len, skb_headlen(skb) - offset); kv.iov_base = skb->data + offset; kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, sendmsg_unlocked, sk, &msg); if (ret <= 0) goto error; offset += ret; len -= ret; } /* All the data was skb head? */ if (!len) goto out; /* Make offset relative to start of frags */ offset -= skb_headlen(skb); /* Find where we are in frag list */ for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; if (offset < skb_frag_size(frag)) break; offset -= skb_frag_size(frag); } for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; slen = min_t(size_t, len, skb_frag_size(frag) - offset); while (slen) { struct bio_vec bvec; struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT, }; bvec_set_page(&bvec, skb_frag_page(frag), slen, skb_frag_off(frag) + offset); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, sendmsg_unlocked, sk, &msg); if (ret <= 0) goto error; len -= ret; offset += ret; slen -= ret; } offset = 0; } if (len) { /* Process any frag lists */ if (skb == head) { if (skb_has_frag_list(skb)) { skb = skb_shinfo(skb)->frag_list; goto do_frag_list; } } else if (skb->next) { skb = skb->next; goto do_frag_list; } } out: return orig_len - len; error: return orig_len == len ? ret : orig_len - len; } /* Send skb data on a socket. Socket must be locked. */ int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len) { return __skb_send_sock(sk, skb, offset, len, sendmsg_locked); } EXPORT_SYMBOL_GPL(skb_send_sock_locked); /* Send skb data on a socket. Socket must be unlocked. */ int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) { return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked); } /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer * @offset: offset in destination * @from: source buffer * @len: number of bytes to copy * * Copy the specified number of bytes from the source buffer to the * destination skb. This function handles all the messy bits of * traversing fragment lists and such. */ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) { int start = skb_headlen(skb); struct sk_buff *frag_iter; int i, copy; if (offset > (int)skb->len - len) goto fault; if ((copy = start - offset) > 0) { if (copy > len) copy = len; skb_copy_to_linear_data_offset(skb, offset, from, copy); if ((len -= copy) == 0) return 0; offset += copy; from += copy; } if (!skb_frags_readable(skb)) goto fault; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; int end; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { u32 p_off, p_len, copied; struct page *p; u8 *vaddr; if (copy > len) copy = len; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); memcpy(vaddr + p_off, from + copied, p_len); kunmap_atomic(vaddr); } if ((len -= copy) == 0) return 0; offset += copy; from += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (skb_store_bits(frag_iter, offset - start, from, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; from += copy; } start = end; } if (!len) return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. */ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum, const struct skb_checksum_ops *ops) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int pos = 0; /* Checksum header. */ if (copy > 0) { if (copy > len) copy = len; csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; pos = copy; } if (WARN_ON_ONCE(!skb_frags_readable(skb))) return 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { u32 p_off, p_len, copied; struct page *p; __wsum csum2; u8 *vaddr; if (copy > len) copy = len; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); csum2 = INDIRECT_CALL_1(ops->update, csum_partial_ext, vaddr + p_off, p_len, 0); kunmap_atomic(vaddr); csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, csum, csum2, pos, p_len); pos += p_len; } if (!(len -= copy)) return csum; offset += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { __wsum csum2; if (copy > len) copy = len; csum2 = __skb_checksum(frag_iter, offset - start, copy, 0, ops); csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, csum, csum2, pos, copy); if ((len -= copy) == 0) return csum; offset += copy; pos += copy; } start = end; } BUG_ON(len); return csum; } EXPORT_SYMBOL(__skb_checksum); __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) { const struct skb_checksum_ops ops = { .update = csum_partial_ext, .combine = csum_block_add_ext, }; return __skb_checksum(skb, offset, len, csum, &ops); } EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. */ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int pos = 0; __wsum csum = 0; /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; csum = csum_partial_copy_nocheck(skb->data + offset, to, copy); if ((len -= copy) == 0) return csum; offset += copy; to += copy; pos = copy; } if (!skb_frags_readable(skb)) return 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; WARN_ON(start > offset + len); end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; u32 p_off, p_len, copied; struct page *p; __wsum csum2; u8 *vaddr; if (copy > len) copy = len; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); csum2 = csum_partial_copy_nocheck(vaddr + p_off, to + copied, p_len); kunmap_atomic(vaddr); csum = csum_block_add(csum, csum2, pos); pos += p_len; } if (!(len -= copy)) return csum; offset += copy; to += copy; } start = end; } skb_walk_frags(skb, frag_iter) { __wsum csum2; int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; csum2 = skb_copy_and_csum_bits(frag_iter, offset - start, to, copy); csum = csum_block_add(csum, csum2, pos); if ((len -= copy) == 0) return csum; offset += copy; to += copy; pos += copy; } start = end; } BUG_ON(len); return csum; } EXPORT_SYMBOL(skb_copy_and_csum_bits); __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) { __sum16 sum; sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev, skb); } if (!skb_shared(skb)) skb->csum_valid = !sum; return sum; } EXPORT_SYMBOL(__skb_checksum_complete_head); /* This function assumes skb->csum already holds pseudo header's checksum, * which has been changed from the hardware checksum, for example, by * __skb_checksum_validate_complete(). And, the original skb->csum must * have been validated unsuccessfully for CHECKSUM_COMPLETE case. * * It returns non-zero if the recomputed checksum is still invalid, otherwise * zero. The new checksum is stored back into skb->csum unless the skb is * shared. */ __sum16 __skb_checksum_complete(struct sk_buff *skb) { __wsum csum; __sum16 sum; csum = skb_checksum(skb, 0, skb->len, 0); sum = csum_fold(csum_add(skb->csum, csum)); /* This check is inverted, because we already knew the hardware * checksum is invalid before calling this function. So, if the * re-computed checksum is valid instead, then we have a mismatch * between the original skb->csum and skb_checksum(). This means either * the original hardware checksum is incorrect or we screw up skb->csum * when moving skb->data around. */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev, skb); } if (!skb_shared(skb)) { /* Save full packet checksum */ skb->csum = csum; skb->ip_summed = CHECKSUM_COMPLETE; skb->csum_complete_sw = 1; skb->csum_valid = !sum; } return sum; } EXPORT_SYMBOL(__skb_checksum_complete); static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) { net_warn_ratelimited( "%s: attempt to compute crc32c without libcrc32c.ko\n", __func__); return 0; } static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, int offset, int len) { net_warn_ratelimited( "%s: attempt to compute crc32c without libcrc32c.ko\n", __func__); return 0; } static const struct skb_checksum_ops default_crc32c_ops = { .update = warn_crc32c_csum_update, .combine = warn_crc32c_csum_combine, }; const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = &default_crc32c_ops; EXPORT_SYMBOL(crc32c_csum_stub); /** * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() * @from: source buffer * * Calculates the amount of linear headroom needed in the 'to' skb passed * into skb_zerocopy(). */ unsigned int skb_zerocopy_headlen(const struct sk_buff *from) { unsigned int hlen = 0; if (!from->head_frag || skb_headlen(from) < L1_CACHE_BYTES || skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { hlen = skb_headlen(from); if (!hlen) hlen = from->len; } if (skb_has_frag_list(from)) hlen = from->len; return hlen; } EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); /** * skb_zerocopy - Zero copy skb to skb * @to: destination buffer * @from: source buffer * @len: number of bytes to copy from source buffer * @hlen: size of linear headroom in destination buffer * * Copies up to `len` bytes from `from` to `to` by creating references * to the frags in the source buffer. * * The `hlen` as calculated by skb_zerocopy_headlen() specifies the * headroom in the `to` buffer. * * Return value: * 0: everything is OK * -ENOMEM: couldn't orphan frags of @from due to lack of memory * -EFAULT: skb_copy_bits() found some problem with skb geometry */ int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) { int i, j = 0; int plen = 0; /* length of skb->head fragment */ int ret; struct page *page; unsigned int offset; BUG_ON(!from->head_frag && !hlen); /* dont bother with small payloads */ if (len <= skb_tailroom(to)) return skb_copy_bits(from, 0, skb_put(to, len), len); if (hlen) { ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); if (unlikely(ret)) return ret; len -= hlen; } else { plen = min_t(int, skb_headlen(from), len); if (plen) { page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); __skb_fill_netmem_desc(to, 0, page_to_netmem(page), offset, plen); get_page(page); j = 1; len -= plen; } } skb_len_add(to, len + plen); if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { skb_tx_error(from); return -ENOMEM; } skb_zerocopy_clone(to, from, GFP_ATOMIC); for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { int size; if (!len) break; skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), len); skb_frag_size_set(&skb_shinfo(to)->frags[j], size); len -= size; skb_frag_ref(to, j); j++; } skb_shinfo(to)->nr_frags = j; return 0; } EXPORT_SYMBOL_GPL(skb_zerocopy); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) { __wsum csum; long csstart; if (skb->ip_summed == CHECKSUM_PARTIAL) csstart = skb_checksum_start_offset(skb); else csstart = skb_headlen(skb); BUG_ON(csstart > skb_headlen(skb)); skb_copy_from_linear_data(skb, to, csstart); csum = 0; if (csstart != skb->len) csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, skb->len - csstart); if (skb->ip_summed == CHECKSUM_PARTIAL) { long csstuff = csstart + skb->csum_offset; *((__sum16 *)(to + csstuff)) = csum_fold(csum); } } EXPORT_SYMBOL(skb_copy_and_csum_dev); /** * skb_dequeue - remove from the head of the queue * @list: list to dequeue from * * Remove the head of the list. The list lock is taken so the function * may be used safely with other locking list functions. The head item is * returned or %NULL if the list is empty. */ struct sk_buff *skb_dequeue(struct sk_buff_head *list) { unsigned long flags; struct sk_buff *result; spin_lock_irqsave(&list->lock, flags); result = __skb_dequeue(list); spin_unlock_irqrestore(&list->lock, flags); return result; } EXPORT_SYMBOL(skb_dequeue); /** * skb_dequeue_tail - remove from the tail of the queue * @list: list to dequeue from * * Remove the tail of the list. The list lock is taken so the function * may be used safely with other locking list functions. The tail item is * returned or %NULL if the list is empty. */ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) { unsigned long flags; struct sk_buff *result; spin_lock_irqsave(&list->lock, flags); result = __skb_dequeue_tail(list); spin_unlock_irqrestore(&list->lock, flags); return result; } EXPORT_SYMBOL(skb_dequeue_tail); /** * skb_queue_purge_reason - empty a list * @list: list to empty * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function takes the list * lock and is atomic with respect to other list locking functions. */ void skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason) { struct sk_buff_head tmp; unsigned long flags; if (skb_queue_empty_lockless(list)) return; __skb_queue_head_init(&tmp); spin_lock_irqsave(&list->lock, flags); skb_queue_splice_init(list, &tmp); spin_unlock_irqrestore(&list->lock, flags); __skb_queue_purge_reason(&tmp, reason); } EXPORT_SYMBOL(skb_queue_purge_reason); /** * skb_rbtree_purge - empty a skb rbtree * @root: root of the rbtree to empty * Return value: the sum of truesizes of all purged skbs. * * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from * the list and one reference dropped. This function does not take * any lock. Synchronization should be handled by the caller (e.g., TCP * out-of-order queue is protected by the socket lock). */ unsigned int skb_rbtree_purge(struct rb_root *root) { struct rb_node *p = rb_first(root); unsigned int sum = 0; while (p) { struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); p = rb_next(p); rb_erase(&skb->rbnode, root); sum += skb->truesize; kfree_skb(skb); } return sum; } void skb_errqueue_purge(struct sk_buff_head *list) { struct sk_buff *skb, *next; struct sk_buff_head kill; unsigned long flags; __skb_queue_head_init(&kill); spin_lock_irqsave(&list->lock, flags); skb_queue_walk_safe(list, skb, next) { if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) continue; __skb_unlink(skb, list); __skb_queue_tail(&kill, skb); } spin_unlock_irqrestore(&list->lock, flags); __skb_queue_purge(&kill); } EXPORT_SYMBOL(skb_errqueue_purge); /** * skb_queue_head - queue a buffer at the list head * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the start of the list. This function takes the * list lock and can be used safely with other locking &sk_buff functions * safely. * * A buffer cannot be placed on two lists at the same time. */ void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) { unsigned long flags; spin_lock_irqsave(&list->lock, flags); __skb_queue_head(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } EXPORT_SYMBOL(skb_queue_head); /** * skb_queue_tail - queue a buffer at the list tail * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the tail of the list. This function takes the * list lock and can be used safely with other locking &sk_buff functions * safely. * * A buffer cannot be placed on two lists at the same time. */ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) { unsigned long flags; spin_lock_irqsave(&list->lock, flags); __skb_queue_tail(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } EXPORT_SYMBOL(skb_queue_tail); /** * skb_unlink - remove a buffer from a list * @skb: buffer to remove * @list: list to use * * Remove a packet from a list. The list locks are taken and this * function is atomic with respect to other list locked calls * * You must know what list the SKB is on. */ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { unsigned long flags; spin_lock_irqsave(&list->lock, flags); __skb_unlink(skb, list); spin_unlock_irqrestore(&list->lock, flags); } EXPORT_SYMBOL(skb_unlink); /** * skb_append - append a buffer * @old: buffer to insert after * @newsk: buffer to insert * @list: list to use * * Place a packet after a given packet in a list. The list locks are taken * and this function is atomic with respect to other list locked calls. * A buffer cannot be placed on two lists at the same time. */ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) { unsigned long flags; spin_lock_irqsave(&list->lock, flags); __skb_queue_after(list, old, newsk); spin_unlock_irqrestore(&list->lock, flags); } EXPORT_SYMBOL(skb_append); static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, const int pos) { int i; skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), pos - len); /* And move data appendix as is. */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; skb1->unreadable = skb->unreadable; skb_shinfo(skb)->nr_frags = 0; skb1->data_len = skb->data_len; skb1->len += skb1->data_len; skb->data_len = 0; skb->len = len; skb_set_tail_pointer(skb, len); } static inline void skb_split_no_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, int pos) { int i, k = 0; const int nfrags = skb_shinfo(skb)->nr_frags; skb_shinfo(skb)->nr_frags = 0; skb1->len = skb1->data_len = skb->len - len; skb->len = len; skb->data_len = len - pos; for (i = 0; i < nfrags; i++) { int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (pos + size > len) { skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; if (pos < len) { /* Split frag. * We have two variants in this case: * 1. Move all the frag to the second * part, if it is possible. F.e. * this approach is mandatory for TUX, * where splitting is expensive. * 2. Split is accurately. We make this. */ skb_frag_ref(skb, i); skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); skb_shinfo(skb)->nr_frags++; } k++; } else skb_shinfo(skb)->nr_frags++; pos += size; } skb_shinfo(skb1)->nr_frags = k; skb1->unreadable = skb->unreadable; } /** * skb_split - Split fragmented skb to two parts at length len. * @skb: the buffer to split * @skb1: the buffer to receive the second part * @len: new length for skb */ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; skb_zcopy_downgrade_managed(skb); skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ skb_split_no_header(skb, skb1, len, pos); } EXPORT_SYMBOL(skb_split); /* Shifting from/to a cloned skb is a no-go. * * Caller cannot keep skb_shinfo related pointers past calling here! */ static int skb_prepare_for_shift(struct sk_buff *skb) { return skb_unclone_keeptruesize(skb, GFP_ATOMIC); } /** * skb_shift - Shifts paged data partially from skb to another * @tgt: buffer into which tail data gets added * @skb: buffer from which the paged data comes from * @shiftlen: shift up to this many bytes * * Attempts to shift up to shiftlen worth of bytes, which may be less than * the length of the skb, from skb to tgt. Returns number bytes shifted. * It's up to caller to free skb if everything was shifted. * * If @tgt runs out of frags, the whole operation is aborted. * * Skb cannot include anything else but paged data while tgt is allowed * to have non-paged data as well. * * TODO: full sized shift could be optimized but that would need * specialized skb free'er to handle frags without up-to-date nr_frags. */ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) { int from, to, merge, todo; skb_frag_t *fragfrom, *fragto; BUG_ON(shiftlen > skb->len); if (skb_headlen(skb)) return 0; if (skb_zcopy(tgt) || skb_zcopy(skb)) return 0; DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle); DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb)); todo = shiftlen; from = 0; to = skb_shinfo(tgt)->nr_frags; fragfrom = &skb_shinfo(skb)->frags[from]; /* Actual merge is delayed until the point when we know we can * commit all, so that we don't have to undo partial changes */ if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), skb_frag_off(fragfrom))) { merge = -1; } else { merge = to - 1; todo -= skb_frag_size(fragfrom); if (todo < 0) { if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) return 0; /* All previous frag pointers might be stale! */ fragfrom = &skb_shinfo(skb)->frags[from]; fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, shiftlen); skb_frag_size_sub(fragfrom, shiftlen); skb_frag_off_add(fragfrom, shiftlen); goto onlymerged; } from++; } /* Skip full, not-fitting skb to avoid expensive operations */ if ((shiftlen == skb->len) && (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) return 0; if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) return 0; while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { if (to == MAX_SKB_FRAGS) return 0; fragfrom = &skb_shinfo(skb)->frags[from]; fragto = &skb_shinfo(tgt)->frags[to]; if (todo >= skb_frag_size(fragfrom)) { *fragto = *fragfrom; todo -= skb_frag_size(fragfrom); from++; to++; } else { __skb_frag_ref(fragfrom); skb_frag_page_copy(fragto, fragfrom); skb_frag_off_copy(fragto, fragfrom); skb_frag_size_set(fragto, todo); skb_frag_off_add(fragfrom, todo); skb_frag_size_sub(fragfrom, todo); todo = 0; to++; break; } } /* Ready to "commit" this state change to tgt */ skb_shinfo(tgt)->nr_frags = to; if (merge >= 0) { fragfrom = &skb_shinfo(skb)->frags[0]; fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, skb_frag_size(fragfrom)); __skb_frag_unref(fragfrom, skb->pp_recycle); } /* Reposition in the original skb */ to = 0; while (from < skb_shinfo(skb)->nr_frags) skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; skb_shinfo(skb)->nr_frags = to; BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); onlymerged: /* Most likely the tgt won't ever need its checksum anymore, skb on * the other hand might need it if it needs to be resent */ tgt->ip_summed = CHECKSUM_PARTIAL; skb->ip_summed = CHECKSUM_PARTIAL; skb_len_add(skb, -shiftlen); skb_len_add(tgt, shiftlen); return shiftlen; } /** * skb_prepare_seq_read - Prepare a sequential read of skb data * @skb: the buffer to read * @from: lower offset of data to be read * @to: upper offset of data to be read * @st: state variable * * Initializes the specified state variable. Must be called before * invoking skb_seq_read() for the first time. */ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, unsigned int to, struct skb_seq_state *st) { st->lower_offset = from; st->upper_offset = to; st->root_skb = st->cur_skb = skb; st->frag_idx = st->stepped_offset = 0; st->frag_data = NULL; st->frag_off = 0; } EXPORT_SYMBOL(skb_prepare_seq_read); /** * skb_seq_read - Sequentially read skb data * @consumed: number of bytes consumed by the caller so far * @data: destination pointer for data to be returned * @st: state variable * * Reads a block of skb data at @consumed relative to the * lower offset specified to skb_prepare_seq_read(). Assigns * the head of the data block to @data and returns the length * of the block or 0 if the end of the skb data or the upper * offset has been reached. * * The caller is not required to consume all of the data * returned, i.e. @consumed is typically set to the number * of bytes already consumed and the next call to * skb_seq_read() will return the remaining part of the block. * * Note 1: The size of each block of data returned can be arbitrary, * this limitation is the cost for zerocopy sequential * reads of potentially non linear data. * * Note 2: Fragment lists within fragments are not implemented * at the moment, state->root_skb could be replaced with * a stack for this purpose. */ unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st) { unsigned int block_limit, abs_offset = consumed + st->lower_offset; skb_frag_t *frag; if (unlikely(abs_offset >= st->upper_offset)) { if (st->frag_data) { kunmap_atomic(st->frag_data); st->frag_data = NULL; } return 0; } next_skb: block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; if (abs_offset < block_limit && !st->frag_data) { *data = st->cur_skb->data + (abs_offset - st->stepped_offset); return block_limit - abs_offset; } if (!skb_frags_readable(st->cur_skb)) return 0; if (st->frag_idx == 0 && !st->frag_data) st->stepped_offset += skb_headlen(st->cur_skb); while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { unsigned int pg_idx, pg_off, pg_sz; frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; pg_idx = 0; pg_off = skb_frag_off(frag); pg_sz = skb_frag_size(frag); if (skb_frag_must_loop(skb_frag_page(frag))) { pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; pg_off = offset_in_page(pg_off + st->frag_off); pg_sz = min_t(unsigned int, pg_sz - st->frag_off, PAGE_SIZE - pg_off); } block_limit = pg_sz + st->stepped_offset; if (abs_offset < block_limit) { if (!st->frag_data) st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); *data = (u8 *)st->frag_data + pg_off + (abs_offset - st->stepped_offset); return block_limit - abs_offset; } if (st->frag_data) { kunmap_atomic(st->frag_data); st->frag_data = NULL; } st->stepped_offset += pg_sz; st->frag_off += pg_sz; if (st->frag_off == skb_frag_size(frag)) { st->frag_off = 0; st->frag_idx++; } } if (st->frag_data) { kunmap_atomic(st->frag_data); st->frag_data = NULL; } if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { st->cur_skb = skb_shinfo(st->root_skb)->frag_list; st->frag_idx = 0; goto next_skb; } else if (st->cur_skb->next) { st->cur_skb = st->cur_skb->next; st->frag_idx = 0; goto next_skb; } return 0; } EXPORT_SYMBOL(skb_seq_read); /** * skb_abort_seq_read - Abort a sequential read of skb data * @st: state variable * * Must be called if skb_seq_read() was not called until it * returned 0. */ void skb_abort_seq_read(struct skb_seq_state *st) { if (st->frag_data) kunmap_atomic(st->frag_data); } EXPORT_SYMBOL(skb_abort_seq_read); /** * skb_copy_seq_read() - copy from a skb_seq_state to a buffer * @st: source skb_seq_state * @offset: offset in source * @to: destination buffer * @len: number of bytes to copy * * Copy @len bytes from @offset bytes into the source @st to the destination * buffer @to. `offset` should increase (or be unchanged) with each subsequent * call to this function. If offset needs to decrease from the previous use `st` * should be reset first. * * Return: 0 on success or -EINVAL if the copy ended early */ int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len) { const u8 *data; u32 sqlen; for (;;) { sqlen = skb_seq_read(offset, &data, st); if (sqlen == 0) return -EINVAL; if (sqlen >= len) { memcpy(to, data, len); return 0; } memcpy(to, data, sqlen); to += sqlen; offset += sqlen; len -= sqlen; } } EXPORT_SYMBOL(skb_copy_seq_read); #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, struct ts_config *conf, struct ts_state *state) { return skb_seq_read(offset, text, TS_SKB_CB(state)); } static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) { skb_abort_seq_read(TS_SKB_CB(state)); } /** * skb_find_text - Find a text pattern in skb data * @skb: the buffer to look in * @from: search offset * @to: search limit * @config: textsearch configuration * * Finds a pattern in the skb data according to the specified * textsearch configuration. Use textsearch_next() to retrieve * subsequent occurrences of the pattern. Returns the offset * to the first occurrence or UINT_MAX if no match was found. */ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config) { unsigned int patlen = config->ops->get_pattern_len(config); struct ts_state state; unsigned int ret; BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); config->get_next_block = skb_ts_get_next_block; config->finish = skb_ts_finish; skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); ret = textsearch_find(config, &state); return (ret + patlen <= to - from ? ret : UINT_MAX); } EXPORT_SYMBOL(skb_find_text); int skb_append_pagefrags(struct sk_buff *skb, struct page *page, int offset, size_t size, size_t max_frags) { int i = skb_shinfo(skb)->nr_frags; if (skb_can_coalesce(skb, i, page, offset)) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); } else if (i < max_frags) { skb_zcopy_downgrade_managed(skb); get_page(page); skb_fill_page_desc_noacc(skb, i, page, offset, size); } else { return -EMSGSIZE; } return 0; } EXPORT_SYMBOL_GPL(skb_append_pagefrags); /** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update * @len: length of data pulled * * This function performs an skb_pull on the packet and updates * the CHECKSUM_COMPLETE checksum. It should be used on * receive path processing instead of skb_pull unless you know * that the checksum difference is zero (e.g., a valid IP header) * or you are setting ip_summed to CHECKSUM_NONE. */ void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) { unsigned char *data = skb->data; BUG_ON(len > skb->len); __skb_pull(skb, len); skb_postpull_rcsum(skb, data, len); return skb->data; } EXPORT_SYMBOL_GPL(skb_pull_rcsum); static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) { skb_frag_t head_frag; struct page *page; page = virt_to_head_page(frag_skb->head); skb_frag_fill_page_desc(&head_frag, page, frag_skb->data - (unsigned char *)page_address(page), skb_headlen(frag_skb)); return head_frag; } struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, unsigned int offset) { struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; unsigned int tnl_hlen = skb_tnl_header_len(skb); unsigned int delta_truesize = 0; unsigned int delta_len = 0; struct sk_buff *tail = NULL; struct sk_buff *nskb, *tmp; int len_diff, err; skb_push(skb, -skb_network_offset(skb) + offset); /* Ensure the head is writeable before touching the shared info */ err = skb_unclone(skb, GFP_ATOMIC); if (err) goto err_linearize; skb_shinfo(skb)->frag_list = NULL; while (list_skb) { nskb = list_skb; list_skb = list_skb->next; err = 0; delta_truesize += nskb->truesize; if (skb_shared(nskb)) { tmp = skb_clone(nskb, GFP_ATOMIC); if (tmp) { consume_skb(nskb); nskb = tmp; err = skb_unclone(nskb, GFP_ATOMIC); } else { err = -ENOMEM; } } if (!tail) skb->next = nskb; else tail->next = nskb; if (unlikely(err)) { nskb->next = list_skb; goto err_linearize; } tail = nskb; delta_len += nskb->len; skb_push(nskb, -skb_network_offset(nskb) + offset); skb_release_head_state(nskb); len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); __copy_skb_header(nskb, skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); nskb->transport_header += len_diff; skb_copy_from_linear_data_offset(skb, -tnl_hlen, nskb->data - tnl_hlen, offset + tnl_hlen); if (skb_needs_linearize(nskb, features) && __skb_linearize(nskb)) goto err_linearize; } skb->truesize = skb->truesize - delta_truesize; skb->data_len = skb->data_len - delta_len; skb->len = skb->len - delta_len; skb_gso_reset(skb); skb->prev = tail; if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto err_linearize; skb_get(skb); return skb; err_linearize: kfree_skb_list(skb->next); skb->next = NULL; return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(skb_segment_list); /** * skb_segment - Perform protocol segmentation on skb. * @head_skb: buffer to segment * @features: features for the output path (see dev->features) * * This function performs segmentation on the given skb. It returns * a pointer to the first in a list of new skbs for the segments. * In case of error it returns ERR_PTR(err). */ struct sk_buff *skb_segment(struct sk_buff *head_skb, netdev_features_t features) { struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; unsigned int mss = skb_shinfo(head_skb)->gso_size; unsigned int doffset = head_skb->data - skb_mac_header(head_skb); unsigned int offset = doffset; unsigned int tnl_hlen = skb_tnl_header_len(head_skb); unsigned int partial_segs = 0; unsigned int headroom; unsigned int len = head_skb->len; struct sk_buff *frag_skb; skb_frag_t *frag; __be16 proto; bool csum, sg; int err = -ENOMEM; int i = 0; int nfrags, pos; if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { struct sk_buff *check_skb; for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { if (skb_headlen(check_skb) && !check_skb->head_frag) { /* gso_size is untrusted, and we have a frag_list with * a linear non head_frag item. * * If head_skb's headlen does not fit requested gso_size, * it means that the frag_list members do NOT terminate * on exact gso_size boundaries. Hence we cannot perform * skb_frag_t page sharing. Therefore we must fallback to * copying the frag_list skbs; we do so by disabling SG. */ features &= ~NETIF_F_SG; break; } } } __skb_push(head_skb, doffset); proto = skb_network_protocol(head_skb, NULL); if (unlikely(!proto)) return ERR_PTR(-EINVAL); sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); if (sg && csum && (mss != GSO_BY_FRAGS)) { if (!(features & NETIF_F_GSO_PARTIAL)) { struct sk_buff *iter; unsigned int frag_len; if (!list_skb || !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) goto normal; /* If we get here then all the required * GSO features except frag_list are supported. * Try to split the SKB to multiple GSO SKBs * with no frag_list. * Currently we can do that only when the buffers don't * have a linear part and all the buffers except * the last are of the same length. */ frag_len = list_skb->len; skb_walk_frags(head_skb, iter) { if (frag_len != iter->len && iter->next) goto normal; if (skb_headlen(iter) && !iter->head_frag) goto normal; len -= iter->len; } if (len != frag_len) goto normal; } /* GSO partial only requires that we trim off any excess that * doesn't fit into an MSS sized block, so take care of that * now. * Cap len to not accidentally hit GSO_BY_FRAGS. */ partial_segs = min(len, GSO_BY_FRAGS - 1) / mss; if (partial_segs > 1) mss *= partial_segs; else partial_segs = 0; } normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); if (skb_orphan_frags(head_skb, GFP_ATOMIC)) return ERR_PTR(-ENOMEM); nfrags = skb_shinfo(head_skb)->nr_frags; frag = skb_shinfo(head_skb)->frags; frag_skb = head_skb; do { struct sk_buff *nskb; skb_frag_t *nskb_frag; int hsize; int size; if (unlikely(mss == GSO_BY_FRAGS)) { len = list_skb->len; } else { len = head_skb->len - offset; if (len > mss) len = mss; } hsize = skb_headlen(head_skb) - offset; if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && (skb_headlen(list_skb) == len || sg)) { BUG_ON(skb_headlen(list_skb) > len); nskb = skb_clone(list_skb, GFP_ATOMIC); if (unlikely(!nskb)) goto err; i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; frag_skb = list_skb; pos += skb_headlen(list_skb); while (pos < offset + len) { BUG_ON(i >= nfrags); size = skb_frag_size(frag); if (pos + size > offset + len) break; i++; pos += size; frag++; } list_skb = list_skb->next; if (unlikely(pskb_trim(nskb, len))) { kfree_skb(nskb); goto err; } hsize = skb_end_offset(nskb); if (skb_cow_head(nskb, doffset + headroom)) { kfree_skb(nskb); goto err; } nskb->truesize += skb_end_offset(nskb) - hsize; skb_release_head_state(nskb); __skb_push(nskb, doffset); } else { if (hsize < 0) hsize = 0; if (hsize > len || !sg) hsize = len; nskb = __alloc_skb(hsize + doffset + headroom, GFP_ATOMIC, skb_alloc_rx_flag(head_skb), NUMA_NO_NODE); if (unlikely(!nskb)) goto err; skb_reserve(nskb, headroom); __skb_put(nskb, doffset); } if (segs) tail->next = nskb; else segs = nskb; tail = nskb; __copy_skb_header(nskb, head_skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); skb_reset_mac_len(nskb); skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, nskb->data - tnl_hlen, doffset + tnl_hlen); if (nskb->len == len + doffset) goto perform_csum_check; if (!sg) { if (!csum) { if (!nskb->remcsum_offload) nskb->ip_summed = CHECKSUM_NONE; SKB_GSO_CB(nskb)->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), len); SKB_GSO_CB(nskb)->csum_start = skb_headroom(nskb) + doffset; } else { if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) goto err; } continue; } nskb_frag = skb_shinfo(nskb)->frags; skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & SKBFL_SHARED_FRAG; if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) goto err; while (pos < offset + len) { if (i >= nfrags) { if (skb_orphan_frags(list_skb, GFP_ATOMIC) || skb_zerocopy_clone(nskb, list_skb, GFP_ATOMIC)) goto err; i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; frag_skb = list_skb; if (!skb_headlen(list_skb)) { BUG_ON(!nfrags); } else { BUG_ON(!list_skb->head_frag); /* to make room for head_frag. */ i--; frag--; } list_skb = list_skb->next; } if (unlikely(skb_shinfo(nskb)->nr_frags >= MAX_SKB_FRAGS)) { net_warn_ratelimited( "skb_segment: too many frags: %u %u\n", pos, mss); err = -EINVAL; goto err; } *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; __skb_frag_ref(nskb_frag); size = skb_frag_size(nskb_frag); if (pos < offset) { skb_frag_off_add(nskb_frag, offset - pos); skb_frag_size_sub(nskb_frag, offset - pos); } skb_shinfo(nskb)->nr_frags++; if (pos + size <= offset + len) { i++; frag++; pos += size; } else { skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); goto skip_fraglist; } nskb_frag++; } skip_fraglist: nskb->data_len = len - hsize; nskb->len += nskb->data_len; nskb->truesize += nskb->data_len; perform_csum_check: if (!csum) { if (skb_has_shared_frag(nskb) && __skb_linearize(nskb)) goto err; if (!nskb->remcsum_offload) nskb->ip_summed = CHECKSUM_NONE; SKB_GSO_CB(nskb)->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); SKB_GSO_CB(nskb)->csum_start = skb_headroom(nskb) + doffset; } } while ((offset += len) < head_skb->len); /* Some callers want to get the end of the list. * Put it in segs->prev to avoid walking the list. * (see validate_xmit_skb_list() for example) */ segs->prev = tail; if (partial_segs) { struct sk_buff *iter; int type = skb_shinfo(head_skb)->gso_type; unsigned short gso_size = skb_shinfo(head_skb)->gso_size; /* Update type to add partial and then remove dodgy if set */ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; type &= ~SKB_GSO_DODGY; /* Update GSO info and prepare to start updating headers on * our way back down the stack of protocols. */ for (iter = segs; iter; iter = iter->next) { skb_shinfo(iter)->gso_size = gso_size; skb_shinfo(iter)->gso_segs = partial_segs; skb_shinfo(iter)->gso_type = type; SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; } if (tail->len - doffset <= gso_size) skb_shinfo(tail)->gso_size = 0; else if (tail != segs) skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); } /* Following permits correct backpressure, for protocols * using skb_set_owner_w(). * Idea is to tranfert ownership from head_skb to last segment. */ if (head_skb->destructor == sock_wfree) { swap(tail->truesize, head_skb->truesize); swap(tail->destructor, head_skb->destructor); swap(tail->sk, head_skb->sk); } return segs; err: kfree_skb_list(segs); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(skb_segment); #ifdef CONFIG_SKB_EXTENSIONS #define SKB_EXT_ALIGN_VALUE 8 #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), #endif #ifdef CONFIG_XFRM [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), #endif #if IS_ENABLED(CONFIG_MPTCP) [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), #endif }; static __always_inline unsigned int skb_ext_total_length(void) { unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext); int i; for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++) l += skb_ext_type_len[i]; return l; } static void skb_extensions_init(void) { BUILD_BUG_ON(SKB_EXT_NUM >= 8); #if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL) BUILD_BUG_ON(skb_ext_total_length() > 255); #endif skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } #else static void skb_extensions_init(void) {} #endif /* The SKB kmem_cache slab is critical for network performance. Never * merge/alias the slab with similar sized objects. This avoids fragmentation * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. */ #ifndef CONFIG_SLUB_TINY #define FLAG_SKB_NO_MERGE SLAB_NO_MERGE #else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ #define FLAG_SKB_NO_MERGE 0 #endif void __init skb_init(void) { net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC| FLAG_SKB_NO_MERGE, offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. * struct skb_shared_info is located at the end of skb->head, * and should not be copied to/from user. */ net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", SKB_SMALL_HEAD_CACHE_SIZE, 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, 0, SKB_SMALL_HEAD_HEADROOM, NULL); skb_extensions_init(); } static int __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, unsigned int recursion_level) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int elt = 0; if (unlikely(recursion_level >= 24)) return -EMSGSIZE; if (copy > 0) { if (copy > len) copy = len; sg_set_buf(sg, skb->data + offset, copy); elt++; if ((len -= copy) == 0) return elt; offset += copy; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; WARN_ON(start > offset + len); end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; if (unlikely(elt && sg_is_last(&sg[elt - 1]))) return -EMSGSIZE; if (copy > len) copy = len; sg_set_page(&sg[elt], skb_frag_page(frag), copy, skb_frag_off(frag) + offset - start); elt++; if (!(len -= copy)) return elt; offset += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end, ret; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (unlikely(elt && sg_is_last(&sg[elt - 1]))) return -EMSGSIZE; if (copy > len) copy = len; ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, copy, recursion_level + 1); if (unlikely(ret < 0)) return ret; elt += ret; if ((len -= copy) == 0) return elt; offset += copy; } start = end; } BUG_ON(len); return elt; } /** * skb_to_sgvec - Fill a scatter-gather list from a socket buffer * @skb: Socket buffer containing the buffers to be mapped * @sg: The scatter-gather list to map into * @offset: The offset into the buffer's contents to start mapping * @len: Length of buffer space to be mapped * * Fill the specified scatter-gather list with mappings/pointers into a * region of the buffer space attached to a socket buffer. Returns either * the number of scatterlist items used, or -EMSGSIZE if the contents * could not fit. */ int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); if (nsg <= 0) return nsg; sg_mark_end(&sg[nsg - 1]); return nsg; } EXPORT_SYMBOL_GPL(skb_to_sgvec); /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given * sglist without mark the sg which contain last skb data as the end. * So the caller can mannipulate sg list as will when padding new data after * the first call without calling sg_unmark_end to expend sg list. * * Scenario to use skb_to_sgvec_nomark: * 1. sg_init_table * 2. skb_to_sgvec_nomark(payload1) * 3. skb_to_sgvec_nomark(payload2) * * This is equivalent to: * 1. sg_init_table * 2. skb_to_sgvec(payload1) * 3. sg_unmark_end * 4. skb_to_sgvec(payload2) * * When mapping multiple payload conditionally, skb_to_sgvec_nomark * is more preferable. */ int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { return __skb_to_sgvec(skb, sg, offset, len, 0); } EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); /** * skb_cow_data - Check that a socket buffer's data buffers are writable * @skb: The socket buffer to check. * @tailbits: Amount of trailing space to be added * @trailer: Returned pointer to the skb where the @tailbits space begins * * Make sure that the data buffers attached to a socket buffer are * writable. If they are not, private copies are made of the data buffers * and the socket buffer is set to use these instead. * * If @tailbits is given, make sure that there is space to write @tailbits * bytes of data beyond current end of socket buffer. @trailer will be * set to point to the skb in which this space begins. * * The number of scatterlist elements required to completely map the * COW'd and extended socket buffer will be returned. */ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) { int copyflag; int elt; struct sk_buff *skb1, **skb_p; /* If skb is cloned or its head is paged, reallocate * head pulling out all the pages (pages are considered not writable * at the moment even if they are anonymous). */ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && !__pskb_pull_tail(skb, __skb_pagelen(skb))) return -ENOMEM; /* Easy case. Most of packets will go this way. */ if (!skb_has_frag_list(skb)) { /* A little of trouble, not enough of space for trailer. * This should not happen, when stack is tuned to generate * good frames. OK, on miss we reallocate and reserve even more * space, 128 bytes is fair. */ if (skb_tailroom(skb) < tailbits && pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) return -ENOMEM; /* Voila! */ *trailer = skb; return 1; } /* Misery. We are in troubles, going to mincer fragments... */ elt = 1; skb_p = &skb_shinfo(skb)->frag_list; copyflag = 0; while ((skb1 = *skb_p) != NULL) { int ntail = 0; /* The fragment is partially pulled by someone, * this can happen on input. Copy it and everything * after it. */ if (skb_shared(skb1)) copyflag = 1; /* If the skb is the last, worry about trailer. */ if (skb1->next == NULL && tailbits) { if (skb_shinfo(skb1)->nr_frags || skb_has_frag_list(skb1) || skb_tailroom(skb1) < tailbits) ntail = tailbits + 128; } if (copyflag || skb_cloned(skb1) || ntail || skb_shinfo(skb1)->nr_frags || skb_has_frag_list(skb1)) { struct sk_buff *skb2; /* Fuck, we are miserable poor guys... */ if (ntail == 0) skb2 = skb_copy(skb1, GFP_ATOMIC); else skb2 = skb_copy_expand(skb1, skb_headroom(skb1), ntail, GFP_ATOMIC); if (unlikely(skb2 == NULL)) return -ENOMEM; if (skb1->sk) skb_set_owner_w(skb2, skb1->sk); /* Looking around. Are we still alive? * OK, link new skb, drop old one */ skb2->next = skb1->next; *skb_p = skb2; kfree_skb(skb1); skb1 = skb2; } elt++; *trailer = skb1; skb_p = &skb1->next; } return elt; } EXPORT_SYMBOL_GPL(skb_cow_data); static void sock_rmem_free(struct sk_buff *skb) { struct sock *sk = skb->sk; atomic_sub(skb->truesize, &sk->sk_rmem_alloc); } static void skb_set_err_queue(struct sk_buff *skb) { /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. * So, it is safe to (mis)use it to mark skbs on the error queue. */ skb->pkt_type = PACKET_OUTGOING; BUILD_BUG_ON(PACKET_OUTGOING == 0); } /* * Note: We dont mem charge error packets (no sk_forward_alloc changes) */ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)READ_ONCE(sk->sk_rcvbuf)) return -ENOMEM; skb_orphan(skb); skb->sk = sk; skb->destructor = sock_rmem_free; atomic_add(skb->truesize, &sk->sk_rmem_alloc); skb_set_err_queue(skb); /* before exiting rcu section, make sure dst is refcounted */ skb_dst_force(skb); skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 0; } EXPORT_SYMBOL(sock_queue_err_skb); static bool is_icmp_err_skb(const struct sk_buff *skb) { return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); } struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { struct sk_buff_head *q = &sk->sk_error_queue; struct sk_buff *skb, *skb_next = NULL; bool icmp_next = false; unsigned long flags; if (skb_queue_empty_lockless(q)) return NULL; spin_lock_irqsave(&q->lock, flags); skb = __skb_dequeue(q); if (skb && (skb_next = skb_peek(q))) { icmp_next = is_icmp_err_skb(skb_next); if (icmp_next) sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; } spin_unlock_irqrestore(&q->lock, flags); if (is_icmp_err_skb(skb) && !icmp_next) sk->sk_err = 0; if (skb_next) sk_error_report(sk); return skb; } EXPORT_SYMBOL(sock_dequeue_err_skb); /** * skb_clone_sk - create clone of skb, and take reference to socket * @skb: the skb to clone * * This function creates a clone of a buffer that holds a reference on * sk_refcnt. Buffers created via this function are meant to be * returned using sock_queue_err_skb, or free via kfree_skb. * * When passing buffers allocated with this function to sock_queue_err_skb * it is necessary to wrap the call with sock_hold/sock_put in order to * prevent the socket from being released prior to being enqueued on * the sk_error_queue. */ struct sk_buff *skb_clone_sk(struct sk_buff *skb) { struct sock *sk = skb->sk; struct sk_buff *clone; if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; clone = skb_clone(skb, GFP_ATOMIC); if (!clone) { sock_put(sk); return NULL; } clone->sk = sk; clone->destructor = sock_efree; return clone; } EXPORT_SYMBOL(skb_clone_sk); static void __skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk, int tstype, bool opt_stats) { struct sock_exterr_skb *serr; int err; BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = tstype; serr->opt_stats = opt_stats; serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) serr->ee.ee_data -= atomic_read(&sk->sk_tskey); } err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); } static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { bool ret; if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data))) return true; read_lock_bh(&sk->sk_callback_lock); ret = sk->sk_socket && sk->sk_socket->file && file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); read_unlock_bh(&sk->sk_callback_lock); return ret; } void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps) { struct sock *sk = skb->sk; if (!skb_may_tx_timestamp(sk, false)) goto err; /* Take a reference to prevent skb_orphan() from freeing the socket, * but only if the socket refcount is not zero. */ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { *skb_hwtstamps(skb) = *hwtstamps; __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); sock_put(sk); return; } err: kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype) { struct sk_buff *skb; bool tsonly, opt_stats = false; u32 tsflags; if (!sk) return; tsflags = READ_ONCE(sk->sk_tsflags); if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) return; tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) { #ifdef CONFIG_INET if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk_is_tcp(sk)) { skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ack_skb); opt_stats = true; } else #endif skb = alloc_skb(0, GFP_ATOMIC); } else { skb = skb_clone(orig_skb, GFP_ATOMIC); if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { kfree_skb(skb); return; } } if (!skb) return; if (tsonly) { skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & SKBTX_ANY_TSTAMP; skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; } if (hwtstamps) *skb_hwtstamps(skb) = *hwtstamps; else __net_timestamp(skb); __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); } EXPORT_SYMBOL_GPL(__skb_tstamp_tx); void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps) { return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, SCM_TSTAMP_SND); } EXPORT_SYMBOL_GPL(skb_tstamp_tx); #ifdef CONFIG_WIRELESS void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) { struct sock *sk = skb->sk; struct sock_exterr_skb *serr; int err = 1; skb->wifi_acked_valid = 1; skb->wifi_acked = acked; serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; /* Take a reference to prevent skb_orphan() from freeing the socket, * but only if the socket refcount is not zero. */ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { err = sock_queue_err_skb(sk, skb); sock_put(sk); } if (err) kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); #endif /* CONFIG_WIRELESS */ /** * skb_partial_csum_set - set up and verify partial csum values for packet * @skb: the skb to set * @start: the number of bytes after skb->data to start checksumming. * @off: the offset from start to place the checksum. * * For untrusted partially-checksummed packets, we need to make sure the values * for skb->csum_start and skb->csum_offset are valid so we don't oops. * * This function checks and sets those values and skb->ip_summed: if this * returns false you should drop the packet. */ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) { u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); u32 csum_start = skb_headroom(skb) + (u32)start; if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) { net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", start, off, skb_headroom(skb), skb_headlen(skb)); return false; } skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = csum_start; skb->csum_offset = off; skb->transport_header = csum_start; return true; } EXPORT_SYMBOL_GPL(skb_partial_csum_set); static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, unsigned int max) { if (skb_headlen(skb) >= len) return 0; /* If we need to pullup then pullup to the max, so we * won't need to do it again. */ if (max > skb->len) max = skb->len; if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) return -ENOMEM; if (skb_headlen(skb) < len) return -EPROTO; return 0; } #define MAX_TCP_HDR_LEN (15 * 4) static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, typeof(IPPROTO_IP) proto, unsigned int off) { int err; switch (proto) { case IPPROTO_TCP: err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), off + MAX_TCP_HDR_LEN); if (!err && !skb_partial_csum_set(skb, off, offsetof(struct tcphdr, check))) err = -EPROTO; return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; case IPPROTO_UDP: err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), off + sizeof(struct udphdr)); if (!err && !skb_partial_csum_set(skb, off, offsetof(struct udphdr, check))) err = -EPROTO; return err ? ERR_PTR(err) : &udp_hdr(skb)->check; } return ERR_PTR(-EPROTO); } /* This value should be large enough to cover a tagged ethernet header plus * maximally sized IP and TCP or UDP headers. */ #define MAX_IP_HDR_LEN 128 static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) { unsigned int off; bool fragment; __sum16 *csum; int err; fragment = false; err = skb_maybe_pull_tail(skb, sizeof(struct iphdr), MAX_IP_HDR_LEN); if (err < 0) goto out; if (ip_is_fragment(ip_hdr(skb))) fragment = true; off = ip_hdrlen(skb); err = -EPROTO; if (fragment) goto out; csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); if (IS_ERR(csum)) return PTR_ERR(csum); if (recalculate) *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - off, ip_hdr(skb)->protocol, 0); err = 0; out: return err; } /* This value should be large enough to cover a tagged ethernet header plus * an IPv6 header, all options, and a maximal TCP or UDP header. */ #define MAX_IPV6_HDR_LEN 256 #define OPT_HDR(type, skb, off) \ (type *)(skb_network_header(skb) + (off)) static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) { int err; u8 nexthdr; unsigned int off; unsigned int len; bool fragment; bool done; __sum16 *csum; fragment = false; done = false; off = sizeof(struct ipv6hdr); err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); if (err < 0) goto out; nexthdr = ipv6_hdr(skb)->nexthdr; len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); while (off <= len && !done) { switch (nexthdr) { case IPPROTO_DSTOPTS: case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: { struct ipv6_opt_hdr *hp; err = skb_maybe_pull_tail(skb, off + sizeof(struct ipv6_opt_hdr), MAX_IPV6_HDR_LEN); if (err < 0) goto out; hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); nexthdr = hp->nexthdr; off += ipv6_optlen(hp); break; } case IPPROTO_AH: { struct ip_auth_hdr *hp; err = skb_maybe_pull_tail(skb, off + sizeof(struct ip_auth_hdr), MAX_IPV6_HDR_LEN); if (err < 0) goto out; hp = OPT_HDR(struct ip_auth_hdr, skb, off); nexthdr = hp->nexthdr; off += ipv6_authlen(hp); break; } case IPPROTO_FRAGMENT: { struct frag_hdr *hp; err = skb_maybe_pull_tail(skb, off + sizeof(struct frag_hdr), MAX_IPV6_HDR_LEN); if (err < 0) goto out; hp = OPT_HDR(struct frag_hdr, skb, off); if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) fragment = true; nexthdr = hp->nexthdr; off += sizeof(struct frag_hdr); break; } default: done = true; break; } } err = -EPROTO; if (!done || fragment) goto out; csum = skb_checksum_setup_ip(skb, nexthdr, off); if (IS_ERR(csum)) return PTR_ERR(csum); if (recalculate) *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - off, nexthdr, 0); err = 0; out: return err; } /** * skb_checksum_setup - set up partial checksum offset * @skb: the skb to set up * @recalculate: if true the pseudo-header checksum will be recalculated */ int skb_checksum_setup(struct sk_buff *skb, bool recalculate) { int err; switch (skb->protocol) { case htons(ETH_P_IP): err = skb_checksum_setup_ipv4(skb, recalculate); break; case htons(ETH_P_IPV6): err = skb_checksum_setup_ipv6(skb, recalculate); break; default: err = -EPROTO; break; } return err; } EXPORT_SYMBOL(skb_checksum_setup); /** * skb_checksum_maybe_trim - maybe trims the given skb * @skb: the skb to check * @transport_len: the data length beyond the network header * * Checks whether the given skb has data beyond the given transport length. * If so, returns a cloned skb trimmed to this transport length. * Otherwise returns the provided skb. Returns NULL in error cases * (e.g. transport_len exceeds skb length or out-of-memory). * * Caller needs to set the skb transport header and free any returned skb if it * differs from the provided skb. */ static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, unsigned int transport_len) { struct sk_buff *skb_chk; unsigned int len = skb_transport_offset(skb) + transport_len; int ret; if (skb->len < len) return NULL; else if (skb->len == len) return skb; skb_chk = skb_clone(skb, GFP_ATOMIC); if (!skb_chk) return NULL; ret = pskb_trim_rcsum(skb_chk, len); if (ret) { kfree_skb(skb_chk); return NULL; } return skb_chk; } /** * skb_checksum_trimmed - validate checksum of an skb * @skb: the skb to check * @transport_len: the data length beyond the network header * @skb_chkf: checksum function to use * * Applies the given checksum function skb_chkf to the provided skb. * Returns a checked and maybe trimmed skb. Returns NULL on error. * * If the skb has data beyond the given transport length, then a * trimmed & cloned skb is checked and returned. * * Caller needs to set the skb transport header and free any returned skb if it * differs from the provided skb. */ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, __sum16(*skb_chkf)(struct sk_buff *skb)) { struct sk_buff *skb_chk; unsigned int offset = skb_transport_offset(skb); __sum16 ret; skb_chk = skb_checksum_maybe_trim(skb, transport_len); if (!skb_chk) goto err; if (!pskb_may_pull(skb_chk, offset)) goto err; skb_pull_rcsum(skb_chk, offset); ret = skb_chkf(skb_chk); skb_push_rcsum(skb_chk, offset); if (ret) goto err; return skb_chk; err: if (skb_chk && skb_chk != skb) kfree_skb(skb_chk); return NULL; } EXPORT_SYMBOL(skb_checksum_trimmed); void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", skb->dev->name); } EXPORT_SYMBOL(__skb_warn_lro_forwarding); void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) { if (head_stolen) { skb_release_head_state(skb); kmem_cache_free(net_hotdata.skbuff_cache, skb); } else { __kfree_skb(skb); } } EXPORT_SYMBOL(kfree_skb_partial); /** * skb_try_coalesce - try to merge skb to prior one * @to: prior buffer * @from: buffer to add * @fragstolen: pointer to boolean * @delta_truesize: how much more was allocated than was requested */ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize) { struct skb_shared_info *to_shinfo, *from_shinfo; int i, delta, len = from->len; *fragstolen = false; if (skb_cloned(to)) return false; /* In general, avoid mixing page_pool and non-page_pool allocated * pages within the same SKB. In theory we could take full * references if @from is cloned and !@to->pp_recycle but its * tricky (due to potential race with the clone disappearing) and * rare, so not worth dealing with. */ if (to->pp_recycle != from->pp_recycle) return false; if (skb_frags_readable(from) != skb_frags_readable(to)) return false; if (len <= skb_tailroom(to) && skb_frags_readable(from)) { if (len) BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); *delta_truesize = 0; return true; } to_shinfo = skb_shinfo(to); from_shinfo = skb_shinfo(from); if (to_shinfo->frag_list || from_shinfo->frag_list) return false; if (skb_zcopy(to) || skb_zcopy(from)) return false; if (skb_headlen(from) != 0) { struct page *page; unsigned int offset; if (to_shinfo->nr_frags + from_shinfo->nr_frags >= MAX_SKB_FRAGS) return false; if (skb_head_is_locked(from)) return false; delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); skb_fill_page_desc(to, to_shinfo->nr_frags, page, offset, skb_headlen(from)); *fragstolen = true; } else { if (to_shinfo->nr_frags + from_shinfo->nr_frags > MAX_SKB_FRAGS) return false; delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); } WARN_ON_ONCE(delta < len); memcpy(to_shinfo->frags + to_shinfo->nr_frags, from_shinfo->frags, from_shinfo->nr_frags * sizeof(skb_frag_t)); to_shinfo->nr_frags += from_shinfo->nr_frags; if (!skb_cloned(from)) from_shinfo->nr_frags = 0; /* if the skb is not cloned this does nothing * since we set nr_frags to 0. */ if (skb_pp_frag_ref(from)) { for (i = 0; i < from_shinfo->nr_frags; i++) __skb_frag_ref(&from_shinfo->frags[i]); } to->truesize += delta; to->len += len; to->data_len += len; *delta_truesize = delta; return true; } EXPORT_SYMBOL(skb_try_coalesce); /** * skb_scrub_packet - scrub an skb * * @skb: buffer to clean * @xnet: packet is crossing netns * * skb_scrub_packet can be used after encapsulating or decapsulating a packet * into/from a tunnel. Some information have to be cleared during these * operations. * skb_scrub_packet can also be used to clean a skb before injecting it in * another namespace (@xnet == true). We have to clear all information in the * skb that could impact namespace isolation. */ void skb_scrub_packet(struct sk_buff *skb, bool xnet) { skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); skb_ext_reset(skb); nf_reset_ct(skb); nf_reset_trace(skb); #ifdef CONFIG_NET_SWITCHDEV skb->offload_fwd_mark = 0; skb->offload_l3_fwd_mark = 0; #endif if (!xnet) return; ipvs_reset(skb); skb->mark = 0; skb_clear_tstamp(skb); } EXPORT_SYMBOL_GPL(skb_scrub_packet); static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { int mac_len, meta_len; void *meta; if (skb_cow(skb, skb_headroom(skb)) < 0) { kfree_skb(skb); return NULL; } mac_len = skb->data - skb_mac_header(skb); if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), mac_len - VLAN_HLEN - ETH_TLEN); } meta_len = skb_metadata_len(skb); if (meta_len) { meta = skb_metadata_end(skb) - meta_len; memmove(meta + VLAN_HLEN, meta, meta_len); } skb->mac_header += VLAN_HLEN; return skb; } struct sk_buff *skb_vlan_untag(struct sk_buff *skb) { struct vlan_hdr *vhdr; u16 vlan_tci; if (unlikely(skb_vlan_tag_present(skb))) { /* vlan_tci is already set-up so leave this for another time */ return skb; } skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto err_free; /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) goto err_free; vhdr = (struct vlan_hdr *)skb->data; vlan_tci = ntohs(vhdr->h_vlan_TCI); __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); skb_pull_rcsum(skb, VLAN_HLEN); vlan_set_encap_proto(skb, vhdr); skb = skb_reorder_vlan_header(skb); if (unlikely(!skb)) goto err_free; skb_reset_network_header(skb); if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); skb_reset_mac_len(skb); return skb; err_free: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(skb_vlan_untag); int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) { if (!pskb_may_pull(skb, write_len)) return -ENOMEM; if (!skb_frags_readable(skb)) return -EFAULT; if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) return 0; return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } EXPORT_SYMBOL(skb_ensure_writable); int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev) { int needed_headroom = dev->needed_headroom; int needed_tailroom = dev->needed_tailroom; /* For tail taggers, we need to pad short frames ourselves, to ensure * that the tail tag does not fail at its role of being at the end of * the packet, once the conduit interface pads the frame. Account for * that pad length here, and pad later. */ if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) needed_tailroom += ETH_ZLEN - skb->len; /* skb_headroom() returns unsigned int... */ needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) /* No reallocation needed, yay! */ return 0; return pskb_expand_head(skb, needed_headroom, needed_tailroom, GFP_ATOMIC); } EXPORT_SYMBOL(skb_ensure_writable_head_tail); /* remove VLAN header from packet and update csum accordingly. * expects a non skb_vlan_tag_present skb with a vlan tag payload */ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { int offset = skb->data - skb_mac_header(skb); int err; if (WARN_ONCE(offset, "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", offset)) { return -EINVAL; } err = skb_ensure_writable(skb, VLAN_ETH_HLEN); if (unlikely(err)) return err; skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); vlan_remove_tag(skb, vlan_tci); skb->mac_header += VLAN_HLEN; if (skb_network_offset(skb) < ETH_HLEN) skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_len(skb); return err; } EXPORT_SYMBOL(__skb_vlan_pop); /* Pop a vlan tag either from hwaccel or from payload. * Expects skb->data at mac header. */ int skb_vlan_pop(struct sk_buff *skb) { u16 vlan_tci; __be16 vlan_proto; int err; if (likely(skb_vlan_tag_present(skb))) { __vlan_hwaccel_clear_tag(skb); } else { if (unlikely(!eth_type_vlan(skb->protocol))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); if (err) return err; } /* move next vlan tag to hw accel tag */ if (likely(!eth_type_vlan(skb->protocol))) return 0; vlan_proto = skb->protocol; err = __skb_vlan_pop(skb, &vlan_tci); if (unlikely(err)) return err; __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; } EXPORT_SYMBOL(skb_vlan_pop); /* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). * Expects skb->data at mac header. */ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { if (skb_vlan_tag_present(skb)) { int offset = skb->data - skb_mac_header(skb); int err; if (WARN_ONCE(offset, "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", offset)) { return -EINVAL; } err = __vlan_insert_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); if (err) return err; skb->protocol = skb->vlan_proto; skb->network_header -= VLAN_HLEN; skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; } EXPORT_SYMBOL(skb_vlan_push); /** * skb_eth_pop() - Drop the Ethernet header at the head of a packet * * @skb: Socket buffer to modify * * Drop the Ethernet header of @skb. * * Expects that skb->data points to the mac header and that no VLAN tags are * present. * * Returns 0 on success, -errno otherwise. */ int skb_eth_pop(struct sk_buff *skb) { if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || skb_network_offset(skb) < ETH_HLEN) return -EPROTO; skb_pull_rcsum(skb, ETH_HLEN); skb_reset_mac_header(skb); skb_reset_mac_len(skb); return 0; } EXPORT_SYMBOL(skb_eth_pop); /** * skb_eth_push() - Add a new Ethernet header at the head of a packet * * @skb: Socket buffer to modify * @dst: Destination MAC address of the new header * @src: Source MAC address of the new header * * Prepend @skb with a new Ethernet header. * * Expects that skb->data points to the mac header, which must be empty. * * Returns 0 on success, -errno otherwise. */ int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, const unsigned char *src) { struct ethhdr *eth; int err; if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) return -EPROTO; err = skb_cow_head(skb, sizeof(*eth)); if (err < 0) return err; skb_push(skb, sizeof(*eth)); skb_reset_mac_header(skb); skb_reset_mac_len(skb); eth = eth_hdr(skb); ether_addr_copy(eth->h_dest, dst); ether_addr_copy(eth->h_source, src); eth->h_proto = skb->protocol; skb_postpush_rcsum(skb, eth, sizeof(*eth)); return 0; } EXPORT_SYMBOL(skb_eth_push); /* Update the ethertype of hdr and the skb csum value if required. */ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, __be16 ethertype) { if (skb->ip_summed == CHECKSUM_COMPLETE) { __be16 diff[] = { ~hdr->h_proto, ethertype }; skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); } hdr->h_proto = ethertype; } /** * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of * the packet * * @skb: buffer * @mpls_lse: MPLS label stack entry to push * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) * @mac_len: length of the MAC header * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is * ethernet * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet) { struct mpls_shim_hdr *lse; int err; if (unlikely(!eth_p_mpls(mpls_proto))) return -EINVAL; /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ if (skb->encapsulation) return -EINVAL; err = skb_cow_head(skb, MPLS_HLEN); if (unlikely(err)) return err; if (!skb->inner_protocol) { skb_set_inner_network_header(skb, skb_network_offset(skb)); skb_set_inner_protocol(skb, skb->protocol); } skb_push(skb, MPLS_HLEN); memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), mac_len); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_reset_mac_len(skb); lse = mpls_hdr(skb); lse->label_stack_entry = mpls_lse; skb_postpush_rcsum(skb, lse, MPLS_HLEN); if (ethernet && mac_len >= ETH_HLEN) skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); skb->protocol = mpls_proto; return 0; } EXPORT_SYMBOL_GPL(skb_mpls_push); /** * skb_mpls_pop() - pop the outermost MPLS header * * @skb: buffer * @next_proto: ethertype of header after popped MPLS header * @mac_len: length of the MAC header * @ethernet: flag to indicate if the packet is ethernet * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, bool ethernet) { int err; if (unlikely(!eth_p_mpls(skb->protocol))) return 0; err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); if (unlikely(err)) return err; skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), mac_len); __skb_pull(skb, MPLS_HLEN); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); if (ethernet && mac_len >= ETH_HLEN) { struct ethhdr *hdr; /* use mpls_hdr() to get ethertype to account for VLANs. */ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); skb_mod_eth_type(skb, hdr, next_proto); } skb->protocol = next_proto; return 0; } EXPORT_SYMBOL_GPL(skb_mpls_pop); /** * skb_mpls_update_lse() - modify outermost MPLS header and update csum * * @skb: buffer * @mpls_lse: new MPLS label stack entry to update to * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) { int err; if (unlikely(!eth_p_mpls(skb->protocol))) return -EINVAL; err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); if (unlikely(err)) return err; if (skb->ip_summed == CHECKSUM_COMPLETE) { __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); } mpls_hdr(skb)->label_stack_entry = mpls_lse; return 0; } EXPORT_SYMBOL_GPL(skb_mpls_update_lse); /** * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header * * @skb: buffer * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ int skb_mpls_dec_ttl(struct sk_buff *skb) { u32 lse; u8 ttl; if (unlikely(!eth_p_mpls(skb->protocol))) return -EINVAL; if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) return -ENOMEM; lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; if (!--ttl) return -EINVAL; lse &= ~MPLS_LS_TTL_MASK; lse |= ttl << MPLS_LS_TTL_SHIFT; return skb_mpls_update_lse(skb, cpu_to_be32(lse)); } EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); /** * alloc_skb_with_frags - allocate skb with page frags * * @header_len: size of linear part * @data_len: needed length in frags * @order: max page order desired. * @errcode: pointer to error code if any * @gfp_mask: allocation mask * * This can be used to allocate a paged skb, given a maximal order for frags. */ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, int order, int *errcode, gfp_t gfp_mask) { unsigned long chunk; struct sk_buff *skb; struct page *page; int nr_frags = 0; *errcode = -EMSGSIZE; if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) return NULL; *errcode = -ENOBUFS; skb = alloc_skb(header_len, gfp_mask); if (!skb) return NULL; while (data_len) { if (nr_frags == MAX_SKB_FRAGS - 1) goto failure; while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; if (order) { page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN, order); if (!page) { order--; continue; } } else { page = alloc_page(gfp_mask); if (!page) goto failure; } chunk = min_t(unsigned long, data_len, PAGE_SIZE << order); skb_fill_page_desc(skb, nr_frags, page, 0, chunk); nr_frags++; skb->truesize += (PAGE_SIZE << order); data_len -= chunk; } return skb; failure: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(alloc_skb_with_frags); /* carve out the first off bytes from skb when off < headlen */ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, const int headlen, gfp_t gfp_mask) { int i; unsigned int size = skb_end_offset(skb); int new_hlen = headlen - off; u8 *data; if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; size = SKB_WITH_OVERHEAD(size); /* Copy real data, and all frags */ skb_copy_from_linear_data_offset(skb, off, data, new_hlen); skb->len -= off; memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); if (skb_cloned(skb)) { /* drop the old head gracefully */ if (skb_orphan_frags(skb, gfp_mask)) { skb_kfree_head(data, size); return -ENOMEM; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); skb_release_data(skb, SKB_CONSUMED); } else { /* we can reuse existing recount- all we did was * relocate values */ skb_free_head(skb); } skb->head = data; skb->data = data; skb->head_frag = 0; skb_set_end_offset(skb, size); skb_set_tail_pointer(skb, skb_headlen(skb)); skb_headers_offset_update(skb, 0); skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); return 0; } static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); /* carve out the first eat bytes from skb's frag_list. May recurse into * pskb_carve() */ static int pskb_carve_frag_list(struct sk_buff *skb, struct skb_shared_info *shinfo, int eat, gfp_t gfp_mask) { struct sk_buff *list = shinfo->frag_list; struct sk_buff *clone = NULL; struct sk_buff *insp = NULL; do { if (!list) { pr_err("Not enough bytes to eat. Want %d\n", eat); return -EFAULT; } if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; list = list->next; insp = list; } else { /* Eaten partially. */ if (skb_shared(list)) { clone = skb_clone(list, gfp_mask); if (!clone) return -ENOMEM; insp = list->next; list = clone; } else { /* This may be pulled without problems. */ insp = list; } if (pskb_carve(list, eat, gfp_mask) < 0) { kfree_skb(clone); return -ENOMEM; } break; } } while (eat); /* Free pulled out fragments. */ while ((list = shinfo->frag_list) != insp) { shinfo->frag_list = list->next; consume_skb(list); } /* And insert new clone at head. */ if (clone) { clone->next = list; shinfo->frag_list = clone; } return 0; } /* carve off first len bytes from skb. Split line (off) is in the * non-linear part of skb */ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, int pos, gfp_t gfp_mask) { int i, k = 0; unsigned int size = skb_end_offset(skb); u8 *data; const int nfrags = skb_shinfo(skb)->nr_frags; struct skb_shared_info *shinfo; if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; size = SKB_WITH_OVERHEAD(size); memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); if (skb_orphan_frags(skb, gfp_mask)) { skb_kfree_head(data, size); return -ENOMEM; } shinfo = (struct skb_shared_info *)(data + size); for (i = 0; i < nfrags; i++) { int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (pos + fsize > off) { shinfo->frags[k] = skb_shinfo(skb)->frags[i]; if (pos < off) { /* Split frag. * We have two variants in this case: * 1. Move all the frag to the second * part, if it is possible. F.e. * this approach is mandatory for TUX, * where splitting is expensive. * 2. Split is accurately. We make this. */ skb_frag_off_add(&shinfo->frags[0], off - pos); skb_frag_size_sub(&shinfo->frags[0], off - pos); } skb_frag_ref(skb, i); k++; } pos += fsize; } shinfo->nr_frags = k; if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); /* split line is in frag list */ if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ if (skb_has_frag_list(skb)) kfree_skb_list(skb_shinfo(skb)->frag_list); skb_kfree_head(data, size); return -ENOMEM; } skb_release_data(skb, SKB_CONSUMED); skb->head = data; skb->head_frag = 0; skb->data = data; skb_set_end_offset(skb, size); skb_reset_tail_pointer(skb); skb_headers_offset_update(skb, 0); skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; skb->len -= off; skb->data_len = skb->len; atomic_set(&skb_shinfo(skb)->dataref, 1); return 0; } /* remove len bytes from the beginning of the skb */ static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) { int headlen = skb_headlen(skb); if (len < headlen) return pskb_carve_inside_header(skb, len, headlen, gfp); else return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); } /* Extract to_copy bytes starting at off from skb, and return this in * a new skb */ struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, gfp_t gfp) { struct sk_buff *clone = skb_clone(skb, gfp); if (!clone) return NULL; if (pskb_carve(clone, off, gfp) < 0 || pskb_trim(clone, to_copy)) { kfree_skb(clone); return NULL; } return clone; } EXPORT_SYMBOL(pskb_extract); /** * skb_condense - try to get rid of fragments/frag_list if possible * @skb: buffer * * Can be used to save memory before skb is added to a busy queue. * If packet has bytes in frags and enough tail room in skb->head, * pull all of them, so that we can free the frags right now and adjust * truesize. * Notes: * We do not reallocate skb->head thus can not fail. * Caller must re-evaluate skb->truesize if needed. */ void skb_condense(struct sk_buff *skb) { if (skb->data_len) { if (skb->data_len > skb->end - skb->tail || skb_cloned(skb) || !skb_frags_readable(skb)) return; /* Nice, we can free page frag(s) right now */ __pskb_pull_tail(skb, skb->data_len); } /* At this point, skb->truesize might be over estimated, * because skb had a fragment, and fragments do not tell * their truesize. * When we pulled its content into skb->head, fragment * was freed, but __pskb_pull_tail() could not possibly * adjust skb->truesize, not knowing the frag truesize. */ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); } EXPORT_SYMBOL(skb_condense); #ifdef CONFIG_SKB_EXTENSIONS static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) { return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); } /** * __skb_ext_alloc - allocate a new skb extensions storage * * @flags: See kmalloc(). * * Returns the newly allocated pointer. The pointer can later attached to a * skb via __skb_ext_set(). * Note: caller must handle the skb_ext as an opaque data. */ struct skb_ext *__skb_ext_alloc(gfp_t flags) { struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); if (new) { memset(new->offset, 0, sizeof(new->offset)); refcount_set(&new->refcnt, 1); } return new; } static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, unsigned int old_active) { struct skb_ext *new; if (refcount_read(&old->refcnt) == 1) return old; new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); if (!new) return NULL; memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); refcount_set(&new->refcnt, 1); #ifdef CONFIG_XFRM if (old_active & (1 << SKB_EXT_SEC_PATH)) { struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); unsigned int i; for (i = 0; i < sp->len; i++) xfrm_state_hold(sp->xvec[i]); } #endif #ifdef CONFIG_MCTP_FLOWS if (old_active & (1 << SKB_EXT_MCTP)) { struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP); if (flow->key) refcount_inc(&flow->key->refs); } #endif __skb_ext_put(old); return new; } /** * __skb_ext_set - attach the specified extension storage to this skb * @skb: buffer * @id: extension id * @ext: extension storage previously allocated via __skb_ext_alloc() * * Existing extensions, if any, are cleared. * * Returns the pointer to the extension. */ void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext) { unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); skb_ext_put(skb); newlen = newoff + skb_ext_type_len[id]; ext->chunks = newlen; ext->offset[id] = newoff; skb->extensions = ext; skb->active_extensions = 1 << id; return skb_ext_get_ptr(ext, id); } /** * skb_ext_add - allocate space for given extension, COW if needed * @skb: buffer * @id: extension to allocate space for * * Allocates enough space for the given extension. * If the extension is already present, a pointer to that extension * is returned. * * If the skb was cloned, COW applies and the returned memory can be * modified without changing the extension space of clones buffers. * * Returns pointer to the extension or NULL on allocation failure. */ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) { struct skb_ext *new, *old = NULL; unsigned int newlen, newoff; if (skb->active_extensions) { old = skb->extensions; new = skb_ext_maybe_cow(old, skb->active_extensions); if (!new) return NULL; if (__skb_ext_exist(new, id)) goto set_active; newoff = new->chunks; } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); new = __skb_ext_alloc(GFP_ATOMIC); if (!new) return NULL; } newlen = newoff + skb_ext_type_len[id]; new->chunks = newlen; new->offset[id] = newoff; set_active: skb->slow_gro = 1; skb->extensions = new; skb->active_extensions |= 1 << id; return skb_ext_get_ptr(new, id); } EXPORT_SYMBOL(skb_ext_add); #ifdef CONFIG_XFRM static void skb_ext_put_sp(struct sec_path *sp) { unsigned int i; for (i = 0; i < sp->len; i++) xfrm_state_put(sp->xvec[i]); } #endif #ifdef CONFIG_MCTP_FLOWS static void skb_ext_put_mctp(struct mctp_flow *flow) { if (flow->key) mctp_key_unref(flow->key); } #endif void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { struct skb_ext *ext = skb->extensions; skb->active_extensions &= ~(1 << id); if (skb->active_extensions == 0) { skb->extensions = NULL; __skb_ext_put(ext); #ifdef CONFIG_XFRM } else if (id == SKB_EXT_SEC_PATH && refcount_read(&ext->refcnt) == 1) { struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); skb_ext_put_sp(sp); sp->len = 0; #endif } } EXPORT_SYMBOL(__skb_ext_del); void __skb_ext_put(struct skb_ext *ext) { /* If this is last clone, nothing can increment * it after check passes. Avoids one atomic op. */ if (refcount_read(&ext->refcnt) == 1) goto free_now; if (!refcount_dec_and_test(&ext->refcnt)) return; free_now: #ifdef CONFIG_XFRM if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); #endif #ifdef CONFIG_MCTP_FLOWS if (__skb_ext_exist(ext, SKB_EXT_MCTP)) skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); #endif kmem_cache_free(skbuff_ext_cache, ext); } EXPORT_SYMBOL(__skb_ext_put); #endif /* CONFIG_SKB_EXTENSIONS */ static void kfree_skb_napi_cache(struct sk_buff *skb) { /* if SKB is a clone, don't handle this case */ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { __kfree_skb(skb); return; } local_bh_disable(); __napi_kfree_skb(skb, SKB_CONSUMED); local_bh_enable(); } /** * skb_attempt_defer_free - queue skb for remote freeing * @skb: buffer * * Put @skb in a per-cpu list, using the cpu which * allocated the skb/pages to reduce false sharing * and memory zone spinlock contention. */ void skb_attempt_defer_free(struct sk_buff *skb) { int cpu = skb->alloc_cpu; struct softnet_data *sd; unsigned int defer_max; bool kick; if (cpu == raw_smp_processor_id() || WARN_ON_ONCE(cpu >= nr_cpu_ids) || !cpu_online(cpu)) { nodefer: kfree_skb_napi_cache(skb); return; } DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(skb->destructor); sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); if (READ_ONCE(sd->defer_count) >= defer_max) goto nodefer; spin_lock_bh(&sd->defer_lock); /* Send an IPI every time queue reaches half capacity. */ kick = sd->defer_count == (defer_max >> 1); /* Paired with the READ_ONCE() few lines above */ WRITE_ONCE(sd->defer_count, sd->defer_count + 1); skb->next = sd->defer_list; /* Paired with READ_ONCE() in skb_defer_free_flush() */ WRITE_ONCE(sd->defer_list, skb); spin_unlock_bh(&sd->defer_lock); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ if (unlikely(kick)) kick_defer_list_purge(sd, cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, size_t offset, size_t len) { const char *kaddr; __wsum csum; kaddr = kmap_local_page(page); csum = csum_partial(kaddr + offset, len, 0); kunmap_local(kaddr); skb->csum = csum_block_add(skb->csum, csum, skb->len); } /** * skb_splice_from_iter - Splice (or copy) pages to skbuff * @skb: The buffer to add pages to * @iter: Iterator representing the pages to be added * @maxsize: Maximum amount of pages to be added * @gfp: Allocation flags * * This is a common helper function for supporting MSG_SPLICE_PAGES. It * extracts pages from an iterator and adds them to the socket buffer if * possible, copying them to fragments if not possible (such as if they're slab * pages). * * Returns the amount of data spliced/copied or -EMSGSIZE if there's * insufficient space in the buffer to transfer anything. */ ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp) { size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); struct page *pages[8], **ppages = pages; ssize_t spliced = 0, ret = 0; unsigned int i; while (iter->count > 0) { ssize_t space, nr, len; size_t off; ret = -EMSGSIZE; space = frag_limit - skb_shinfo(skb)->nr_frags; if (space < 0) break; /* We might be able to coalesce without increasing nr_frags */ nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); if (len <= 0) { ret = len ?: -EIO; break; } i = 0; do { struct page *page = pages[i++]; size_t part = min_t(size_t, PAGE_SIZE - off, len); ret = -EIO; if (WARN_ON_ONCE(!sendpage_ok(page))) goto out; ret = skb_append_pagefrags(skb, page, off, part, frag_limit); if (ret < 0) { iov_iter_revert(iter, len); goto out; } if (skb->ip_summed == CHECKSUM_NONE) skb_splice_csum_page(skb, page, off, part); off = 0; spliced += part; maxsize -= part; len -= part; } while (len > 0); if (maxsize <= 0) break; } out: skb_len_add(skb, spliced); return spliced ?: ret; } EXPORT_SYMBOL(skb_splice_from_iter); static __always_inline size_t memcpy_from_iter_csum(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { __wsum *csum = priv2; __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len); *csum = csum_block_add(*csum, next, progress); return 0; } static __always_inline size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { __wsum next, *csum = priv2; next = csum_and_copy_from_user(iter_from, to + progress, len); *csum = csum_block_add(*csum, next, progress); return next ? 0 : len; } bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) { size_t copied; if (WARN_ON_ONCE(!i->data_source)) return false; copied = iterate_and_advance2(i, bytes, addr, csum, copy_from_user_iter_csum, memcpy_from_iter_csum); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } EXPORT_SYMBOL(csum_and_copy_from_iter_full); |
| 10 10 6 4 4 3 2 2 2 2 2 61 58 7 22 22 37 2 2 2 33 35 35 14 44 41 2 39 38 3 37 37 41 241 242 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 | // SPDX-License-Identifier: GPL-2.0-only /* * irqchip.c: Common API for in kernel interrupt controllers * Copyright (c) 2007, Intel Corporation. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright (c) 2013, Alexander Graf <agraf@suse.de> * * This file is derived from virt/kvm/irq_comm.c. * * Authors: * Yaozu (Eddie) Dong <Eddie.dong@intel.com> * Alexander Graf <agraf@suse.de> */ #include <linux/kvm_host.h> #include <linux/slab.h> #include <linux/srcu.h> #include <linux/export.h> #include <trace/events/kvm.h> int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) { struct kvm_irq_routing_table *irq_rt; struct kvm_kernel_irq_routing_entry *e; int n = 0; irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, lockdep_is_held(&kvm->irq_lock)); if (irq_rt && gsi < irq_rt->nr_rt_entries) { hlist_for_each_entry(e, &irq_rt->map[gsi], link) { entries[n] = *e; ++n; } } return n; } int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) { struct kvm_irq_routing_table *irq_rt; irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); return irq_rt->chip[irqchip][pin]; } int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) { struct kvm_kernel_irq_routing_entry route; if (!kvm_arch_irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) return -EINVAL; route.msi.address_lo = msi->address_lo; route.msi.address_hi = msi->address_hi; route.msi.data = msi->data; route.msi.flags = msi->flags; route.msi.devid = msi->devid; return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); } /* * Return value: * < 0 Interrupt was ignored (masked or not delivered for other reasons) * = 0 Interrupt was coalesced (previous irq is still pending) * > 0 Number of CPUs interrupt was delivered to */ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS]; int ret = -1, i, idx; trace_kvm_set_irq(irq, level, irq_source_id); /* Not possible to detect if the guest uses the PIC or the * IOAPIC. So set the bit in both. The guest will ignore * writes to the unused one. */ idx = srcu_read_lock(&kvm->irq_srcu); i = kvm_irq_map_gsi(kvm, irq_set, irq); srcu_read_unlock(&kvm->irq_srcu, idx); while (i--) { int r; r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level, line_status); if (r < 0) continue; ret = r + ((ret < 0) ? 0 : ret); } return ret; } static void free_irq_routing_table(struct kvm_irq_routing_table *rt) { int i; if (!rt) return; for (i = 0; i < rt->nr_rt_entries; ++i) { struct kvm_kernel_irq_routing_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &rt->map[i], link) { hlist_del(&e->link); kfree(e); } } kfree(rt); } void kvm_free_irq_routing(struct kvm *kvm) { /* Called only during vm destruction. Nobody can use the pointer at this stage */ struct kvm_irq_routing_table *rt = rcu_access_pointer(kvm->irq_routing); free_irq_routing_table(rt); } static int setup_routing_entry(struct kvm *kvm, struct kvm_irq_routing_table *rt, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { struct kvm_kernel_irq_routing_entry *ei; int r; u32 gsi = array_index_nospec(ue->gsi, KVM_MAX_IRQ_ROUTES); /* * Do not allow GSI to be mapped to the same irqchip more than once. * Allow only one to one mapping between GSI and non-irqchip routing. */ hlist_for_each_entry(ei, &rt->map[gsi], link) if (ei->type != KVM_IRQ_ROUTING_IRQCHIP || ue->type != KVM_IRQ_ROUTING_IRQCHIP || ue->u.irqchip.irqchip == ei->irqchip.irqchip) return -EINVAL; e->gsi = gsi; e->type = ue->type; r = kvm_set_routing_entry(kvm, e, ue); if (r) return r; if (e->type == KVM_IRQ_ROUTING_IRQCHIP) rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi; hlist_add_head(&e->link, &rt->map[e->gsi]); return 0; } void __attribute__((weak)) kvm_arch_irq_routing_update(struct kvm *kvm) { } bool __weak kvm_arch_can_set_irq_routing(struct kvm *kvm) { return true; } int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *ue, unsigned nr, unsigned flags) { struct kvm_irq_routing_table *new, *old; struct kvm_kernel_irq_routing_entry *e; u32 i, j, nr_rt_entries = 0; int r; for (i = 0; i < nr; ++i) { if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) return -EINVAL; nr_rt_entries = max(nr_rt_entries, ue[i].gsi); } nr_rt_entries += 1; new = kzalloc(struct_size(new, map, nr_rt_entries), GFP_KERNEL_ACCOUNT); if (!new) return -ENOMEM; new->nr_rt_entries = nr_rt_entries; for (i = 0; i < KVM_NR_IRQCHIPS; i++) for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++) new->chip[i][j] = -1; for (i = 0; i < nr; ++i) { r = -ENOMEM; e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT); if (!e) goto out; r = -EINVAL; switch (ue->type) { case KVM_IRQ_ROUTING_MSI: if (ue->flags & ~KVM_MSI_VALID_DEVID) goto free_entry; break; default: if (ue->flags) goto free_entry; break; } r = setup_routing_entry(kvm, new, e, ue); if (r) goto free_entry; ++ue; } mutex_lock(&kvm->irq_lock); old = rcu_dereference_protected(kvm->irq_routing, 1); rcu_assign_pointer(kvm->irq_routing, new); kvm_irq_routing_update(kvm); kvm_arch_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); kvm_arch_post_irq_routing_update(kvm); synchronize_srcu_expedited(&kvm->irq_srcu); new = old; r = 0; goto out; free_entry: kfree(e); out: free_irq_routing_table(new); return r; } /* * Allocate empty IRQ routing by default so that additional setup isn't needed * when userspace-driven IRQ routing is activated, and so that kvm->irq_routing * is guaranteed to be non-NULL. */ int kvm_init_irq_routing(struct kvm *kvm) { struct kvm_irq_routing_table *new; int chip_size; new = kzalloc(struct_size(new, map, 1), GFP_KERNEL_ACCOUNT); if (!new) return -ENOMEM; new->nr_rt_entries = 1; chip_size = sizeof(int) * KVM_NR_IRQCHIPS * KVM_IRQCHIP_NUM_PINS; memset(new->chip, -1, chip_size); RCU_INIT_POINTER(kvm->irq_routing, new); return 0; } |
| 26 139 177 165 9 232 22 4 4 4 4 220 42 188 30 39 220 42 42 188 42 42 44 1 184 31 22 31 18 20 4 2 141 263 107 147 19 24 4 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_H #define _LINUX_FS_H #include <linux/linkage.h> #include <linux/wait_bit.h> #include <linux/kdev_t.h> #include <linux/dcache.h> #include <linux/path.h> #include <linux/stat.h> #include <linux/cache.h> #include <linux/list.h> #include <linux/list_lru.h> #include <linux/llist.h> #include <linux/radix-tree.h> #include <linux/xarray.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/pid.h> #include <linux/bug.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/mm_types.h> #include <linux/capability.h> #include <linux/semaphore.h> #include <linux/fcntl.h> #include <linux/rculist_bl.h> #include <linux/atomic.h> #include <linux/shrinker.h> #include <linux/migrate_mode.h> #include <linux/uidgid.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/workqueue.h> #include <linux/delayed_call.h> #include <linux/uuid.h> #include <linux/errseq.h> #include <linux/ioprio.h> #include <linux/fs_types.h> #include <linux/build_bug.h> #include <linux/stddef.h> #include <linux/mount.h> #include <linux/cred.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/maple_tree.h> #include <linux/rw_hint.h> #include <linux/file_ref.h> #include <linux/unicode.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> struct backing_dev_info; struct bdi_writeback; struct bio; struct io_comp_batch; struct export_operations; struct fiemap_extent_info; struct hd_geometry; struct iovec; struct kiocb; struct kobject; struct pipe_inode_info; struct poll_table_struct; struct kstatfs; struct vm_area_struct; struct vfsmount; struct cred; struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; struct fscrypt_inode_info; struct fscrypt_operations; struct fsverity_info; struct fsverity_operations; struct fsnotify_mark_connector; struct fsnotify_sb_info; struct fs_context; struct fs_parameter_spec; struct fileattr; struct iomap_ops; extern void __init inode_init(void); extern void __init inode_init_early(void); extern void __init files_init(void); extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; typedef __kernel_rwf_t rwf_t; struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 #define MAY_READ 0x00000004 #define MAY_APPEND 0x00000008 #define MAY_ACCESS 0x00000010 #define MAY_OPEN 0x00000020 #define MAY_CHDIR 0x00000040 /* called from RCU mode, don't block */ #define MAY_NOT_BLOCK 0x00000080 /* * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open() */ /* file is open for reading */ #define FMODE_READ ((__force fmode_t)(1 << 0)) /* file is open for writing */ #define FMODE_WRITE ((__force fmode_t)(1 << 1)) /* file is seekable */ #define FMODE_LSEEK ((__force fmode_t)(1 << 2)) /* file can be accessed using pread */ #define FMODE_PREAD ((__force fmode_t)(1 << 3)) /* file can be accessed using pwrite */ #define FMODE_PWRITE ((__force fmode_t)(1 << 4)) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)(1 << 5)) /* File writes are restricted (block device specific) */ #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6)) /* File supports atomic writes */ #define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7)) /* FMODE_* bit 8 */ /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) /* 64bit hashes as llseek() offset (for directories) */ #define FMODE_64BITHASH ((__force fmode_t)(1 << 10)) /* * Don't update ctime and mtime. * * Currently a special hack for the XFS open_by_handle ioctl, but we'll * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. */ #define FMODE_NOCMTIME ((__force fmode_t)(1 << 11)) /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) /* FMODE_* bit 13 */ /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) /* File needs atomic accesses to f_pos */ #define FMODE_ATOMIC_POS ((__force fmode_t)(1 << 15)) /* Write access to underlying fs */ #define FMODE_WRITER ((__force fmode_t)(1 << 16)) /* Has read method(s) */ #define FMODE_CAN_READ ((__force fmode_t)(1 << 17)) /* Has write method(s) */ #define FMODE_CAN_WRITE ((__force fmode_t)(1 << 18)) #define FMODE_OPENED ((__force fmode_t)(1 << 19)) #define FMODE_CREATED ((__force fmode_t)(1 << 20)) /* File is stream-like */ #define FMODE_STREAM ((__force fmode_t)(1 << 21)) /* File supports DIRECT IO */ #define FMODE_CAN_ODIRECT ((__force fmode_t)(1 << 22)) #define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) /* File is embedded in backing_file object */ #define FMODE_BACKING ((__force fmode_t)(1 << 24)) /* * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be * generated (see below) */ #define FMODE_NONOTIFY ((__force fmode_t)(1 << 25)) /* * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be * generated (see below) */ #define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) /* File represents mount that needs unmounting */ #define FMODE_NEED_UNMOUNT ((__force fmode_t)(1 << 28)) /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) /* * The two FMODE_NONOTIFY* define which fsnotify events should not be generated * for a file. These are the possible values of (f->f_mode & * FMODE_FSNOTIFY_MASK) and their meaning: * * FMODE_NONOTIFY - suppress all (incl. non-permission) events. * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events. * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events. */ #define FMODE_FSNOTIFY_MASK \ (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM) #define FMODE_FSNOTIFY_NONE(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS #define FMODE_FSNOTIFY_PERM(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == 0 || \ (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)) #define FMODE_FSNOTIFY_HSM(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == 0) #else #define FMODE_FSNOTIFY_PERM(mode) 0 #define FMODE_FSNOTIFY_HSM(mode) 0 #endif /* * Attribute flags. These should be or-ed together to figure out what * has been changed! */ #define ATTR_MODE (1 << 0) #define ATTR_UID (1 << 1) #define ATTR_GID (1 << 2) #define ATTR_SIZE (1 << 3) #define ATTR_ATIME (1 << 4) #define ATTR_MTIME (1 << 5) #define ATTR_CTIME (1 << 6) #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ #define ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) #define ATTR_KILL_PRIV (1 << 14) #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) #define ATTR_TOUCH (1 << 17) #define ATTR_DELEG (1 << 18) /* Delegated attrs. Don't break write delegations */ /* * Whiteout is represented by a char device. The following constants define the * mode and device number to use. */ #define WHITEOUT_MODE 0 #define WHITEOUT_DEV 0 /* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. * Also, in this manner, a Filesystem can look at only the values it cares * about. Basically, these are the attributes that the VFS layer can * request to change from the FS layer. * * Derek Atkins <warlord@MIT.EDU> 94-10-20 */ struct iattr { unsigned int ia_valid; umode_t ia_mode; /* * The two anonymous unions wrap structures with the same member. * * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which * are a dedicated type requiring the filesystem to use the dedicated * helpers. Other filesystem can continue to use ia_{g,u}id until they * have been ported. * * They always contain the same value. In other words FS_ALLOW_IDMAP * pass down the same value on idmapped mounts as they would on regular * mounts. */ union { kuid_t ia_uid; vfsuid_t ia_vfsuid; }; union { kgid_t ia_gid; vfsgid_t ia_vfsgid; }; loff_t ia_size; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; /* * Not an attribute, but an auxiliary info for filesystems wanting to * implement an ftruncate() like method. NOTE: filesystem should * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). */ struct file *ia_file; }; /* * Includes for diskquotas. */ #include <linux/quota.h> /* * Maximum number of layers of fs stack. Needs to be limited to * prevent kernel stack overflow */ #define FILESYSTEM_MAX_STACK_DEPTH 2 /** * enum positive_aop_returns - aop return codes with specific semantics * * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has * completed, that the page is still locked, and * should be considered active. The VM uses this hint * to return the page to the active list -- it won't * be a candidate for writeback again in the near * future. Other callers must be careful to unlock * the page if they get this return. Returned by * writepage(); * * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has * unlocked it and the page might have been truncated. * The caller should back up to acquiring a new page and * trying again. The aop will be taking reasonable * precautions not to livelock. If the caller held a page * reference, it should drop it before retrying. Returned * by read_folio(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a * page to allow for functions that return the number of bytes operated on in a * given page. */ enum positive_aop_returns { AOP_WRITEPAGE_ACTIVATE = 0x80000, AOP_TRUNCATED_PAGE = 0x80001, }; /* * oh the beauties of C type declarations. */ struct page; struct address_space; struct writeback_control; struct readahead_control; /* Match RWF_* bits to IOCB bits */ #define IOCB_HIPRI (__force int) RWF_HIPRI #define IOCB_DSYNC (__force int) RWF_DSYNC #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) #define IOCB_DIRECT (1 << 17) #define IOCB_WRITE (1 << 18) /* iocb->ki_waitq is valid */ #define IOCB_WAITQ (1 << 19) #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) /* * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the * iocb completion can be passed back to the owner for execution from a safe * context rather than needing to be punted through a workqueue. If this * flag is set, the bio completion handling may set iocb->dio_complete to a * handler function and iocb->private to context information for that handler. * The issuer should call the handler with that context information from task * context to complete the processing of the iocb. Note that while this * provides a task context for the dio_complete() callback, it should only be * used on the completion side for non-IO generating completions. It's fine to * call blocking functions from this callback, but they should not wait for * unrelated IO (like cache flushing, new IO generation, etc). */ #define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. */ #define IOCB_AIO_RW (1 << 23) #define IOCB_HAS_METADATA (1 << 24) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ { IOCB_HIPRI, "HIPRI" }, \ { IOCB_DSYNC, "DSYNC" }, \ { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ { IOCB_ATOMIC, "ATOMIC" }, \ { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ { IOCB_WAITQ, "WAITQ" }, \ { IOCB_NOIO, "NOIO" }, \ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ { IOCB_DIO_CALLER_COMP, "CALLER_COMP" } struct kiocb { struct file *ki_filp; loff_t ki_pos; void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ union { /* * Only used for async buffered reads, where it denotes the * page waitqueue associated with completing the read. Valid * IFF IOCB_WAITQ is set. */ struct wait_page_queue *ki_waitq; /* * Can be used for O_DIRECT IO, where the completion handling * is punted back to the issuer of the IO. May only be set * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer * must then check for presence of this handler when ki_complete * is invoked. The data passed in to this handler must be * assigned to ->private when dio_complete is assigned. */ ssize_t (*dio_complete)(void *data); }; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) { return kiocb->ki_complete == NULL; } struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); /* Mark a folio dirty. Return true if this dirtied it */ bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *folio); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* * migrate the contents of a folio to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. */ int (*migrate_folio)(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); int (*error_remove_folio)(struct address_space *, struct folio *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span); void (*swap_deactivate)(struct file *file); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; extern const struct address_space_operations empty_aops; /** * struct address_space - Contents of a cacheable, mappable object. * @host: Owner, either the inode or the block_device. * @i_pages: Cached pages. * @invalidate_lock: Guards coherency between page cache contents and * file offset->disk block mappings in the filesystem during invalidates. * It is also used to block modification of page cache contents through * memory mappings. * @gfp_mask: Memory allocation flags to use for allocating pages. * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings. * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. * @i_private_lock: For use by the owner of the address_space. * @i_private_list: For use by the owner of the address_space. * @i_private_data: For use by the owner of the address_space. */ struct address_space { struct inode *host; struct xarray i_pages; struct rw_semaphore invalidate_lock; gfp_t gfp_mask; atomic_t i_mmap_writable; #ifdef CONFIG_READ_ONLY_THP_FOR_FS /* number of thp, only for non-shmem files */ atomic_t nr_thps; #endif struct rb_root_cached i_mmap; unsigned long nrpages; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; errseq_t wb_err; spinlock_t i_private_lock; struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. */ /* XArray tags, for tagging dirty and writeback pages in the pagecache. */ #define PAGECACHE_TAG_DIRTY XA_MARK_0 #define PAGECACHE_TAG_WRITEBACK XA_MARK_1 #define PAGECACHE_TAG_TOWRITE XA_MARK_2 /* * Returns true if any of the pages in the mapping are marked with the tag. */ static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) { return xa_marked(&mapping->i_pages, tag); } static inline void i_mmap_lock_write(struct address_space *mapping) { down_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_write(struct address_space *mapping) { return down_write_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_write(struct address_space *mapping) { up_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_read(struct address_space *mapping) { return down_read_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_read(struct address_space *mapping) { up_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_locked(struct address_space *mapping) { lockdep_assert_held(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_write_locked(struct address_space *mapping) { lockdep_assert_held_write(&mapping->i_mmap_rwsem); } /* * Might pages of this file be mapped into userspace? */ static inline int mapping_mapped(struct address_space *mapping) { return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } /* * Might pages of this file have been modified in userspace? * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. * * If i_mmap_writable is negative, no new writable mappings are allowed. You * can only deny writable mappings, if none exists right now. */ static inline int mapping_writably_mapped(struct address_space *mapping) { return atomic_read(&mapping->i_mmap_writable) > 0; } static inline int mapping_map_writable(struct address_space *mapping) { return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? 0 : -EPERM; } static inline void mapping_unmap_writable(struct address_space *mapping) { atomic_dec(&mapping->i_mmap_writable); } static inline int mapping_deny_writable(struct address_space *mapping) { return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? 0 : -EBUSY; } static inline void mapping_allow_writable(struct address_space *mapping) { atomic_inc(&mapping->i_mmap_writable); } /* * Use sequence counter to get consistent i_size on 32-bit processors. */ #if BITS_PER_LONG==32 && defined(CONFIG_SMP) #include <linux/seqlock.h> #define __NEED_I_SIZE_ORDERED #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) #else #define i_size_ordered_init(inode) do { } while (0) #endif struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) /* * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to * cache the ACL. This also means that ->get_inode_acl() can be called in RCU * mode with the LOOKUP_RCU flag. */ #define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * uncached_acl_sentinel(struct task_struct *task) { return (void *)task + 1; } static inline bool is_uncached_acl(struct posix_acl *acl) { return (long)acl & 1; } #define IOP_FASTPERM 0x0001 #define IOP_LOOKUP 0x0002 #define IOP_NOFOLLOW 0x0004 #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning * of the 'struct inode' */ struct inode { umode_t i_mode; unsigned short i_opflags; kuid_t i_uid; kgid_t i_gid; unsigned int i_flags; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif const struct inode_operations *i_op; struct super_block *i_sb; struct address_space *i_mapping; #ifdef CONFIG_SECURITY void *i_security; #endif /* Stat data, not accessed from path walking */ unsigned long i_ino; /* * Filesystems may only read i_nlink directly. They shall use the * following functions for modification: * * (set|clear|inc|drop)_nlink * inode_(inc|dec)_link_count */ union { const unsigned int i_nlink; unsigned int __i_nlink; }; dev_t i_rdev; loff_t i_size; time64_t i_atime_sec; time64_t i_mtime_sec; time64_t i_ctime_sec; u32 i_atime_nsec; u32 i_mtime_nsec; u32 i_ctime_nsec; u32 i_generation; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; #endif /* Misc */ u32 i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; struct hlist_node i_hash; struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ /* foreign inode detection, see wbc_detach_inode() */ int i_wb_frn_winner; u16 i_wb_frn_avg_time; u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; struct rcu_head i_rcu; }; atomic64_t i_version; atomic64_t i_sequence; /* see futex */ atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) atomic_t i_readcount; /* struct files open RO */ #endif union { const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ void (*free_inode)(struct inode *); }; struct file_lock_context *i_flctx; struct address_space i_data; union { struct list_head i_devices; int i_linklen; }; union { struct pipe_inode_info *i_pipe; struct cdev *i_cdev; char *i_link; unsigned i_dir_seq; }; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ /* 32-bit hole reserved for expanding i_fsnotify_mask */ struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; #endif #ifdef CONFIG_FS_VERITY struct fsverity_info *i_verity_info; #endif void *i_private; /* fs or device private pointer */ } __randomize_layout; static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { int testlen; /* * TODO: patch it into a debug-only check if relevant macros show up. * In the meantime, since we are suffering strlen even on production kernels * to find the right length, do a fixup if the wrong value got passed. */ testlen = strlen(link); if (testlen != linklen) { WARN_ONCE(1, "bad length passed for symlink [%s] (got %d, expected %d)", link, linklen, testlen); linklen = testlen; } inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; } /* * Get bit address from inode->i_state to use with wait_var_event() * infrastructre. */ #define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit)) struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit); static inline void inode_wake_up_bit(struct inode *inode, u32 bit) { /* Caller is responsible for correct memory barriers. */ wake_up_var(inode_state_wait_address(inode, bit)); } struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode); static inline unsigned int i_blocksize(const struct inode *node) { return (1 << node->i_blkbits); } static inline int inode_unhashed(struct inode *inode) { return hlist_unhashed(&inode->i_hash); } /* * __mark_inode_dirty expects inodes to be hashed. Since we don't * want special inodes in the fileset inode space, we make them * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ static inline void inode_fake_hash(struct inode *inode) { hlist_add_fake(&inode->i_hash); } /* * inode->i_mutex nesting subclasses for the lock validator: * * 0: the object of the current VFS operation * 1: parent * 2: child/target * 3: xattr * 4: second non-directory * 5: second parent (when locking independent directories in rename) * * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two * non-directories at once. * * The locking order between these classes is * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory */ enum inode_i_mutex_lock_class { I_MUTEX_NORMAL, I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_XATTR, I_MUTEX_NONDIR2, I_MUTEX_PARENT2, }; static inline void inode_lock(struct inode *inode) { down_write(&inode->i_rwsem); } static inline void inode_unlock(struct inode *inode) { up_write(&inode->i_rwsem); } static inline void inode_lock_shared(struct inode *inode) { down_read(&inode->i_rwsem); } static inline void inode_unlock_shared(struct inode *inode) { up_read(&inode->i_rwsem); } static inline int inode_trylock(struct inode *inode) { return down_write_trylock(&inode->i_rwsem); } static inline int inode_trylock_shared(struct inode *inode) { return down_read_trylock(&inode->i_rwsem); } static inline int inode_is_locked(struct inode *inode) { return rwsem_is_locked(&inode->i_rwsem); } static inline void inode_lock_nested(struct inode *inode, unsigned subclass) { down_write_nested(&inode->i_rwsem, subclass); } static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass) { down_read_nested(&inode->i_rwsem, subclass); } static inline void filemap_invalidate_lock(struct address_space *mapping) { down_write(&mapping->invalidate_lock); } static inline void filemap_invalidate_unlock(struct address_space *mapping) { up_write(&mapping->invalidate_lock); } static inline void filemap_invalidate_lock_shared(struct address_space *mapping) { down_read(&mapping->invalidate_lock); } static inline int filemap_invalidate_trylock_shared( struct address_space *mapping) { return down_read_trylock(&mapping->invalidate_lock); } static inline void filemap_invalidate_unlock_shared( struct address_space *mapping) { up_read(&mapping->invalidate_lock); } void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); void filemap_invalidate_lock_two(struct address_space *mapping1, struct address_space *mapping2); void filemap_invalidate_unlock_two(struct address_space *mapping1, struct address_space *mapping2); /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic * with respect to the local cpu (unlike with preempt disabled), * but they don't need to be atomic with respect to other cpus like in * true SMP (so they need either to either locally disable irq around * the read or for example on x86 they can be still implemented as a * cmpxchg8b without the need of the lock prefix). For SMP compiles * and 64bit archs it makes no difference if preempt is enabled or not. */ static inline loff_t i_size_read(const struct inode *inode) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) loff_t i_size; unsigned int seq; do { seq = read_seqcount_begin(&inode->i_size_seqcount); i_size = inode->i_size; } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); return i_size; #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) loff_t i_size; preempt_disable(); i_size = inode->i_size; preempt_enable(); return i_size; #else /* Pairs with smp_store_release() in i_size_write() */ return smp_load_acquire(&inode->i_size); #endif } /* * NOTE: unlike i_size_read(), i_size_write() does need locking around it * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount * can be lost, resulting in subsequent i_size_read() calls spinning forever. */ static inline void i_size_write(struct inode *inode, loff_t i_size) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) preempt_disable(); write_seqcount_begin(&inode->i_size_seqcount); inode->i_size = i_size; write_seqcount_end(&inode->i_size_seqcount); preempt_enable(); #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) preempt_disable(); inode->i_size = i_size; preempt_enable(); #else /* * Pairs with smp_load_acquire() in i_size_read() to ensure * changes related to inode size (such as page contents) are * visible before we see the changed inode size. */ smp_store_release(&inode->i_size, i_size); #endif } static inline unsigned iminor(const struct inode *inode) { return MINOR(inode->i_rdev); } static inline unsigned imajor(const struct inode *inode) { return MAJOR(inode->i_rdev); } struct fown_struct { struct file *file; /* backpointer for security modules */ rwlock_t lock; /* protects pid, uid, euid fields */ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ kuid_t uid, euid; /* uid/euid of process setting the owner */ int signum; /* posix.1b rt signal to be delivered on IO */ }; /** * struct file_ra_state - Track a file's readahead state. * @start: Where the most recent readahead started. * @size: Number of pages read in the most recent readahead. * @async_size: Numer of pages that were/are not needed immediately * and so were/are genuinely "ahead". Start next readahead when * the first of these pages is accessed. * @ra_pages: Maximum size of a readahead request, copied from the bdi. * @mmap_miss: How many mmap accesses missed in the page cache. * @prev_pos: The last byte in the most recent read request. * * When this structure is passed to ->readahead(), the "most recent" * readahead means the current readahead. */ struct file_ra_state { pgoff_t start; unsigned int size; unsigned int async_size; unsigned int ra_pages; unsigned int mmap_miss; loff_t prev_pos; }; /* * Check if @index falls in the readahead windows. */ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) { return (index >= ra->start && index < ra->start + ra->size); } /** * struct file - Represents a file * @f_ref: reference count * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations * @f_mapping: Contents of a cacheable, mappable object. * @private_data: filesystem or driver specific data * @f_inode: cached inode * @f_flags: file flags * @f_iocb_flags: iocb flags * @f_cred: stashed credentials of creator/opener * @f_path: path of the file * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position * @f_security: LSM security context of this file * @f_owner: file owner * @f_wb_err: writeback error * @f_sb_err: per sb writeback errors * @f_ep: link of all epoll hooks for this file * @f_task_work: task work entry point * @f_llist: work queue entrypoint * @f_ra: file's readahead state * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) */ struct file { file_ref_t f_ref; spinlock_t f_lock; fmode_t f_mode; const struct file_operations *f_op; struct address_space *f_mapping; void *private_data; struct inode *f_inode; unsigned int f_flags; unsigned int f_iocb_flags; const struct cred *f_cred; /* --- cacheline 1 boundary (64 bytes) --- */ struct path f_path; union { /* regular files (with FMODE_ATOMIC_POS) and directories */ struct mutex f_pos_lock; /* pipes */ u64 f_pipe; }; loff_t f_pos; #ifdef CONFIG_SECURITY void *f_security; #endif /* --- cacheline 2 boundary (128 bytes) --- */ struct fown_struct *f_owner; errseq_t f_wb_err; errseq_t f_sb_err; #ifdef CONFIG_EPOLL struct hlist_head *f_ep; #endif union { struct callback_head f_task_work; struct llist_node f_llist; struct file_ra_state f_ra; freeptr_t f_freeptr; }; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; int handle_type; /* file identifier */ unsigned char f_handle[] __counted_by(handle_bytes); }; static inline struct file *get_file(struct file *f) { file_ref_inc(&f->f_ref); return f; } struct file *get_file_rcu(struct file __rcu **f); struct file *get_file_active(struct file **f); #define file_count(f) file_ref_read(&(f)->f_ref) #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes limits, otherwise bad things can happen in VM. */ #if BITS_PER_LONG==32 #define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT) #elif BITS_PER_LONG==64 #define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) #endif /* legacy typedef, should eventually be removed */ typedef void *fl_owner_t; struct file_lock; struct file_lease; /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX #define OFFSET_MAX type_max(loff_t) #define OFFT_OFFSET_MAX type_max(off_t) #endif int file_f_owner_allocate(struct file *file); static inline struct fown_struct *file_f_owner(const struct file *file) { return READ_ONCE(file->f_owner); } extern void send_sigio(struct fown_struct *fown, int fd, int band); static inline struct inode *file_inode(const struct file *f) { return f->f_inode; } /* * file_dentry() is a relic from the days that overlayfs was using files with a * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. * In those days, file_dentry() was needed to get the underlying fs dentry that * matches f_inode. * Files with "fake" path should not exist nowadays, so use an assertion to make * sure that file_dentry() was not papering over filesystem bugs. */ static inline struct dentry *file_dentry(const struct file *file) { struct dentry *dentry = file->f_path.dentry; WARN_ON_ONCE(d_inode(dentry) != file_inode(file)); return dentry; } struct fasync_struct { rwlock_t fa_lock; int magic; int fa_fd; struct fasync_struct *fa_next; /* singly linked list */ struct file *fa_file; struct rcu_head fa_rcu; }; #define FASYNC_MAGIC 0x4601 /* SMP safe fasync helpers: */ extern int fasync_helper(int, struct file *, int, struct fasync_struct **); extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *); extern int fasync_remove_entry(struct file *, struct fasync_struct **); extern struct fasync_struct *fasync_alloc(void); extern void fasync_free(struct fasync_struct *); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern int f_setown(struct file *filp, int who, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct file *file); /* * sb->s_flags. Note that these mirror the equivalent MS_* flags where * represented in both. */ #define SB_RDONLY BIT(0) /* Mount read-only */ #define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ #define SB_NODEV BIT(2) /* Disallow access to device special files */ #define SB_NOEXEC BIT(3) /* Disallow program execution */ #define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ #define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ #define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ #define SB_NOATIME BIT(10) /* Do not update access times. */ #define SB_NODIRATIME BIT(11) /* Do not update directory access times */ #define SB_SILENT BIT(15) #define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ #define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ #define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ #define SB_I_VERSION BIT(23) /* Update inode I_version field */ #define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ #define SB_DEAD BIT(21) #define SB_DYING BIT(24) #define SB_SUBMOUNT BIT(26) #define SB_FORCE BIT(27) #define SB_NOSEC BIT(28) #define SB_BORN BIT(29) #define SB_ACTIVE BIT(30) #define SB_NOUSER BIT(31) /* These flags relate to encoding and casefolding */ #define SB_ENC_STRICT_MODE_FL (1 << 0) #define sb_has_strict_encoding(sb) \ (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) /* * Umount options */ #define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ #define MNT_DETACH 0x00000002 /* Just detach from the tree */ #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ /* sb->s_iflags */ #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ #define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop * internal threads if needed) */ SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ }; #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { unsigned short frozen; /* Is sb frozen? */ int freeze_kcount; /* How many kernel freeze requests? */ int freeze_ucount; /* How many userspace freeze requests? */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ unsigned char s_blocksize_bits; unsigned long s_blocksize; loff_t s_maxbytes; /* Max file size */ struct file_system_type *s_type; const struct super_operations *s_op; const struct dquot_operations *dq_op; const struct quotactl_ops *s_qcop; const struct export_operations *s_export_op; unsigned long s_flags; unsigned long s_iflags; /* internal SB_I_* flags */ unsigned long s_magic; struct dentry *s_root; struct rw_semaphore s_umount; int s_count; atomic_t s_active; #ifdef CONFIG_SECURITY void *s_security; #endif const struct xattr_handler * const *s_xattr; #ifdef CONFIG_FS_ENCRYPTION const struct fscrypt_operations *s_cop; struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ #endif #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; #endif #if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ struct file *s_bdev_file; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; unsigned int s_quota_types; /* Bitmask of supported quota types */ struct quota_info s_dquot; /* Diskquota specific options */ struct sb_writers s_writers; /* * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and * s_fsnotify_info together for cache efficiency. They are frequently * accessed and rarely modified. */ void *s_fs_info; /* Filesystem private info */ /* Granularity of c/m/atime in ns (cannot be worse than a second) */ u32 s_time_gran; /* Time limits for c/m/atime in seconds */ time64_t s_time_min; time64_t s_time_max; #ifdef CONFIG_FSNOTIFY u32 s_fsnotify_mask; struct fsnotify_sb_info *s_fsnotify_info; #endif /* * q: why are s_id and s_sysfs_name not the same? both are human * readable strings that identify the filesystem * a: s_id is allowed to change at runtime; it's used in log messages, * and we want to when a device starts out as single device (s_id is dev * name) but then a device is hot added and we have to switch to * identifying it by UUID * but s_sysfs_name is a handle for programmatic access, and can't * change at runtime */ char s_id[32]; /* Informational name */ uuid_t s_uuid; /* UUID */ u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ char s_sysfs_name[UUID_STRING_LEN + 1]; unsigned int s_max_links; /* * The next field is for VFS *only*. No filesystems have any business * even looking at it. You had been warned. */ struct mutex s_vfs_rename_mutex; /* Kludge */ /* * Filesystem subtype. If non-empty the filesystem type field * in /proc/mounts will be "type.subtype" */ const char *s_subtype; const struct dentry_operations *s_d_op; /* default d_op for dentries */ struct shrinker *s_shrink; /* per-sb shrinker handle */ /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; /* Read-only state of the superblock is being changed */ int s_readonly_remount; /* per-sb errseq_t for reporting writeback errors via syncfs */ errseq_t s_wb_err; /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; /* * Owning user namespace and default context in which to * interpret filesystem uids, gids, quotas, device nodes, * xattrs and security labels. */ struct user_namespace *s_user_ns; /* * The list_lru structure is essentially just a pointer to a table * of per-node lru lists, each of which has its own spinlock. * There is no need to put them into separate cachelines. */ struct list_lru s_dentry_lru; struct list_lru s_inode_lru; struct rcu_head rcu; struct work_struct destroy_work; struct mutex s_sync_lock; /* sync serialisation lock */ /* * Indicates how deep in a filesystem stack this SB is */ int s_stack_depth; /* s_inode_list_lock protects s_inodes */ spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; struct list_head s_inodes; /* all inodes */ spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; } /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can * instead deal with the raw numeric values that are stored * in the filesystem. */ static inline uid_t i_uid_read(const struct inode *inode) { return from_kuid(i_user_ns(inode), inode->i_uid); } static inline gid_t i_gid_read(const struct inode *inode) { return from_kgid(i_user_ns(inode), inode->i_gid); } static inline void i_uid_write(struct inode *inode, uid_t uid) { inode->i_uid = make_kuid(i_user_ns(inode), uid); } static inline void i_gid_write(struct inode *inode, gid_t gid) { inode->i_gid = make_kgid(i_user_ns(inode), gid); } /** * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: whe inode's i_uid mapped down according to @idmap. * If the inode's i_uid has no mapping INVALID_VFSUID is returned. */ static inline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap, const struct inode *inode) { return make_vfsuid(idmap, i_user_ns(inode), inode->i_uid); } /** * i_uid_needs_update - check whether inode's i_uid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether the $inode's i_uid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_uid field needs to be updated, false if not. */ static inline bool i_uid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_UID) && !vfsuid_eq(attr->ia_vfsuid, i_uid_into_vfsuid(idmap, inode))); } /** * i_uid_update - update @inode's i_uid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_uid field translating the vfsuid of any idmapped * mount into the filesystem kuid. */ static inline void i_uid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_UID) inode->i_uid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid); } /** * i_gid_into_vfsgid - map an inode's i_gid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: the inode's i_gid mapped down according to @idmap. * If the inode's i_gid has no mapping INVALID_VFSGID is returned. */ static inline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap, const struct inode *inode) { return make_vfsgid(idmap, i_user_ns(inode), inode->i_gid); } /** * i_gid_needs_update - check whether inode's i_gid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether the $inode's i_gid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_gid field needs to be updated, false if not. */ static inline bool i_gid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_GID) && !vfsgid_eq(attr->ia_vfsgid, i_gid_into_vfsgid(idmap, inode))); } /** * i_gid_update - update @inode's i_gid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_gid field translating the vfsgid of any idmapped * mount into the filesystem kgid. */ static inline void i_gid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_GID) inode->i_gid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid); } /** * inode_fsuid_set - initialize inode's i_uid field with callers fsuid * @inode: inode to initialize * @idmap: idmap of the mount the inode was found from * * Initialize the i_uid field of @inode. If the inode was found/created via * an idmapped mount map the caller's fsuid according to @idmap. */ static inline void inode_fsuid_set(struct inode *inode, struct mnt_idmap *idmap) { inode->i_uid = mapped_fsuid(idmap, i_user_ns(inode)); } /** * inode_fsgid_set - initialize inode's i_gid field with callers fsgid * @inode: inode to initialize * @idmap: idmap of the mount the inode was found from * * Initialize the i_gid field of @inode. If the inode was found/created via * an idmapped mount map the caller's fsgid according to @idmap. */ static inline void inode_fsgid_set(struct inode *inode, struct mnt_idmap *idmap) { inode->i_gid = mapped_fsgid(idmap, i_user_ns(inode)); } /** * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped * @sb: the superblock we want a mapping in * @idmap: idmap of the relevant mount * * Check whether the caller's fsuid and fsgid have a valid mapping in the * s_user_ns of the superblock @sb. If the caller is on an idmapped mount map * the caller's fsuid and fsgid according to the @idmap first. * * Return: true if fsuid and fsgid is mapped, false if not. */ static inline bool fsuidgid_has_mapping(struct super_block *sb, struct mnt_idmap *idmap) { struct user_namespace *fs_userns = sb->s_user_ns; kuid_t kuid; kgid_t kgid; kuid = mapped_fsuid(idmap, fs_userns); if (!uid_valid(kuid)) return false; kgid = mapped_fsgid(idmap, fs_userns); if (!gid_valid(kgid)) return false; return kuid_has_mapping(fs_userns, kuid) && kgid_has_mapping(fs_userns, kgid); } struct timespec64 current_time(struct inode *inode); struct timespec64 inode_set_ctime_current(struct inode *inode); struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update); static inline time64_t inode_get_atime_sec(const struct inode *inode) { return inode->i_atime_sec; } static inline long inode_get_atime_nsec(const struct inode *inode) { return inode->i_atime_nsec; } static inline struct timespec64 inode_get_atime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_atime_sec(inode), .tv_nsec = inode_get_atime_nsec(inode) }; return ts; } static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_atime_sec = ts.tv_sec; inode->i_atime_nsec = ts.tv_nsec; return ts; } static inline struct timespec64 inode_set_atime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_atime_to_ts(inode, ts); } static inline time64_t inode_get_mtime_sec(const struct inode *inode) { return inode->i_mtime_sec; } static inline long inode_get_mtime_nsec(const struct inode *inode) { return inode->i_mtime_nsec; } static inline struct timespec64 inode_get_mtime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_mtime_sec(inode), .tv_nsec = inode_get_mtime_nsec(inode) }; return ts; } static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_mtime_sec = ts.tv_sec; inode->i_mtime_nsec = ts.tv_nsec; return ts; } static inline struct timespec64 inode_set_mtime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_mtime_to_ts(inode, ts); } /* * Multigrain timestamps * * Conditionally use fine-grained ctime and mtime timestamps when there * are users actively observing them via getattr. The primary use-case * for this is NFS clients that use the ctime to distinguish between * different states of the file, and that are often fooled by multiple * operations that occur in the same coarse-grained timer tick. */ #define I_CTIME_QUERIED ((u32)BIT(31)) static inline time64_t inode_get_ctime_sec(const struct inode *inode) { return inode->i_ctime_sec; } static inline long inode_get_ctime_nsec(const struct inode *inode) { return inode->i_ctime_nsec & ~I_CTIME_QUERIED; } static inline struct timespec64 inode_get_ctime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_ctime_sec(inode), .tv_nsec = inode_get_ctime_nsec(inode) }; return ts; } struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts); /** * inode_set_ctime - set the ctime in the inode * @inode: inode in which to set the ctime * @sec: tv_sec value to set * @nsec: tv_nsec value to set * * Set the ctime in @inode to { @sec, @nsec } */ static inline struct timespec64 inode_set_ctime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_ctime_to_ts(inode, ts); } struct timespec64 simple_inode_init_ts(struct inode *inode); /* * Snapshotting support. */ /* * These are internal functions, please use sb_start_{write,pagefault,intwrite} * instead. */ static inline void __sb_end_write(struct super_block *sb, int level) { percpu_up_read(sb->s_writers.rw_sem + level-1); } static inline void __sb_start_write(struct super_block *sb, int level) { percpu_down_read(sb->s_writers.rw_sem + level - 1); } static inline bool __sb_start_write_trylock(struct super_block *sb, int level) { return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); } #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_) /** * __sb_write_started - check if sb freeze level is held * @sb: the super we write to * @level: the freeze level * * * > 0 - sb freeze level is held * * 0 - sb freeze level is not held * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN */ static inline int __sb_write_started(const struct super_block *sb, int level) { return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); } /** * sb_write_started - check if SB_FREEZE_WRITE is held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. */ static inline bool sb_write_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE); } /** * sb_write_not_started - check if SB_FREEZE_WRITE is not held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. */ static inline bool sb_write_not_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; } /** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG. */ static inline bool file_write_started(const struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_write_started(file_inode(file)->i_sb); } /** * file_write_not_started - check if SB_FREEZE_WRITE is not held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG. */ static inline bool file_write_not_started(const struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_write_not_started(file_inode(file)->i_sb); } /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to * * Decrement number of writers to the filesystem. Wake up possible waiters * wanting to freeze the filesystem. */ static inline void sb_end_write(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_WRITE); } /** * sb_end_pagefault - drop write access to a superblock from a page fault * @sb: the super we wrote to * * Decrement number of processes handling write page fault to the filesystem. * Wake up possible waiters wanting to freeze the filesystem. */ static inline void sb_end_pagefault(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_end_intwrite - drop write access to a superblock for internal fs purposes * @sb: the super we wrote to * * Decrement fs-internal number of writers to the filesystem. Wake up possible * waiters wanting to freeze the filesystem. */ static inline void sb_end_intwrite(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_FS); } /** * sb_start_write - get write access to a superblock * @sb: the super we write to * * When a process wants to write data or metadata to a file system (i.e. dirty * a page or an inode), it should embed the operation in a sb_start_write() - * sb_end_write() pair to get exclusion against file system freezing. This * function increments number of writers preventing freezing. If the file * system is already frozen, the function waits until the file system is * thawed. * * Since freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. Generally, * freeze protection should be the outermost lock. In particular, we have: * * sb_start_write * -> i_mutex (write path, truncate, directory ops, ...) * -> s_umount (freeze_super, thaw_super) */ static inline void sb_start_write(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_WRITE); } static inline bool sb_start_write_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); } /** * sb_start_pagefault - get write access to a superblock from a page fault * @sb: the super we write to * * When a process starts handling write page fault, it should embed the * operation into sb_start_pagefault() - sb_end_pagefault() pair to get * exclusion against file system freezing. This is needed since the page fault * is going to dirty a page. This function increments number of running page * faults preventing freezing. If the file system is already frozen, the * function waits until the file system is thawed. * * Since page fault freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. It is advised to * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault * handling code implies lock dependency: * * mmap_lock * -> sb_start_pagefault */ static inline void sb_start_pagefault(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_start_intwrite - get write access to a superblock for internal fs purposes * @sb: the super we write to * * This is the third level of protection against filesystem freezing. It is * free for use by a filesystem. The only requirement is that it must rank * below sb_start_pagefault. * * For example filesystem can call sb_start_intwrite() when starting a * transaction which somewhat eases handling of freezing for internal sources * of filesystem changes (internal fs threads, discarding preallocation on file * close, etc.). */ static inline void sb_start_intwrite(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_FS); } static inline bool sb_start_intwrite_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_FS); } bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode); /* * VFS helper functions.. */ int vfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); int vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, struct dentry *, struct inode **); int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *); int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, struct inode **); /** * struct renamedata - contains all information required for renaming * @old_mnt_idmap: idmap of the old mount the inode was found from * @old_dir: parent of source * @old_dentry: source * @new_mnt_idmap: idmap of the new mount the inode was found from * @new_dir: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { struct mnt_idmap *old_mnt_idmap; struct inode *old_dir; struct dentry *old_dentry; struct mnt_idmap *new_mnt_idmap; struct inode *new_dir; struct dentry *new_dentry; struct inode **delegated_inode; unsigned int flags; } __randomize_layout; int vfs_rename(struct renamedata *); static inline int vfs_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); } struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred); struct file *kernel_file_open(const struct path *path, int flags, const struct cred *cred); int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), void *); int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #else #define compat_ptr_ioctl NULL #endif /* * VFS file helper functions. */ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* * This is the "filldir" function type, used by readdir() to let * the kernel specify what kind of dirent layout it wants to have. * This allows the kernel to read directories into kernel space or * to have different dirent layouts depending on the binary type. * Return 'true' to keep going and 'false' if there are no more entries. */ struct dir_context; typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, unsigned); struct dir_context { filldir_t actor; loff_t pos; }; /* * These flags let !MMU mmap() govern direct device mapping vs immediate * copying more easily for MAP_PRIVATE, especially for ROM filesystems. * * NOMMU_MAP_COPY: Copy can be mapped (MAP_PRIVATE) * NOMMU_MAP_DIRECT: Can be mapped directly (MAP_SHARED) * NOMMU_MAP_READ: Can be mapped for reading * NOMMU_MAP_WRITE: Can be mapped for writing * NOMMU_MAP_EXEC: Can be mapped for execution */ #define NOMMU_MAP_COPY 0x00000001 #define NOMMU_MAP_DIRECT 0x00000008 #define NOMMU_MAP_READ VM_MAYREAD #define NOMMU_MAP_WRITE VM_MAYWRITE #define NOMMU_MAP_EXEC VM_MAYEXEC #define NOMMU_VMFLAGS \ (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) /* * These flags control the behavior of the remap_file_range function pointer. * If it is called with len == 0 that means "remap to end of source file". * See Documentation/filesystems/vfs.rst for more details about this call. * * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate) * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request */ #define REMAP_FILE_DEDUP (1 << 0) #define REMAP_FILE_CAN_SHORTEN (1 << 1) /* * These flags signal that the caller is ok with altering various aspects of * the behavior of the remap operation. The changes must be made by the * implementation; the vfs remap helper functions can take advantage of them. * Flags in this category exist to preserve the quirky behavior of the hoisted * btrfs clone/dedupe ioctls. */ #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) /* * These flags control the behavior of vfs_copy_file_range(). * They are not available to the user via syscall. * * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops */ #define COPY_FILE_SPLICE (1 << 0) struct iov_iter; struct io_uring_cmd; struct offset_ctx; typedef unsigned int __bitwise fop_flags_t; struct file_operations { struct module *owner; fop_flags_t fop_flags; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, unsigned int flags); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); void (*splice_eof)(struct file *file); int (*setlease)(struct file *, int, struct file_lease **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); } __randomize_layout; /* Supports async buffered reads */ #define FOP_BUFFER_RASYNC ((__force fop_flags_t)(1 << 0)) /* Supports async buffered writes */ #define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1)) /* Supports synchronous page faults for mappings */ #define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2)) /* Supports non-exclusive O_DIRECT writes from multiple threads */ #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3)) /* Contains huge pages */ #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4)) /* Treat loff_t as unsigned (e.g., /dev/mem) */ #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) /* File system supports uncached read/write buffered IO */ #define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, int (*) (struct file *, struct dir_context *)); #define WRAP_DIR_ITER(x) \ static int shared_##x(struct file *file , struct dir_context *ctx) \ { return wrap_directory_iterator(file, ctx, x); } struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); int (*permission) (struct mnt_idmap *, struct inode *, int); struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); int (*readlink) (struct dentry *, char __user *,int); int (*create) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *, const char *); int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *, int); int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); struct offset_ctx *(*get_offset_ctx)(struct inode *inode); } ____cacheline_aligned; static inline int call_mmap(struct file *file, struct vm_area_struct *vma) { return file->f_op->mmap(file, vma); } extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *dax_read_ops); int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); /** * enum freeze_holder - holder of the freeze * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed * * Indicate who the owner of the freeze or thaw request is and whether * the freeze needs to be exclusive or can nest. * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the * same holder aren't allowed. It is however allowed to hold a single * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at * the same time. This is relied upon by some filesystems during online * repair or similar. */ enum freeze_holder { FREEZE_HOLDER_KERNEL = (1U << 0), FREEZE_HOLDER_USERSPACE = (1U << 1), FREEZE_MAY_NEST = (1U << 2), }; struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*free_inode)(struct inode *); void (*dirty_inode) (struct inode *, int flags); int (*write_inode) (struct inode *, struct writeback_control *wbc); int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); int (*freeze_super) (struct super_block *, enum freeze_holder who); int (*freeze_fs) (struct super_block *); int (*thaw_super) (struct super_block *, enum freeze_holder who); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct seq_file *, struct dentry *); #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); struct dquot __rcu **(*get_dquots)(struct inode *); #endif long (*nr_cached_objects)(struct super_block *, struct shrink_control *); long (*free_cached_objects)(struct super_block *, struct shrink_control *); void (*shutdown)(struct super_block *sb); }; /* * Inode flags - they have no relation to superblock flags now */ #define S_SYNC (1 << 0) /* Writes are synced at once */ #define S_NOATIME (1 << 1) /* Do not update access times */ #define S_APPEND (1 << 2) /* Append-only file */ #define S_IMMUTABLE (1 << 3) /* Immutable file */ #define S_DEAD (1 << 4) /* removed, but still open directory */ #define S_NOQUOTA (1 << 5) /* Inode is not counted to quota */ #define S_DIRSYNC (1 << 6) /* Directory modifications are synchronous */ #define S_NOCMTIME (1 << 7) /* Do not update file c/mtime */ #define S_SWAPFILE (1 << 8) /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE (1 << 9) /* Inode is fs-internal */ #define S_IMA (1 << 10) /* Inode has an associated IMA struct */ #define S_AUTOMOUNT (1 << 11) /* Automount/referral quasi-directory */ #define S_NOSEC (1 << 12) /* no suid or xattr security attributes */ #ifdef CONFIG_FS_DAX #define S_DAX (1 << 13) /* Direct Access, avoiding the page cache */ #else #define S_DAX 0 /* Make all the DAX code disappear */ #endif #define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */ #define S_CASEFOLD (1 << 15) /* Casefolded file */ #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */ #define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */ /* * Note that nosuid etc flags are inode-specific: setting some file-system * flags just means all the inodes inherit those flags by default. It might be * possible to override it selectively if you really wanted to with some * ioctl() that is not currently implemented. * * Exception: SB_RDONLY is always applied to the entire file system. * * Unfortunately, it is possible to change a filesystems flags with it mounted * with files in use. This means that all of the inodes will not have their * i_flags updated. Hence, i_flags no longer inherit the superblock mount * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; } #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) #define IS_DIRSYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \ ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) #define IS_MANDLOCK(inode) __IS_FLG(inode, SB_MANDLOCK) #define IS_NOATIME(inode) __IS_FLG(inode, SB_RDONLY|SB_NOATIME) #define IS_I_VERSION(inode) __IS_FLG(inode, SB_I_VERSION) #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) #ifdef CONFIG_FS_POSIX_ACL #define IS_POSIXACL(inode) __IS_FLG(inode, SB_POSIXACL) #else #define IS_POSIXACL(inode) 0 #endif #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) #ifdef CONFIG_SWAP #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) #else #define IS_SWAPFILE(inode) ((void)(inode), 0U) #endif #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_IMA(inode) ((inode)->i_flags & S_IMA) #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) #define IS_DAX(inode) ((inode)->i_flags & S_DAX) #define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED) #define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD) #define IS_VERITY(inode) ((inode)->i_flags & S_VERITY) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap, struct inode *inode) { return !vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)); } static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = filp->f_iocb_flags, .ki_ioprio = get_current_ioprio(), }; } static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = kiocb_src->ki_flags, .ki_ioprio = kiocb_src->ki_ioprio, .ki_pos = kiocb_src->ki_pos, }; } /* * Inode state bits. Protected by inode->i_lock * * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. * * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at * various stages of removing an inode. * * Two bits are used for locking and completion notification, I_NEW and I_SYNC. * * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on * fdatasync() (unless I_DIRTY_DATASYNC is also set). * Timestamp updates are the usual cause. * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of * these changes separately from I_DIRTY_SYNC so that we * don't have to write inode on fdatasync() when only * e.g. the timestamps have changed. * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. * I_DIRTY_TIME The inode itself has dirty timestamps, and the * lazytime mount option is enabled. We keep track of this * separately from I_DIRTY_SYNC in order to implement * lazytime. This gets cleared if I_DIRTY_INODE * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already * in place because writeback might already be in progress * and we don't want to lose the time update * I_NEW Serves as both a mutex and completion notification. * New inodes set I_NEW. If two processes both create * the same inode, one of them will release its inode and * wait for I_NEW to be released before returning. * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can * also cause waiting on I_NEW, without I_NEW actually * being set. find_inode() uses this to prevent returning * nearly-dead inodes. * I_WILL_FREE Must be set when calling write_inode_now() if i_count * is zero. I_FREEING must be set when I_WILL_FREE is * cleared. * I_FREEING Set when inode is about to be freed but still has dirty * pages or buffers attached or the inode itself is still * dirty. * I_CLEAR Added by clear_inode(). In this state the inode is * clean and can be destroyed. Inode keeps I_FREEING. * * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are * prohibited for many purposes. iget() must wait for * the inode to be completely released, then create it * anew. Other functions will just ignore such inodes, * if appropriate. I_NEW is used for waiting. * * I_SYNC Writeback of inode is running. The bit is set during * data writeback, and cleared with a wakeup on the bit * address once it is done. The bit is also used to pin * the inode in memory for flusher thread. * * I_REFERENCED Marks the inode as recently references on the LRU list. * * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell * wb stat updates to grab the i_pages lock. See * inode_switch_wbs_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper * and work dirs among overlayfs mounts. * * I_CREATING New object's inode in the middle of setting up. * * I_DONTCACHE Evict inode as soon as it is not used anymore. * * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. * Used to detect that mark_inode_dirty() should not move * inode between dirty lists. * * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. * * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding * i_count. * * Q: What is the difference between I_WILL_FREE and I_FREEING? * * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait * upon. There's one free address left. */ #define __I_NEW 0 #define I_NEW (1 << __I_NEW) #define __I_SYNC 1 #define I_SYNC (1 << __I_SYNC) #define __I_LRU_ISOLATING 2 #define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) #define I_DIRTY_SYNC (1 << 3) #define I_DIRTY_DATASYNC (1 << 4) #define I_DIRTY_PAGES (1 << 5) #define I_WILL_FREE (1 << 6) #define I_FREEING (1 << 7) #define I_CLEAR (1 << 8) #define I_REFERENCED (1 << 9) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) #define I_WB_SWITCH (1 << 12) #define I_OVL_INUSE (1 << 13) #define I_CREATING (1 << 14) #define I_DONTCACHE (1 << 15) #define I_SYNC_QUEUED (1 << 16) #define I_PINNING_NETFS_WB (1 << 17) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY); } static inline void mark_inode_dirty_sync(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY_SYNC); } /* * Returns true if the given inode itself only has dirty timestamps (its pages * may still be dirty) and isn't currently being allocated or freed. * Filesystems should call this if when writing an inode when lazytime is * enabled, they want to opportunistically write the timestamps of other inodes * located very nearby on-disk, e.g. in the same inode block. This returns true * if the given inode is in need of such an opportunistic update. Requires * i_lock, or at least later re-checking under i_lock. */ static inline bool inode_is_dirtytime_only(struct inode *inode) { return (inode->i_state & (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; } extern void inc_nlink(struct inode *inode); extern void drop_nlink(struct inode *inode); extern void clear_nlink(struct inode *inode); extern void set_nlink(struct inode *inode, unsigned int nlink); static inline void inode_inc_link_count(struct inode *inode) { inc_nlink(inode); mark_inode_dirty(inode); } static inline void inode_dec_link_count(struct inode *inode) { drop_nlink(inode); mark_inode_dirty(inode); } enum file_time_flags { S_ATIME = 1, S_MTIME = 2, S_CTIME = 4, S_VERSION = 8, }; extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); int inode_update_time(struct inode *inode, int flags); static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME)) touch_atime(&file->f_path); } extern int file_modified(struct file *file); int kiocb_modified(struct kiocb *iocb); int sync_inode_metadata(struct inode *inode, int wait); struct file_system_type { const char *name; int fs_flags; #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; struct hlist_head fs_supers; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; struct lock_class_key s_vfs_rename_key; struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key invalidate_lock_key; struct lock_class_key i_mutex_dir_key; }; #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) /** * is_mgtime: is this inode using multigrain timestamps * @inode: inode to test for multigrain timestamps * * Return true if the inode uses multigrain timestamps, false otherwise. */ static inline bool is_mgtime(const struct inode *inode) { return inode->i_opflags & IOP_MGTIME; } extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_single(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); int set_anon_super_fc(struct super_block *s, struct fs_context *fc); int get_anon_bdev(dev_t *); void free_anon_bdev(dev_t); struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data); struct super_block *sget_dev(struct fs_context *fc, dev_t dev); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ #define fops_get(fops) ({ \ const struct file_operations *_fops = (fops); \ (((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL)); \ }) #define fops_put(fops) ({ \ const struct file_operations *_fops = (fops); \ if (_fops) \ module_put((_fops)->owner); \ }) /* * This one is to be used *ONLY* from ->open() instances. * fops must be non-NULL, pinned down *and* module dependencies * should be sufficient to pin the caller down as well. */ #define replace_fops(f, fops) \ do { \ struct file *__file = (f); \ fops_put(__file->f_op); \ BUG_ON(!(__file->f_op = (fops))); \ } while(0) extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); int freeze_super(struct super_block *super, enum freeze_holder who); int thaw_super(struct super_block *super, enum freeze_holder who); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len) { if (WARN_ON(len > sizeof(sb->s_uuid))) len = sizeof(sb->s_uuid); sb->s_uuid_len = len; memcpy(&sb->s_uuid, uuid, len); } /* set sb sysfs name based on sb->s_bdev */ static inline void super_set_sysfs_name_bdev(struct super_block *sb) { snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev); } /* set sb sysfs name based on sb->s_uuid */ static inline void super_set_sysfs_name_uuid(struct super_block *sb) { WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid)); snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b); } /* set sb sysfs name based on sb->s_id */ static inline void super_set_sysfs_name_id(struct super_block *sb) { strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name)); } /* try to use something standard before you use this */ __printf(2, 3) static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, args); va_end(args); } extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); int inode_update_timestamps(struct inode *inode, int flags); int generic_update_time(struct inode *, int); /* /sys/fs */ extern struct kobject *fs_kobj; #define MAX_RW_COUNT (INT_MAX & PAGE_MASK) /* fs/open.c */ struct audit_names; struct filename { const char *name; /* pointer to actual string */ const __user char *uptr; /* original userland pointer */ atomic_t refcnt; struct audit_names *aname; const char iname[]; }; static_assert(offsetof(struct filename, iname) % sizeof(long) == 0); static inline struct mnt_idmap *file_mnt_idmap(const struct file *file) { return mnt_idmap(file->f_path.mnt); } /** * is_idmapped_mnt - check whether a mount is mapped * @mnt: the mount to check * * If @mnt has an non @nop_mnt_idmap attached to it then @mnt is mapped. * * Return: true if mount is mapped, false if not. */ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) { return mnt_idmap(mnt) != &nop_mnt_idmap; } extern long vfs_truncate(const struct path *, loff_t); int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(const struct path *, const char *, int, umode_t); static inline struct file *file_open_root_mnt(struct vfsmount *mnt, const char *name, int flags, umode_t mode) { return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root}, name, flags, mode); } struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); struct path *backing_file_user_path(struct file *f); /* * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying * filesystem. When the mapped file path and inode number are displayed to * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the * path and inode number to display to the user, which is the path of the fd * that user has requested to map and the inode number that would be returned * by fstat() on that same fd. */ /* Get the path to display in /proc/<pid>/maps */ static inline const struct path *file_user_path(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return backing_file_user_path(f); return &f->f_path; } /* Get the inode whose inode number to display in /proc/<pid>/maps */ static inline const struct inode *file_user_inode(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return d_inode(backing_file_user_path(f)->dentry); return file_inode(f); } static inline struct file *file_clone_open(struct file *file) { return dentry_open(&file->f_path, file->f_flags, file->f_cred); } extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); extern struct filename *getname(const char __user *); extern struct filename *getname_kernel(const char *); extern struct filename *__getname_maybe_null(const char __user *); static inline struct filename *getname_maybe_null(const char __user *name, int flags) { if (!(flags & AT_EMPTY_PATH)) return getname(name); if (!name) return NULL; return __getname_maybe_null(name); } extern void putname(struct filename *name); extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); /* Helper for the simple case when original dentry is used */ static inline int finish_open_simple(struct file *file, int error) { if (error) return error; return finish_open(file, file->f_path.dentry, NULL); } /* fs/dcache.c */ extern void __init vfs_caches_init_early(void); extern void __init vfs_caches_init(void); extern struct kmem_cache *names_cachep; #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) extern struct super_block *blockdev_superblock; static inline bool sb_is_blkdev_sb(struct super_block *sb) { return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; } void emergency_thaw_all(void); extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; /* fs/char_dev.c */ #define CHRDEV_MAJOR_MAX 512 /* Marks the bottom of the first segment of free char majors */ #define CHRDEV_MAJOR_DYN_END 234 /* Marks the top and bottom of the second segment of free char majors */ #define CHRDEV_MAJOR_DYN_EXT_START 511 #define CHRDEV_MAJOR_DYN_EXT_END 384 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); extern int register_chrdev_region(dev_t, unsigned, const char *); extern int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops); extern void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name); extern void unregister_chrdev_region(dev_t, unsigned); extern void chrdev_show(struct seq_file *,off_t); static inline int register_chrdev(unsigned int major, const char *name, const struct file_operations *fops) { return __register_chrdev(major, 0, 256, name, fops); } static inline void unregister_chrdev(unsigned int major, const char *name) { __unregister_chrdev(major, 0, 256, name); } extern void init_special_inode(struct inode *, umode_t, dev_t); /* Invalid inode operations -- fs/bad_inode.c */ extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, loff_t end); static inline int file_write_and_wait(struct file *file) { return file_write_and_wait_range(file, 0, LLONG_MAX); } extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, unsigned int flags); static inline bool iocb_is_dsync(const struct kiocb *iocb) { return (iocb->ki_flags & IOCB_DSYNC) || IS_SYNC(iocb->ki_filp->f_mapping->host); } /* * Sync the bytes written if this was a synchronous write. Expect ki_pos * to already be updated for the write, and will return either the amount * of bytes passed in, or an error if syncing the file failed. */ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) { if (iocb_is_dsync(iocb)) { int ret = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count, iocb->ki_pos - 1, (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); if (ret) return ret; } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, iocb->ki_pos + count); } return count; } extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK extern int bmap(struct inode *inode, sector_t *block); #else static inline int bmap(struct inode *inode, sector_t *block) { return -EINVAL; } #endif int notify_change(struct mnt_idmap *, struct dentry *, struct iattr *, struct inode **); int inode_permission(struct mnt_idmap *, struct inode *, int); int generic_permission(struct mnt_idmap *, struct inode *, int); static inline int file_permission(struct file *file, int mask) { return inode_permission(file_mnt_idmap(file), file_inode(file), mask); } static inline int path_permission(const struct path *path, int mask) { return inode_permission(mnt_idmap(path->mnt), d_inode(path->dentry), mask); } int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) { return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); } static inline bool inode_wrong_type(const struct inode *inode, umode_t mode) { return (inode->i_mode ^ mode) & S_IFMT; } /** * file_start_write - get write access to a superblock for regular file io * @file: the file we want to write to * * This is a variant of sb_start_write() which is a noop on non-regualr file. * Should be matched with a call to file_end_write(). */ static inline void file_start_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_start_write(file_inode(file)->i_sb); } static inline bool file_start_write_trylock(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_start_write_trylock(file_inode(file)->i_sb); } /** * file_end_write - drop write access to a superblock of a regular file * @file: the file we wrote to * * Should be matched with a call to file_start_write(). */ static inline void file_end_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_end_write(file_inode(file)->i_sb); } /** * kiocb_start_write - get write access to a superblock for async file io * @iocb: the io context we want to submit the write with * * This is a variant of sb_start_write() for async io submission. * Should be matched with a call to kiocb_end_write(). */ static inline void kiocb_start_write(struct kiocb *iocb) { struct inode *inode = file_inode(iocb->ki_filp); sb_start_write(inode->i_sb); /* * Fool lockdep by telling it the lock got released so that it * doesn't complain about the held lock when we return to userspace. */ __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); } /** * kiocb_end_write - drop write access to a superblock after async file io * @iocb: the io context we sumbitted the write with * * Should be matched with a call to kiocb_start_write(). */ static inline void kiocb_end_write(struct kiocb *iocb) { struct inode *inode = file_inode(iocb->ki_filp); /* * Tell lockdep we inherited freeze protection from submission thread. */ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); sb_end_write(inode->i_sb); } /* * This is used for regular files where some users -- especially the * currently executed binary in a process, previously handled via * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap * read-write shared) accesses. * * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. * deny_write_access() denies write access to a file. * allow_write_access() re-enables write access to a file. * * The i_writecount field of an inode can have the following values: * 0: no write access, no denied write access * < 0: (-i_writecount) users that denied write access to the file. * > 0: (i_writecount) users that have write access to the file. * * Normally we operate on that counter with atomic_{inc,dec} and it's safe * except for the cases where we don't hold i_writecount yet. Then we need to * use {get,deny}_write_access() - these functions check the sign and refuse * to do the change if sign is wrong. */ static inline int get_write_access(struct inode *inode) { return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline int deny_write_access(struct file *file) { struct inode *inode = file_inode(file); return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); } static inline void allow_write_access(struct file *file) { if (file) atomic_inc(&file_inode(file)->i_writecount); } /* * Do not prevent write to executable file when watched by pre-content events. * * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at * the time of file open and remains constant for entire lifetime of the file, * so if pre-content watches are added post execution or removed before the end * of the execution, it will not cause i_writecount reference leak. */ static inline int exe_file_deny_write_access(struct file *exe_file) { if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode))) return 0; return deny_write_access(exe_file); } static inline void exe_file_allow_write_access(struct file *exe_file) { if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode))) return; allow_write_access(exe_file); } static inline void file_set_fsnotify_mode(struct file *file, fmode_t mode) { file->f_mode &= ~FMODE_FSNOTIFY_MASK; file->f_mode |= mode; } static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; } #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) static inline void i_readcount_dec(struct inode *inode) { BUG_ON(atomic_dec_return(&inode->i_readcount) < 0); } static inline void i_readcount_inc(struct inode *inode) { atomic_inc(&inode->i_readcount); } #else static inline void i_readcount_dec(struct inode *inode) { return; } static inline void i_readcount_inc(struct inode *inode) { return; } #endif extern int do_pipe_flags(int *, int); extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *); ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos); extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *); extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ extern bool is_subdir(struct dentry *, struct dentry *); extern bool path_is_under(const struct path *, const struct path *); extern char *file_path(struct file *, char *, int); /** * is_dot_dotdot - returns true only if @name is "." or ".." * @name: file name to check * @len: length of file name, in bytes */ static inline bool is_dot_dotdot(const char *name, size_t len) { return len && unlikely(name[0] == '.') && (len == 1 || (len == 2 && name[1] == '.')); } #include <linux/err.h> /* needed for stackable file system support */ extern loff_t default_llseek(struct file *file, loff_t offset, int whence); extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t); static inline int inode_init_always(struct super_block *sb, struct inode *inode) { return inode_init_always_gfp(sb, inode, GFP_NOFS); } extern void inode_init_once(struct inode *); extern void address_space_init_once(struct address_space *mapping); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); extern int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } extern void d_mark_dontcache(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup(struct super_block *sb, unsigned long ino); extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); struct inode *iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); struct inode *iget5_locked_rcu(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); extern struct inode * iget_locked(struct super_block *, unsigned long); extern struct inode *find_inode_nowait(struct super_block *, unsigned long, int (*match)(struct inode *, unsigned long, void *), void *data); extern struct inode *find_inode_rcu(struct super_block *, unsigned long, int (*)(struct inode *, void *), void *); extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void lockdep_annotate_inode_mutex_key(struct inode *inode); #else static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif extern void unlock_new_inode(struct inode *); extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb); void dump_mapping(const struct address_space *); /* * Userspace may rely on the inode number being non-zero. For example, glibc * simply ignores files with zero i_ino in unlink() and other places. * * As an additional complication, if userspace was compiled with * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the * lower 32 bits, so we need to check that those aren't zero explicitly. With * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but * better safe than sorry. */ static inline bool is_zero_ino(ino_t ino) { return (u32)ino == 0; } /* * inode->i_lock must be held */ static inline void __iget(struct inode *inode) { atomic_inc(&inode->i_count); } extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); /* * This must be used for allocating filesystems specific inodes to set * up the inode reclaim context correctly. */ #define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } extern void inode_sb_list_add(struct inode *inode); extern void inode_add_lru(struct inode *inode); extern int sb_set_blocksize(struct super_block *, int); extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); int generic_write_checks_count(struct kiocb *iocb, loff_t *count); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); ssize_t generic_perform_write(struct kiocb *, struct iov_iter *); ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, ssize_t direct_written, ssize_t buffered_written); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter); ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter); /* fs/splice.c */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); ssize_t copy_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, loff_t offset, int whence); extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof); loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, u64 *cookie); extern loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size); extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); extern loff_t no_seek_end_llseek(struct file *, loff_t, int); int rw_verify_area(int, struct file *, const loff_t *, size_t); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); extern int stream_open(struct inode * inode, struct file * filp); #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(struct bio *bio, struct inode *inode, loff_t file_offset); enum { /* need locking between buffered and direct access */ DIO_LOCKING = 0x01, /* filesystem does not support filling holes */ DIO_SKIP_HOLES = 0x02, }; ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, dio_iodone_t end_io, int flags); static inline ssize_t blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct iov_iter *iter, get_block_t get_block) { return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } #endif bool inode_dio_finished(const struct inode *inode); void inode_dio_wait(struct inode *inode); void inode_dio_wait_interruptible(struct inode *inode); /** * inode_dio_begin - signal start of a direct I/O requests * @inode: inode the direct I/O happens on * * This is called once we've finished processing a direct I/O request, * and is used to wake up callers waiting for direct I/O to be quiesced. */ static inline void inode_dio_begin(struct inode *inode) { atomic_inc(&inode->i_dio_count); } /** * inode_dio_end - signal finish of a direct I/O requests * @inode: inode the direct I/O happens on * * This is called once we've finished processing a direct I/O request, * and is used to wake up callers waiting for direct I/O to be quiesced. */ static inline void inode_dio_end(struct inode *inode) { if (atomic_dec_and_test(&inode->i_dio_count)) wake_up_var(&inode->i_dio_count); } extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, unsigned int unit_max); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); void inode_add_bytes(struct inode *inode, loff_t bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); static inline loff_t __inode_get_bytes(struct inode *inode) { return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; } loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); const char *simple_get_link(struct dentry *, struct inode *, struct delayed_call *); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags); int vfs_fstat(int fd, struct kstat *stat); static inline int vfs_stat(const char __user *filename, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, filename, stat, 0); } static inline int vfs_lstat(const char __user *name, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); } extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, struct dir_context *); extern int simple_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern int simple_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); extern int simple_nosetlease(struct file *, int, struct file_lease **, void **); extern const struct dentry_operations simple_dentry_operations; extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); extern const struct file_operations simple_dir_operations; extern const struct inode_operations simple_dir_inode_operations; extern void make_empty_dir_inode(struct inode *inode); extern bool is_empty_dir_inode(struct inode *inode); struct tree_descr { const char *name; const struct file_operations *ops; int mode; }; struct dentry *d_alloc_name(struct dentry *, const char *); extern int simple_fill_super(struct super_block *, unsigned long, const struct tree_descr *); extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); struct offset_ctx { struct maple_tree mt; unsigned long next_offset; }; void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); void simple_offset_destroy(struct offset_ctx *octx); extern const struct file_operations simple_offset_dir_operations; extern int __generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); extern void generic_set_sb_d_ops(struct super_block *sb); extern int generic_ci_match(const struct inode *parent, const struct qstr *name, const struct qstr *folded_name, const u8 *de_name, u32 de_name_len); #if IS_ENABLED(CONFIG_UNICODE) int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /** * generic_ci_validate_strict_name - Check if a given name is suitable * for a directory * * This functions checks if the proposed filename is valid for the * parent directory. That means that only valid UTF-8 filenames will be * accepted for casefold directories from filesystems created with the * strict encoding flag. That also means that any name will be * accepted for directories that doesn't have casefold enabled, or * aren't being strict with the encoding. * * @dir: inode of the directory where the new file will be created * @name: name of the new file * * Return: * * True: if the filename is suitable for this directory. It can be * true if a given name is not suitable for a strict encoding * directory, but the directory being used isn't strict * * False if the filename isn't suitable for this directory. This only * happens when a directory is casefolded and the filesystem is strict * about its encoding. */ static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) { if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb)) return true; /* * A casefold dir must have a encoding set, unless the filesystem * is corrupted */ if (WARN_ON_ONCE(!dir->i_sb->s_encoding)) return true; return !utf8_validate(dir->i_sb->s_encoding, name); } #else static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) { return true; } #endif static inline bool sb_has_encoding(const struct super_block *sb) { #if IS_ENABLED(CONFIG_UNICODE) return !!sb->s_encoding; #else return false; #endif } int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); void setattr_copy(struct mnt_idmap *, struct inode *inode, const struct iattr *attr); extern int file_update_time(struct file *file); static inline bool vma_is_dax(const struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); } static inline bool vma_is_fsdax(struct vm_area_struct *vma) { struct inode *inode; if (!IS_ENABLED(CONFIG_FS_DAX) || !vma->vm_file) return false; if (!vma_is_dax(vma)) return false; inode = file_inode(vma->vm_file); if (S_ISCHR(inode->i_mode)) return false; /* device-dax */ return true; } static inline int iocb_flags(struct file *file) { int res = 0; if (file->f_flags & O_APPEND) res |= IOCB_APPEND; if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; if (file->f_flags & O_DSYNC) res |= IOCB_DSYNC; if (file->f_flags & __O_SYNC) res |= IOCB_SYNC; return res; } static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, int rw_type) { int kiocb_flags = 0; /* make sure there's no overlap between RWF and private IOCB flags */ BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD); if (!flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND))) return -EINVAL; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; } if (flags & RWF_ATOMIC) { if (rw_type != WRITE) return -EOPNOTSUPP; if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } if (flags & RWF_DONTCACHE) { /* file system must support it */ if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) return -EOPNOTSUPP; /* DAX mappings not supported */ if (IS_DAX(ki->ki_filp->f_mapping->host)) return -EOPNOTSUPP; } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) { if (IS_APPEND(file_inode(ki->ki_filp))) return -EPERM; ki->ki_flags &= ~IOCB_APPEND; } ki->ki_flags |= kiocb_flags; return 0; } /* Transaction based IO helpers */ /* * An argresp is stored in an allocated page and holds the * size of the argument or response, along with its content */ struct simple_transaction_argresp { ssize_t size; char data[]; }; #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) char *simple_transaction_get(struct file *file, const char __user *buf, size_t size); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos); int simple_transaction_release(struct inode *inode, struct file *file); void simple_transaction_set(struct file *file, size_t n); /* * simple attribute files * * These attributes behave similar to those in sysfs: * * Writing to an attribute immediately sets a value, an open file can be * written to multiple times. * * Reading from an attribute creates a buffer from the value that might get * read with multiple read calls. When the attribute has been read * completely, no further read calls are possible until the file is opened * again. * * All attributes contain a text representation of a numeric value * that are accessed with the get() and set() functions. */ #define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ return simple_attr_open(inode, file, __get, __set, __fmt); \ } \ static const struct file_operations __fops = { \ .owner = THIS_MODULE, \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = simple_attr_read, \ .write = (__is_signed) ? simple_attr_write_signed : simple_attr_write, \ .llseek = generic_file_llseek, \ } #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) #define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) { /* don't do anything, just let the compiler check the arguments; */ } int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt); int simple_attr_release(struct inode *inode, struct file *file); ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) #define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) static inline bool is_sxid(umode_t mode) { return mode & (S_ISUID | S_ISGID); } static inline int check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { if (!(dir->i_mode & S_ISVTX)) return 0; return __check_sticky(idmap, dir, inode); } static inline void inode_has_no_xattr(struct inode *inode) { if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & SB_NOSEC)) inode->i_flags |= S_NOSEC; } static inline bool is_root_inode(struct inode *inode) { return inode == inode->i_sb->s_root->d_inode; } static inline bool dir_emit(struct dir_context *ctx, const char *name, int namelen, u64 ino, unsigned type) { return ctx->actor(ctx, name, namelen, ctx->pos, ino, type); } static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, ".", 1, ctx->pos, file->f_path.dentry->d_inode->i_ino, DT_DIR); } static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, "..", 2, ctx->pos, d_parent_ino(file->f_path.dentry), DT_DIR); } static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) { if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) return false; ctx->pos = 1; } if (ctx->pos == 1) { if (!dir_emit_dotdot(file, ctx)) return false; ctx->pos = 2; } return true; } static inline bool dir_relax(struct inode *inode) { inode_unlock(inode); inode_lock(inode); return !IS_DEADDIR(inode); } static inline bool dir_relax_shared(struct inode *inode) { inode_unlock_shared(inode); inode_lock_shared(inode); return !IS_DEADDIR(inode); } extern bool path_noexec(const struct path *path); extern void inode_nohighmem(struct inode *inode); /* mm/fadvise.c */ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice); extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); static inline bool vfs_empty_path(int dfd, const char __user *path) { char c; if (dfd < 0) return false; /* We now allow NULL to be used for empty path. */ if (!path) return true; if (unlikely(get_user(c, path))) return false; return !c; } int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); #endif /* _LINUX_FS_H */ |
| 200 201 203 203 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BACKING_DEV_DEFS_H #define __LINUX_BACKING_DEV_DEFS_H #include <linux/list.h> #include <linux/radix-tree.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/percpu_counter.h> #include <linux/percpu-refcount.h> #include <linux/flex_proportions.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/refcount.h> struct page; struct device; struct dentry; /* * Bits in bdi_writeback.state */ enum wb_state { WB_registered, /* bdi_register() was done */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ WB_start_all, /* nr_pages == 0 (all) work pending */ }; enum wb_stat_item { WB_RECLAIMABLE, WB_WRITEBACK, WB_DIRTIED, WB_WRITTEN, NR_WB_STAT_ITEMS }; #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) /* * why some writeback work was initiated */ enum wb_reason { WB_REASON_BACKGROUND, WB_REASON_VMSCAN, WB_REASON_SYNC, WB_REASON_PERIODIC, WB_REASON_LAPTOP_TIMER, WB_REASON_FS_FREE_SPACE, /* * There is no bdi forker thread any more and works are done * by emergency worker, however, this is TPs userland visible * and we'll be exposing exactly the same information, * so it has a mismatch name. */ WB_REASON_FORKER_THREAD, WB_REASON_FOREIGN_FLUSH, WB_REASON_MAX, }; struct wb_completion { atomic_t cnt; wait_queue_head_t *waitq; }; #define __WB_COMPLETION_INIT(_waitq) \ (struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) } /* * If one wants to wait for one or more wb_writeback_works, each work's * ->done should be set to a wb_completion defined using the following * macro. Once all work items are issued with wb_queue_work(), the caller * can wait for the completion of all using wb_wait_for_completion(). Work * items which are waited upon aren't freed automatically on completion. */ #define WB_COMPLETION_INIT(bdi) __WB_COMPLETION_INIT(&(bdi)->wb_waitq) #define DEFINE_WB_COMPLETION(cmpl, bdi) \ struct wb_completion cmpl = WB_COMPLETION_INIT(bdi) /* * Each wb (bdi_writeback) can perform writeback operations, is measured * and throttled, independently. Without cgroup writeback, each bdi * (bdi_writeback) is served by its embedded bdi->wb. * * On the default hierarchy, blkcg implicitly enables memcg. This allows * using memcg's page ownership for attributing writeback IOs, and every * memcg - blkcg combination can be served by its own wb by assigning a * dedicated wb to each memcg, which enables isolation across different * cgroups and propagation of IO back pressure down from the IO layer upto * the tasks which are generating the dirty pages to be written back. * * A cgroup wb is indexed on its bdi by the ID of the associated memcg, * refcounted with the number of inodes attached to it, and pins the memcg * and the corresponding blkcg. As the corresponding blkcg for a memcg may * change as blkcg is disabled and enabled higher up in the hierarchy, a wb * is tested for blkcg after lookup and removed from index on mismatch so * that a new wb for the combination can be created. * * Each bdi_writeback that is not embedded into the backing_dev_info must hold * a reference to the parent backing_dev_info. See cgwb_create() for details. */ struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ unsigned long state; /* Always use atomic bitops on this */ unsigned long last_old_flush; /* last old data flush */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ atomic_t writeback_inodes; /* number of inodes under writeback */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ unsigned long write_bandwidth; /* the estimated write bandwidth */ unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */ /* * The base dirty throttle rate, re-calculated on every 200ms. * All the bdi tasks' dirty rate will be curbed under it. * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit * in small steps and is much more smooth/stable than the latter. */ unsigned long dirty_ratelimit; unsigned long balanced_dirty_ratelimit; struct fprop_local_percpu completions; int dirty_exceeded; enum wb_reason start_all_reason; spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ struct list_head bdi_node; /* anchored at bdi->wb_list */ #ifdef CONFIG_CGROUP_WRITEBACK struct percpu_ref refcnt; /* used only for !root wb's */ struct fprop_local_percpu memcg_completions; struct cgroup_subsys_state *memcg_css; /* the associated memcg */ struct cgroup_subsys_state *blkcg_css; /* and blkcg */ struct list_head memcg_node; /* anchored at memcg->cgwb_list */ struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ struct list_head b_attached; /* attached inodes, protected by list_lock */ struct list_head offline_node; /* anchored at offline_cgwbs */ union { struct work_struct release_work; struct rcu_head rcu; }; #endif }; struct backing_dev_info { u64 id; struct rb_node rb_node; /* keyed by ->id */ struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_SIZE units */ unsigned long io_pages; /* max allowed IO size */ struct kref refcnt; /* Reference counter for the structure */ unsigned int capabilities; /* Device capabilities */ unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; /* * Sum of avg_write_bw of wbs with dirty inodes. > 0 if there are * any dirty wbs, which is depended upon by bdi_has_dirty(). */ atomic_long_t tot_write_bandwidth; /* * Jiffies when last process was dirty throttled on this bdi. Used by * blk-wbt. */ unsigned long last_bdp_sleep; struct bdi_writeback wb; /* the root writeback info for this bdi */ struct list_head wb_list; /* list of all wbs */ #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */ struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */ #endif wait_queue_head_t wb_waitq; struct device *dev; char dev_name[64]; struct device *owner; struct timer_list laptop_mode_wb_timer; #ifdef CONFIG_DEBUG_FS struct dentry *debug_dir; #endif }; struct wb_lock_cookie { bool locked; unsigned long flags; }; #ifdef CONFIG_CGROUP_WRITEBACK /** * wb_tryget - try to increment a wb's refcount * @wb: bdi_writeback to get */ static inline bool wb_tryget(struct bdi_writeback *wb) { if (wb != &wb->bdi->wb) return percpu_ref_tryget(&wb->refcnt); return true; } /** * wb_get - increment a wb's refcount * @wb: bdi_writeback to get */ static inline void wb_get(struct bdi_writeback *wb) { if (wb != &wb->bdi->wb) percpu_ref_get(&wb->refcnt); } /** * wb_put - decrement a wb's refcount * @wb: bdi_writeback to put * @nr: number of references to put */ static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr) { if (WARN_ON_ONCE(!wb->bdi)) { /* * A driver bug might cause a file to be removed before bdi was * initialized. */ return; } if (wb != &wb->bdi->wb) percpu_ref_put_many(&wb->refcnt, nr); } /** * wb_put - decrement a wb's refcount * @wb: bdi_writeback to put */ static inline void wb_put(struct bdi_writeback *wb) { wb_put_many(wb, 1); } /** * wb_dying - is a wb dying? * @wb: bdi_writeback of interest * * Returns whether @wb is unlinked and being drained. */ static inline bool wb_dying(struct bdi_writeback *wb) { return percpu_ref_is_dying(&wb->refcnt); } #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool wb_tryget(struct bdi_writeback *wb) { return true; } static inline void wb_get(struct bdi_writeback *wb) { } static inline void wb_put(struct bdi_writeback *wb) { } static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr) { } static inline bool wb_dying(struct bdi_writeback *wb) { return false; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* __LINUX_BACKING_DEV_DEFS_H */ |
| 23 24 24 24 23 5 6 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Routing netlink socket interface: protocol independent part. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Vitaly E. Lavrov RTA_OK arithmetic was wrong. */ #include <linux/bitops.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/security.h> #include <linux/mutex.h> #include <linux/if_addr.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/pci.h> #include <linux/etherdevice.h> #include <linux/bpf.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> #include <net/route.h> #include <net/udp.h> #include <net/tcp.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <net/fib_rules.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/devlink.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/addrconf.h> #endif #include <linux/dpll.h> #include "dev.h" #define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 44 struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; struct module *owner; unsigned int flags; struct rcu_head rcu; }; static DEFINE_MUTEX(rtnl_mutex); void rtnl_lock(void) { mutex_lock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_lock); int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); } static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) { if (head && tail) { tail->next = defer_kfree_skb_list; defer_kfree_skb_list = head; } } EXPORT_SYMBOL(rtnl_kfree_skbs); void __rtnl_unlock(void) { struct sk_buff *head = defer_kfree_skb_list; defer_kfree_skb_list = NULL; /* Ensure that we didn't actually add any TODO item when __rtnl_unlock() * is used. In some places, e.g. in cfg80211, we have code that will do * something like * rtnl_lock() * wiphy_lock() * ... * rtnl_unlock() * * and because netdev_run_todo() acquires the RTNL for items on the list * we could cause a situation such as this: * Thread 1 Thread 2 * rtnl_lock() * unregister_netdevice() * __rtnl_unlock() * rtnl_lock() * wiphy_lock() * rtnl_unlock() * netdev_run_todo() * __rtnl_unlock() * * // list not empty now * // because of thread 2 * rtnl_lock() * while (!list_empty(...)) * rtnl_lock() * wiphy_lock() * **** DEADLOCK **** * * However, usage of __rtnl_unlock() is rare, and so we can ensure that * it's not used in cases where something is added to do the list. */ WARN_ON(!list_empty(&net_todo_list)); mutex_unlock(&rtnl_mutex); while (head) { struct sk_buff *next = head->next; kfree_skb(head); cond_resched(); head = next; } } void rtnl_unlock(void) { /* This fellow will unlock it for us. */ netdev_run_todo(); } EXPORT_SYMBOL(rtnl_unlock); int rtnl_trylock(void) { return mutex_trylock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_trylock); int rtnl_is_locked(void) { return mutex_is_locked(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_is_locked); bool refcount_dec_and_rtnl_lock(refcount_t *r) { return refcount_dec_and_mutex_lock(r, &rtnl_mutex); } EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); #ifdef CONFIG_PROVE_LOCKING bool lockdep_rtnl_is_held(void) { return lockdep_is_held(&rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ #ifdef CONFIG_DEBUG_NET_SMALL_RTNL void __rtnl_net_lock(struct net *net) { ASSERT_RTNL(); mutex_lock(&net->rtnl_mutex); } EXPORT_SYMBOL(__rtnl_net_lock); void __rtnl_net_unlock(struct net *net) { ASSERT_RTNL(); mutex_unlock(&net->rtnl_mutex); } EXPORT_SYMBOL(__rtnl_net_unlock); void rtnl_net_lock(struct net *net) { rtnl_lock(); __rtnl_net_lock(net); } EXPORT_SYMBOL(rtnl_net_lock); void rtnl_net_unlock(struct net *net) { __rtnl_net_unlock(net); rtnl_unlock(); } EXPORT_SYMBOL(rtnl_net_unlock); int rtnl_net_trylock(struct net *net) { int ret = rtnl_trylock(); if (ret) __rtnl_net_lock(net); return ret; } EXPORT_SYMBOL(rtnl_net_trylock); int rtnl_net_lock_killable(struct net *net) { int ret = rtnl_lock_killable(); if (!ret) __rtnl_net_lock(net); return ret; } static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) { if (net_eq(net_a, net_b)) return 0; /* always init_net first */ if (net_eq(net_a, &init_net)) return -1; if (net_eq(net_b, &init_net)) return 1; /* otherwise lock in ascending order */ return net_a < net_b ? -1 : 1; } int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { const struct net *net_a, *net_b; net_a = container_of(a, struct net, rtnl_mutex.dep_map); net_b = container_of(b, struct net, rtnl_mutex.dep_map); return rtnl_net_cmp_locks(net_a, net_b); } bool rtnl_net_is_locked(struct net *net) { return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex); } EXPORT_SYMBOL(rtnl_net_is_locked); bool lockdep_rtnl_net_is_held(struct net *net) { return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_net_is_held); #else static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) { /* No need to swap */ return -1; } #endif struct rtnl_nets { /* ->newlink() needs to freeze 3 netns at most; * 2 for the new device, 1 for its peer. */ struct net *net[3]; unsigned char len; }; static void rtnl_nets_init(struct rtnl_nets *rtnl_nets) { memset(rtnl_nets, 0, sizeof(*rtnl_nets)); } static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets) { int i; for (i = 0; i < rtnl_nets->len; i++) { put_net(rtnl_nets->net[i]); rtnl_nets->net[i] = NULL; } rtnl_nets->len = 0; } /** * rtnl_nets_add - Add netns to be locked before ->newlink(). * * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net(). * @net: netns pointer with an extra refcnt held. * * The extra refcnt is released in rtnl_nets_destroy(). */ static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net) { int i; DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net)); for (i = 0; i < rtnl_nets->len; i++) { switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) { case 0: put_net(net); return; case 1: swap(rtnl_nets->net[i], net); } } rtnl_nets->net[i] = net; rtnl_nets->len++; } static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets) { int i; rtnl_lock(); for (i = 0; i < rtnl_nets->len; i++) __rtnl_net_lock(rtnl_nets->net[i]); } static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets) { int i; for (i = 0; i < rtnl_nets->len; i++) __rtnl_net_unlock(rtnl_nets->net[i]); rtnl_unlock(); } static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { int msgindex = msgtype - RTM_BASE; /* * msgindex < 0 implies someone tried to register a netlink * control code. msgindex >= RTM_NR_MSGTYPES may indicate that * the message type has not been added to linux/rtnetlink.h */ BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); return msgindex; } static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) { struct rtnl_link __rcu **tab; if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) protocol = PF_UNSPEC; tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); if (!tab) tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); return rcu_dereference_rtnl(tab[msgtype]); } static int rtnl_register_internal(struct module *owner, int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { struct rtnl_link *link, *old; struct rtnl_link __rcu **tab; int msgindex; int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (tab == NULL) { tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); if (!tab) goto unlock; /* ensures we see the 0 stores */ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } old = rtnl_dereference(tab[msgindex]); if (old) { link = kmemdup(old, sizeof(*old), GFP_KERNEL); if (!link) goto unlock; } else { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) goto unlock; } WARN_ON(link->owner && link->owner != owner); link->owner = owner; WARN_ON(doit && link->doit && link->doit != doit); if (doit) link->doit = doit; WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); if (dumpit) link->dumpit = dumpit; WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL && (flags & RTNL_FLAG_BULK_DEL_SUPPORTED)); link->flags |= flags; /* publish protocol:msgtype */ rcu_assign_pointer(tab[msgindex], link); ret = 0; if (old) kfree_rcu(old, rcu); unlock: rtnl_unlock(); return ret; } /** * rtnl_unregister - Unregister a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * * Returns 0 on success or a negative error code. */ static int rtnl_unregister(int protocol, int msgtype) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return -ENOENT; } link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); rtnl_unlock(); kfree_rcu(link, rcu); return 0; } /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol * @protocol : Protocol family or PF_UNSPEC * * Identical to calling rtnl_unregster() for all registered message types * of a certain protocol family. */ void rtnl_unregister_all(int protocol) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL); if (!tab) { rtnl_unlock(); return; } for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); kfree_rcu(link, rcu); } rtnl_unlock(); synchronize_net(); kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); /** * __rtnl_register_many - Register rtnetlink message types * @handlers: Array of struct rtnl_msg_handlers * @n: The length of @handlers * * Registers the specified function pointers (at least one of them has * to be non-NULL) to be called whenever a request message for the * specified protocol family and message type is received. * * The special protocol family PF_UNSPEC may be used to define fallback * function pointers for the case when no entry for the specific protocol * family exists. * * When one element of @handlers fails to register, * 1) built-in: panics. * 2) modules : the previous successful registrations are unwinded * and an error is returned. * * Use rtnl_register_many(). */ int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; int i, err; for (i = 0, handler = handlers; i < n; i++, handler++) { err = rtnl_register_internal(handler->owner, handler->protocol, handler->msgtype, handler->doit, handler->dumpit, handler->flags); if (err) { if (!handler->owner) panic("Unable to register rtnetlink message " "handlers, %pS\n", handlers); __rtnl_unregister_many(handlers, i); break; } } return err; } EXPORT_SYMBOL_GPL(__rtnl_register_many); void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; int i; for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--) rtnl_unregister(handler->protocol, handler->msgtype); } EXPORT_SYMBOL_GPL(__rtnl_unregister_many); static DEFINE_MUTEX(link_ops_mutex); static LIST_HEAD(link_ops); static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) { struct rtnl_link_ops *ops; rcu_read_lock(); list_for_each_entry_rcu(ops, &link_ops, list) { if (!strcmp(ops->kind, kind)) { *srcu_index = srcu_read_lock(&ops->srcu); goto unlock; } } ops = NULL; unlock: rcu_read_unlock(); return ops; } static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) { srcu_read_unlock(&ops->srcu, srcu_index); } /** * rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * * Returns 0 on success or a negative error code. */ int rtnl_link_register(struct rtnl_link_ops *ops) { struct rtnl_link_ops *tmp; int err; /* Sanity-check max sizes to avoid stack buffer overflow. */ if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) return -EINVAL; /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not * fill up dellink as well. That disables rtnl_dellink. */ if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; err = init_srcu_struct(&ops->srcu); if (err) return err; mutex_lock(&link_ops_mutex); list_for_each_entry(tmp, &link_ops, list) { if (!strcmp(ops->kind, tmp->kind)) { err = -EEXIST; goto unlock; } } list_add_tail_rcu(&ops->list, &link_ops); unlock: mutex_unlock(&link_ops_mutex); return err; } EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) { struct net_device *dev; LIST_HEAD(list_kill); for_each_netdev(net, dev) { if (dev->rtnl_link_ops == ops) ops->dellink(dev, &list_kill); } unregister_netdevice_many(&list_kill); } /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ static void rtnl_lock_unregistering_all(void) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { rtnl_lock(); /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ if (!atomic_read(&dev_unreg_count)) break; __rtnl_unlock(); wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&netdev_unregistering_wq, &wait); } /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; mutex_lock(&link_ops_mutex); list_del_rcu(&ops->list); mutex_unlock(&link_ops_mutex); synchronize_srcu(&ops->srcu); cleanup_srcu_struct(&ops->srcu); /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); for_each_net(net) __rtnl_kill_links(net, ops); rtnl_unlock(); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; size_t size = 0; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (!master_dev) goto out; ops = master_dev->rtnl_link_ops; if (!ops || !ops->get_slave_size) goto out; /* IFLA_INFO_SLAVE_DATA + nested data */ size = nla_total_size(sizeof(struct nlattr)) + ops->get_slave_size(master_dev, dev); out: rcu_read_unlock(); return size; } static size_t rtnl_link_get_size(const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; size_t size; if (!ops) return 0; size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ if (ops->get_size) /* IFLA_INFO_DATA + nested data */ size += nla_total_size(sizeof(struct nlattr)) + ops->get_size(dev); if (ops->get_xstats_size) /* IFLA_INFO_XSTATS */ size += nla_total_size(ops->get_xstats_size(dev)); size += rtnl_link_get_slave_info_data_size(dev); return size; } static LIST_HEAD(rtnl_af_ops); static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index) { struct rtnl_af_ops *ops; ASSERT_RTNL(); rcu_read_lock(); list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { if (ops->family == family) { *srcu_index = srcu_read_lock(&ops->srcu); goto unlock; } } ops = NULL; unlock: rcu_read_unlock(); return ops; } static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index) { srcu_read_unlock(&ops->srcu, srcu_index); } /** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * * Return: 0 on success or a negative error code. */ int rtnl_af_register(struct rtnl_af_ops *ops) { int err = init_srcu_struct(&ops->srcu); if (err) return err; rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); return 0; } EXPORT_SYMBOL_GPL(rtnl_af_register); /** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. * @ops: struct rtnl_af_ops * to unregister */ void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); list_del_rcu(&ops->list); rtnl_unlock(); synchronize_rcu(); synchronize_srcu(&ops->srcu); cleanup_srcu_struct(&ops->srcu); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); static size_t rtnl_link_get_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct rtnl_af_ops *af_ops; size_t size; /* IFLA_AF_SPEC */ size = nla_total_size(sizeof(struct nlattr)); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_link_af_size) { /* AF_* + nested data */ size += nla_total_size(sizeof(struct nlattr)) + af_ops->get_link_af_size(dev, ext_filter_mask); } } rcu_read_unlock(); return size; } static bool rtnl_have_link_slave_info(const struct net_device *dev) { struct net_device *master_dev; bool ret = false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (master_dev && master_dev->rtnl_link_ops) ret = true; rcu_read_unlock(); return ret; } static int rtnl_link_slave_info_fill(struct sk_buff *skb, const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; struct nlattr *slave_data; int err; master_dev = netdev_master_upper_dev_get((struct net_device *) dev); if (!master_dev) return 0; ops = master_dev->rtnl_link_ops; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_slave_info) { slave_data = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA); if (!slave_data) return -EMSGSIZE; err = ops->fill_slave_info(skb, master_dev, dev); if (err < 0) goto err_cancel_slave_data; nla_nest_end(skb, slave_data); } return 0; err_cancel_slave_data: nla_nest_cancel(skb, slave_data); return err; } static int rtnl_link_info_fill(struct sk_buff *skb, const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; struct nlattr *data; int err; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_xstats) { err = ops->fill_xstats(skb, dev); if (err < 0) return err; } if (ops->fill_info) { data = nla_nest_start_noflag(skb, IFLA_INFO_DATA); if (data == NULL) return -EMSGSIZE; err = ops->fill_info(skb, dev); if (err < 0) goto err_cancel_data; nla_nest_end(skb, data); } return 0; err_cancel_data: nla_nest_cancel(skb, data); return err; } static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *linkinfo; int err = -EMSGSIZE; linkinfo = nla_nest_start_noflag(skb, IFLA_LINKINFO); if (linkinfo == NULL) goto out; err = rtnl_link_info_fill(skb, dev); if (err < 0) goto err_cancel_link; err = rtnl_link_slave_info_fill(skb, dev); if (err < 0) goto err_cancel_link; nla_nest_end(skb, linkinfo); return 0; err_cancel_link: nla_nest_cancel(skb, linkinfo); out: return err; } int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); } int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) { struct sock *rtnl = net->rtnl; return nlmsg_unicast(rtnl, skb, pid); } EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, const struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags); } EXPORT_SYMBOL(rtnl_notify); void rtnl_set_sk_err(struct net *net, u32 group, int error) { struct sock *rtnl = net->rtnl; netlink_set_err(rtnl, 0, group, error); } EXPORT_SYMBOL(rtnl_set_sk_err); int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) { struct nlattr *mx; int i, valid = 0; /* nothing is dumped for dst_default_metrics, so just skip the loop */ if (metrics == dst_default_metrics.metrics) return 0; mx = nla_nest_start_noflag(skb, RTA_METRICS); if (mx == NULL) return -ENOBUFS; for (i = 0; i < RTAX_MAX; i++) { if (metrics[i]) { if (i == RTAX_CC_ALGO - 1) { char tmp[TCP_CA_NAME_MAX], *name; name = tcp_ca_get_name_by_key(metrics[i], tmp); if (!name) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; } else if (i == RTAX_FEATURES - 1) { u32 user_features = metrics[i] & RTAX_FEATURE_MASK; if (!user_features) continue; BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); if (nla_put_u32(skb, i + 1, user_features)) goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; } valid++; } } if (!valid) { nla_nest_cancel(skb, mx); return 0; } return nla_nest_end(skb, mx); nla_put_failure: nla_nest_cancel(skb, mx); return -EMSGSIZE; } EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error) { struct rta_cacheinfo ci = { .rta_error = error, .rta_id = id, }; if (dst) { ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); ci.rta_used = dst->__use; ci.rta_clntref = rcuref_read(&dst->__rcuref); } if (expires) { unsigned long clock; clock = jiffies_to_clock_t(abs(expires)); clock = min_t(unsigned long, clock, INT_MAX); ci.rta_expires = (expires > 0) ? clock : -clock; } return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); void netdev_set_operstate(struct net_device *dev, int newstate) { unsigned int old = READ_ONCE(dev->operstate); do { if (old == newstate) return; } while (!try_cmpxchg(&dev->operstate, &old, newstate)); netdev_state_change(dev); } EXPORT_SYMBOL(netdev_set_operstate); static void set_operstate(struct net_device *dev, unsigned char transition) { unsigned char operstate = READ_ONCE(dev->operstate); switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; case IF_OPER_TESTING: if (netif_oper_up(dev)) operstate = IF_OPER_TESTING; break; case IF_OPER_DORMANT: if (netif_oper_up(dev)) operstate = IF_OPER_DORMANT; break; } netdev_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) { return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) | (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); } static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, const struct ifinfomsg *ifm) { unsigned int flags = ifm->ifi_flags; /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ if (ifm->ifi_change) flags = (flags & ifm->ifi_change) | (rtnl_dev_get_flags(dev) & ~ifm->ifi_change); return flags; } static void copy_rtnl_link_stats(struct rtnl_link_stats *a, const struct rtnl_link_stats64 *b) { a->rx_packets = b->rx_packets; a->tx_packets = b->tx_packets; a->rx_bytes = b->rx_bytes; a->tx_bytes = b->tx_bytes; a->rx_errors = b->rx_errors; a->tx_errors = b->tx_errors; a->rx_dropped = b->rx_dropped; a->tx_dropped = b->tx_dropped; a->multicast = b->multicast; a->collisions = b->collisions; a->rx_length_errors = b->rx_length_errors; a->rx_over_errors = b->rx_over_errors; a->rx_crc_errors = b->rx_crc_errors; a->rx_frame_errors = b->rx_frame_errors; a->rx_fifo_errors = b->rx_fifo_errors; a->rx_missed_errors = b->rx_missed_errors; a->tx_aborted_errors = b->tx_aborted_errors; a->tx_carrier_errors = b->tx_carrier_errors; a->tx_fifo_errors = b->tx_fifo_errors; a->tx_heartbeat_errors = b->tx_heartbeat_errors; a->tx_window_errors = b->tx_window_errors; a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; a->rx_nohandler = b->rx_nohandler; } /* All VF info */ static inline int rtnl_vfinfo_size(const struct net_device *dev, u32 ext_filter_mask) { if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); size_t size = nla_total_size(0); size += num_vfs * (nla_total_size(0) + nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_broadcast)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ nla_total_size(MAX_VLAN_LIST_LEN * sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + nla_total_size(sizeof(struct ifla_vf_trust))); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { size += num_vfs * (nla_total_size(0) + /* nest IFLA_VF_STATS */ /* IFLA_VF_STATS_RX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_BROADCAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_DROPPED */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_DROPPED */ nla_total_size_64bit(sizeof(__u64))); } return size; } else return 0; } static size_t rtnl_port_size(const struct net_device *dev, u32 ext_filter_mask) { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + nla_total_size(1) /* PROT_VDP_REQUEST */ + nla_total_size(2); /* PORT_VDP_RESPONSE */ size_t vf_ports_size = nla_total_size(sizeof(struct nlattr)); size_t vf_port_size = nla_total_size(sizeof(struct nlattr)) + port_size; size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + port_size; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; if (dev_num_vf(dev->dev.parent)) return port_self_size + vf_ports_size + vf_port_size * dev_num_vf(dev->dev.parent); else return port_self_size; } static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ nla_total_size(4); /* XDP_<mode>_PROG_ID */ return xdp_size; } static size_t rtnl_prop_list_size(const struct net_device *dev) { struct netdev_name_node *name_node; unsigned int cnt = 0; rcu_read_lock(); list_for_each_entry_rcu(name_node, &dev->name_node->list, list) cnt++; rcu_read_unlock(); if (!cnt) return 0; return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ); } static size_t rtnl_proto_down_size(const struct net_device *dev) { size_t size = nla_total_size(1); /* Assume dev->proto_down_reason is not zero. */ size += nla_total_size(0) + nla_total_size(4); return size; } static size_t rtnl_devlink_port_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */ if (dev->devlink_port) size += devlink_nl_port_handle_size(dev->devlink_port); return size; } static size_t rtnl_dpll_pin_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */ size += dpll_netdev_pin_handle_size(dev); return size; } static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) + nla_total_size(sizeof(struct rtnl_link_stats)) + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ + nla_total_size(4) /* IFLA_WEIGHT */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_ALLMULTI */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(4) /* IFLA_LINK_NETNSID */ + nla_total_size(4) /* IFLA_GROUP */ + nla_total_size(ext_filter_mask & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + rtnl_prop_list_size(dev) + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + 0; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *vf_ports; struct nlattr *vf_port; int vf; int err; vf_ports = nla_nest_start_noflag(skb, IFLA_VF_PORTS); if (!vf_ports) return -EMSGSIZE; for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { vf_port = nla_nest_start_noflag(skb, IFLA_VF_PORT); if (!vf_port) goto nla_put_failure; if (nla_put_u32(skb, IFLA_PORT_VF, vf)) goto nla_put_failure; err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); if (err == -EMSGSIZE) goto nla_put_failure; if (err) { nla_nest_cancel(skb, vf_port); continue; } nla_nest_end(skb, vf_port); } nla_nest_end(skb, vf_ports); return 0; nla_put_failure: nla_nest_cancel(skb, vf_ports); return -EMSGSIZE; } static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *port_self; int err; port_self = nla_nest_start_noflag(skb, IFLA_PORT_SELF); if (!port_self) return -EMSGSIZE; err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); if (err) { nla_nest_cancel(skb, port_self); return (err == -EMSGSIZE) ? err : 0; } nla_nest_end(skb, port_self); return 0; } static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { int err; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; err = rtnl_port_self_fill(skb, dev); if (err) return err; if (dev_num_vf(dev->dev.parent)) { err = rtnl_vf_ports_fill(skb, dev); if (err) return err; } return 0; } static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; struct netdev_phys_item_id ppid; err = dev_get_phys_port_id(dev, &ppid); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) { char name[IFNAMSIZ]; int err; err = dev_get_phys_port_name(dev, name, sizeof(name)); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; } static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { struct netdev_phys_item_id ppid = { }; int err; err = dev_get_port_parent_id(dev, &ppid, false); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_stats64 *sp; struct nlattr *attr; attr = nla_reserve_64bit(skb, IFLA_STATS64, sizeof(struct rtnl_link_stats64), IFLA_PAD); if (!attr) return -EMSGSIZE; sp = nla_data(attr); dev_get_stats(dev, sp); attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (!attr) return -EMSGSIZE; copy_rtnl_link_stats(nla_data(attr), sp); return 0; } static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; struct ifla_vf_mac vf_mac; struct ifla_vf_broadcast vf_broadcast; struct ifla_vf_info ivi; struct ifla_vf_guid node_guid; struct ifla_vf_guid port_guid; memset(&ivi, 0, sizeof(ivi)); /* Not all SR-IOV capable drivers support the * spoofcheck and "RSS query enable" query. Preset to * -1 so the user space tool can detect that the driver * didn't report anything. */ ivi.spoofchk = -1; ivi.rss_query_en = -1; ivi.trusted = -1; /* The default value for VF link state is "auto" * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; /* VLAN Protocol by default is 802.1Q */ ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); memset(&node_guid, 0, sizeof(node_guid)); memset(&port_guid, 0, sizeof(port_guid)); vf_mac.vf = vf_vlan.vf = vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = vf_linkstate.vf = vf_rss_query_en.vf = vf_trust.vf = node_guid.vf = port_guid.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_vlan_info.vlan = ivi.vlan; vf_vlan_info.qos = ivi.qos; vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) return -EMSGSIZE; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), &vf_rate) || nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate) || nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), &vf_spoofchk) || nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), &vf_linkstate) || nla_put(skb, IFLA_VF_RSS_QUERY_EN, sizeof(vf_rss_query_en), &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) goto nla_put_vf_failure; if (dev->netdev_ops->ndo_get_vf_guid && !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid, &port_guid)) { if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid), &node_guid) || nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid), &port_guid)) goto nla_put_vf_failure; } vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST); if (!vfvlanlist) goto nla_put_vf_failure; if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), &vf_vlan_info)) { nla_nest_cancel(skb, vfvlanlist); goto nla_put_vf_failure; } nla_nest_end(skb, vfvlanlist); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); if (!vfstats) goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, vf_stats.tx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, vf_stats.multicast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } nla_nest_end(skb, vfstats); } nla_nest_end(skb, vf); return 0; nla_put_vf_failure: nla_nest_cancel(skb, vf); return -EMSGSIZE; } static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { struct nlattr *vfinfo; int i, num_vfs; if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) return 0; num_vfs = dev_num_vf(dev->dev.parent); if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) return -EMSGSIZE; if (!dev->netdev_ops->ndo_get_vf_config) return 0; vfinfo = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST); if (!vfinfo) return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) { nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } } nla_nest_end(skb, vfinfo); return 0; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, const struct net_device *dev) { struct rtnl_link_ifmap map; memset(&map, 0, sizeof(map)); map.mem_start = READ_ONCE(dev->mem_start); map.mem_end = READ_ONCE(dev->mem_end); map.base_addr = READ_ONCE(dev->base_addr); map.irq = READ_ONCE(dev->irq); map.dma = READ_ONCE(dev->dma); map.port = READ_ONCE(dev->if_port); if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; return 0; } static u32 rtnl_xdp_prog_skb(struct net_device *dev) { const struct bpf_prog *generic_xdp_prog; u32 res = 0; rcu_read_lock(); generic_xdp_prog = rcu_dereference(dev->xdp_prog); if (generic_xdp_prog) res = generic_xdp_prog->aux->id; rcu_read_unlock(); return res; } static u32 rtnl_xdp_prog_drv(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_DRV); } static u32 rtnl_xdp_prog_hw(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_HW); } static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, u32 (*get_prog_id)(struct net_device *dev)) { u32 curr_id; int err; curr_id = get_prog_id(dev); if (!curr_id) return 0; *prog_id = curr_id; err = nla_put_u32(skb, attr, curr_id); if (err) return err; if (*mode != XDP_ATTACHED_NONE) *mode = XDP_ATTACHED_MULTI; else *mode = tgt_mode; return 0; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *xdp; u32 prog_id; int err; u8 mode; xdp = nla_nest_start_noflag(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; prog_id = 0; mode = XDP_ATTACHED_NONE; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw); if (err) goto err_cancel; err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; if (prog_id && mode != XDP_ATTACHED_MULTI) { err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; } nla_nest_end(skb, xdp); return 0; err_cancel: nla_nest_cancel(skb, xdp); return err; } static u32 rtnl_get_event(unsigned long event) { u32 rtnl_event_type = IFLA_EVENT_NONE; switch (event) { case NETDEV_REBOOT: rtnl_event_type = IFLA_EVENT_REBOOT; break; case NETDEV_FEAT_CHANGE: rtnl_event_type = IFLA_EVENT_FEATURES; break; case NETDEV_BONDING_FAILOVER: rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; break; case NETDEV_NOTIFY_PEERS: rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; break; case NETDEV_RESEND_IGMP: rtnl_event_type = IFLA_EVENT_IGMP_RESEND; break; case NETDEV_CHANGEINFODATA: rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; break; default: break; } return rtnl_event_type; } static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) { const struct net_device *upper_dev; int ret = 0; rcu_read_lock(); upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) ret = nla_put_u32(skb, IFLA_MASTER, READ_ONCE(upper_dev->ifindex)); rcu_read_unlock(); return ret; } static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, bool force) { int iflink = dev_get_iflink(dev); if (force || READ_ONCE(dev->ifindex) != iflink) return nla_put_u32(skb, IFLA_LINK, iflink); return 0; } static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, struct net_device *dev) { char buf[IFALIASZ]; int ret; ret = dev_get_alias(dev, buf, sizeof(buf)); return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; } static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev, struct net *src_net, gfp_t gfp) { bool put_iflink = false; if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { int id = peernet2id_alloc(src_net, link_net, gfp); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) return -EMSGSIZE; put_iflink = true; } } return nla_put_iflink(skb, dev, put_iflink); } static int rtnl_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { const struct rtnl_af_ops *af_ops; struct nlattr *af_spec; af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af_spec) return -EMSGSIZE; list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { struct nlattr *af; int err; if (!af_ops->fill_link_af) continue; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) return -EMSGSIZE; err = af_ops->fill_link_af(skb, dev, ext_filter_mask); /* * Caller may return ENODATA to indicate that there * was no data to be dumped. This is not an error, it * means we should trim the attribute header and * continue. */ if (err == -ENODATA) nla_nest_cancel(skb, af); else if (err < 0) return -EMSGSIZE; nla_nest_end(skb, af); } nla_nest_end(skb, af_spec); return 0; } static int rtnl_fill_alt_ifnames(struct sk_buff *skb, const struct net_device *dev) { struct netdev_name_node *name_node; int count = 0; list_for_each_entry_rcu(name_node, &dev->name_node->list, list) { if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) return -EMSGSIZE; count++; } return count; } /* RCU protected. */ static int rtnl_fill_prop_list(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *prop_list; int ret; prop_list = nla_nest_start(skb, IFLA_PROP_LIST); if (!prop_list) return -EMSGSIZE; ret = rtnl_fill_alt_ifnames(skb, dev); if (ret <= 0) goto nest_cancel; nla_nest_end(skb, prop_list); return 0; nest_cancel: nla_nest_cancel(skb, prop_list); return ret; } static int rtnl_fill_proto_down(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *pr; u32 preason; if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down))) goto nla_put_failure; preason = READ_ONCE(dev->proto_down_reason); if (!preason) return 0; pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); if (!pr) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { nla_nest_cancel(skb, pr); goto nla_put_failure; } nla_nest_end(skb, pr); return 0; nla_put_failure: return -EMSGSIZE; } static int rtnl_fill_devlink_port(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *devlink_port_nest; int ret; devlink_port_nest = nla_nest_start(skb, IFLA_DEVLINK_PORT); if (!devlink_port_nest) return -EMSGSIZE; if (dev->devlink_port) { ret = devlink_nl_port_handle_fill(skb, dev->devlink_port); if (ret < 0) goto nest_cancel; } nla_nest_end(skb, devlink_port_nest); return 0; nest_cancel: nla_nest_cancel(skb, devlink_port_nest); return ret; } static int rtnl_fill_dpll_pin(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *dpll_pin_nest; int ret; dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN); if (!dpll_pin_nest) return -EMSGSIZE; ret = dpll_netdev_add_pin_handle(skb, dev); if (ret < 0) goto nest_cancel; nla_nest_end(skb, dpll_pin_nest); return 0; nest_cancel: nla_nest_cancel(skb, dpll_pin_nest); return ret; } static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, u32 event, int *new_nsid, int new_ifindex, int tgt_netnsid, gfp_t gfp) { char devname[IFNAMSIZ]; struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; ifm->ifi_type = READ_ONCE(dev->type); ifm->ifi_index = READ_ONCE(dev->ifindex); ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; netdev_copy_name(dev, devname); if (nla_put_string(skb, IFLA_IFNAME, devname)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) || nla_put_u8(skb, IFLA_OPERSTATE, netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN) || nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) || nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) || nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, READ_ONCE(dev->num_tx_queues)) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, READ_ONCE(dev->gso_max_segs)) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, READ_ONCE(dev->gso_max_size)) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, READ_ONCE(dev->gro_max_size)) || nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, READ_ONCE(dev->gso_ipv4_max_size)) || nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, READ_ONCE(dev->gro_ipv4_max_size)) || nla_put_u32(skb, IFLA_TSO_MAX_SIZE, READ_ONCE(dev->tso_max_size)) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, READ_ONCE(dev->tso_max_segs)) || nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON, READ_ONCE(dev->max_pacing_offload_horizon)) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, READ_ONCE(dev->num_rx_queues)) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) goto nla_put_failure; if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; } if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast)) goto nla_put_failure; } if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_port_name_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; if (rtnl_fill_vf(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_xdp_fill(skb, dev)) goto nla_put_failure; if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; } if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; if (new_ifindex && nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) goto nla_put_failure; if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) && nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr)) goto nla_put_failure; rcu_read_lock(); if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC)) goto nla_put_failure_rcu; qdisc = rcu_dereference(dev->qdisc); if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) goto nla_put_failure_rcu; if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure_rcu; if (rtnl_fill_prop_list(skb, dev)) goto nla_put_failure_rcu; rcu_read_unlock(); if (dev->dev.parent && nla_put_string(skb, IFLA_PARENT_DEV_NAME, dev_name(dev->dev.parent))) goto nla_put_failure; if (dev->dev.parent && dev->dev.parent->bus && nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME, dev->dev.parent->bus->name)) goto nla_put_failure; if (rtnl_fill_devlink_port(skb, dev)) goto nla_put_failure; if (rtnl_fill_dpll_pin(skb, dev)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure_rcu: rcu_read_unlock(); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN }, [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, [IFLA_MTU] = { .type = NLA_U32 }, [IFLA_LINK] = { .type = NLA_U32 }, [IFLA_MASTER] = { .type = NLA_U32 }, [IFLA_CARRIER] = { .type = NLA_U8 }, [IFLA_TXQLEN] = { .type = NLA_U32 }, [IFLA_WEIGHT] = { .type = NLA_U32 }, [IFLA_OPERSTATE] = { .type = NLA_U8 }, [IFLA_LINKMODE] = { .type = NLA_U8 }, [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_NET_NS_FD] = { .type = NLA_U32 }, /* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to * allow 0-length string (needed to remove an alias). */ [IFLA_IFALIAS] = { .type = NLA_BINARY, .len = IFALIASZ - 1 }, [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, [IFLA_VF_PORTS] = { .type = NLA_NESTED }, [IFLA_PORT_SELF] = { .type = NLA_NESTED }, [IFLA_AF_SPEC] = { .type = NLA_NESTED }, [IFLA_EXT_MASK] = { .type = NLA_U32 }, [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, [IFLA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, [IFLA_MAX_MTU] = { .type = NLA_U32 }, [IFLA_PROP_LIST] = { .type = NLA_NESTED }, [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, [IFLA_ALLMULTI] = { .type = NLA_REJECT }, [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_KIND] = { .type = NLA_STRING }, [IFLA_INFO_DATA] = { .type = NLA_NESTED }, [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING }, [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_BROADCAST] = { .type = NLA_REJECT }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, [IFLA_VF_STATS] = { .type = NLA_NESTED }, [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) }, [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_VF] = { .type = NLA_U32 }, [IFLA_PORT_PROFILE] = { .type = NLA_STRING, .len = PORT_PROFILE_MAX }, [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, .len = PORT_UUID_MAX }, [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, .len = PORT_UUID_MAX }, [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, /* Unused, but we need to keep it here since user space could * fill it. It's also broken with regard to NLA_BINARY use in * combination with structs. */ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, .len = sizeof(struct ifla_port_vsi) }, }; static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD }, [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla, int *ops_srcu_index) { struct nlattr *linfo[IFLA_INFO_MAX + 1]; struct rtnl_link_ops *ops = NULL; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind, ops_srcu_index); } return ops; } static bool link_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = netdev_master_upper_dev_get(dev); /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need * another invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool link_kind_filtered(const struct net_device *dev, const struct rtnl_link_ops *kind_ops) { if (kind_ops && dev->rtnl_link_ops != kind_ops) return true; return false; } static bool link_dump_filtered(struct net_device *dev, int master_idx, const struct rtnl_link_ops *kind_ops) { if (link_master_filtered(dev, master_idx) || link_kind_filtered(dev, kind_ops)) return true; return false; } /** * rtnl_get_net_ns_capable - Get netns if sufficiently privileged. * @sk: netlink socket * @netnsid: network namespace identifier * * Returns the network namespace identified by netnsid on success or an error * pointer on failure. */ struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) { struct net *net; net = get_net_ns_by_id(sock_net(sk), netnsid); if (!net) return ERR_PTR(-EINVAL); /* For now, the caller is required to have CAP_NET_ADMIN in * the user namespace owning the target net ns. */ if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) { put_net(net); return ERR_PTR(-EACCES); } return net; } EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, bool strict_check, struct nlattr **tb, struct netlink_ext_ack *extack) { int hdrlen; if (strict_check) { struct ifinfomsg *ifm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); return -EINVAL; } if (ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps"); return -EINVAL; } return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); } /* A hack to preserve kernel<->userspace interface. * The correct header is ifinfomsg. It is consistent with rtnl_getlink. * However, before Linux v3.9 the code here assumed rtgenmsg and that's * what iproute2 < v3.9.0 used. * We can detect the old iproute2. Even including the IFLA_EXT_MASK * attribute, its netlink message is shorter than struct ifinfomsg. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack); } static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct rtnl_link_ops *kind_ops = NULL; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct nlattr *tb[IFLA_MAX+1]; struct { unsigned long ifindex; } *ctx = (void *)cb->ctx; struct net *tgt_net = net; u32 ext_filter_mask = 0; struct net_device *dev; int ops_srcu_index; int master_idx = 0; int netnsid = -1; int err, i; err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); if (err < 0) { if (cb->strict_check) return err; goto walk_entries; } for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; /* new attributes should only be added with strict checking */ switch (i) { case IFLA_TARGET_NETNSID: netnsid = nla_get_s32(tb[i]); tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); if (IS_ERR(tgt_net)) { NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); err = PTR_ERR(tgt_net); netnsid = -1; goto out; } break; case IFLA_EXT_MASK: ext_filter_mask = nla_get_u32(tb[i]); break; case IFLA_MASTER: master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index); break; default: if (cb->strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); err = -EINVAL; goto out; } } } if (master_idx || kind_ops) flags |= NLM_F_DUMP_FILTERED; walk_entries: err = 0; for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { if (link_dump_filtered(dev, master_idx, kind_ops)) continue; err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, 0, flags, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) break; } cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); out: if (kind_ops) rtnl_link_ops_put(kind_ops, ops_srcu_index); if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, struct netlink_ext_ack *exterr) { const struct ifinfomsg *ifmp; const struct nlattr *attrs; size_t len; ifmp = nla_data(nla_peer); attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); len = nla_len(nla_peer) - sizeof(struct ifinfomsg); if (ifmp->ifi_index < 0) { NL_SET_ERR_MSG_ATTR(exterr, nla_peer, "ifindex can't be negative"); return -EINVAL; } return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, exterr); } EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[]) { struct net *net = NULL; /* Examine the link attributes and figure out which * network namespace we are talking about. */ if (tb[IFLA_NET_NS_PID]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); return net; } struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { struct net *net = rtnl_link_get_net_ifla(tb); if (!net) net = get_net(src_net); return net; } EXPORT_SYMBOL(rtnl_link_get_net); /* Figure out which network namespace we are talking about by * examining the link attributes in the following order: * * 1. IFLA_NET_NS_PID * 2. IFLA_NET_NS_FD * 3. IFLA_TARGET_NETNSID */ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, struct nlattr *tb[]) { struct net *net; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) return rtnl_link_get_net(src_net, tb); if (!tb[IFLA_TARGET_NETNSID]) return get_net(src_net); net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID])); if (!net) return ERR_PTR(-EINVAL); return net; } static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, struct net *src_net, struct nlattr *tb[], int cap) { struct net *net; net = rtnl_link_get_net_by_nlattr(src_net, tb); if (IS_ERR(net)) return net; if (!netlink_ns_capable(skb, net->user_ns, cap)) { put_net(net); return ERR_PTR(-EPERM); } return net; } /* Verify that rtnetlink requests do not pass additional properties * potentially referring to different network namespaces. */ static int rtnl_ensure_unique_netns(struct nlattr *tb[], struct netlink_ext_ack *extack, bool netns_id_only) { if (netns_id_only) { if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD]) return 0; NL_SET_ERR_MSG(extack, "specified netns attribute not supported"); return -EOPNOTSUPP; } if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID])) goto invalid_attr; return 0; invalid_attr: NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified"); return -EINVAL; } static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_set_vf_rate) return -EOPNOTSUPP; if (max_tx_rate && max_tx_rate < min_tx_rate) return -EINVAL; return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate); } static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS] && nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) return -EINVAL; if (tb[IFLA_BROADCAST] && nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) return -EINVAL; if (tb[IFLA_GSO_MAX_SIZE] && nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { NL_SET_ERR_MSG(extack, "too big gso_max_size"); return -EINVAL; } if (tb[IFLA_GSO_MAX_SEGS] && (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { NL_SET_ERR_MSG(extack, "too big gso_max_segs"); return -EINVAL; } if (tb[IFLA_GRO_MAX_SIZE] && nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { NL_SET_ERR_MSG(extack, "too big gro_max_size"); return -EINVAL; } if (tb[IFLA_GSO_IPV4_MAX_SIZE] && nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); return -EINVAL; } if (tb[IFLA_GRO_IPV4_MAX_SIZE] && nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); return -EINVAL; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { struct rtnl_af_ops *af_ops; int af_ops_srcu_index; af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) err = -EOPNOTSUPP; else if (af_ops->validate_link_af) err = af_ops->validate_link_af(dev, af, extack); else err = 0; rtnl_af_put(af_ops, af_ops_srcu_index); if (err < 0) return err; } } return 0; } static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { const struct net_device_ops *ops = dev->netdev_ops; return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type); } static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { if (dev->type != ARPHRD_INFINIBAND) return -EOPNOTSUPP; return handle_infiniband_guid(dev, ivt, guid_type); } static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) { const struct net_device_ops *ops = dev->netdev_ops; int err = -EINVAL; if (tb[IFLA_VF_MAC]) { struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); if (ivm->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_mac) err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac); if (err < 0) return err; } if (tb[IFLA_VF_VLAN]) { struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); if (ivv->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, ivv->qos, htons(ETH_P_8021Q)); if (err < 0) return err; } if (tb[IFLA_VF_VLAN_LIST]) { struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; struct nlattr *attr; int rem, len = 0; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_vlan) return err; nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { if (nla_type(attr) != IFLA_VF_VLAN_INFO || nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) { return -EINVAL; } if (len >= MAX_VLAN_LIST_LEN) return -EOPNOTSUPP; ivvl[len] = nla_data(attr); len++; } if (len == 0) return -EINVAL; if (ivvl[0]->vf >= INT_MAX) return -EINVAL; err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } if (tb[IFLA_VF_TX_RATE]) { struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); struct ifla_vf_info ivf; if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_get_vf_config) err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); if (err < 0) return err; err = rtnl_set_vf_rate(dev, ivt->vf, ivf.min_tx_rate, ivt->rate); if (err < 0) return err; } if (tb[IFLA_VF_RATE]) { struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); if (ivt->vf >= INT_MAX) return -EINVAL; err = rtnl_set_vf_rate(dev, ivt->vf, ivt->min_tx_rate, ivt->max_tx_rate); if (err < 0) return err; } if (tb[IFLA_VF_SPOOFCHK]) { struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); if (ivs->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_spoofchk) err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, ivs->setting); if (err < 0) return err; } if (tb[IFLA_VF_LINK_STATE]) { struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]); if (ivl->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_link_state) err = ops->ndo_set_vf_link_state(dev, ivl->vf, ivl->link_state); if (err < 0) return err; } if (tb[IFLA_VF_RSS_QUERY_EN]) { struct ifla_vf_rss_query_en *ivrssq_en; err = -EOPNOTSUPP; ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]); if (ivrssq_en->vf >= INT_MAX) return -EINVAL; if (ops->ndo_set_vf_rss_query_en) err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf, ivrssq_en->setting); if (err < 0) return err; } if (tb[IFLA_VF_TRUST]) { struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_trust) err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); if (err < 0) return err; } if (tb[IFLA_VF_IB_NODE_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); if (ivt->vf >= INT_MAX) return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); } if (tb[IFLA_VF_IB_PORT_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); if (ivt->vf >= INT_MAX) return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID); } return err; } static int do_set_master(struct net_device *dev, int ifindex, struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; int err; if (upper_dev) { if (upper_dev->ifindex == ifindex) return 0; ops = upper_dev->netdev_ops; if (ops->ndo_del_slave) { err = ops->ndo_del_slave(upper_dev, dev); if (err) return err; } else { return -EOPNOTSUPP; } } if (ifindex) { upper_dev = __dev_get_by_index(dev_net(dev), ifindex); if (!upper_dev) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { err = ops->ndo_add_slave(upper_dev, dev, extack); if (err) return err; } else { return -EOPNOTSUPP; } } return 0; } static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, }; static int do_set_proto_down(struct net_device *dev, struct nlattr *nl_proto_down, struct nlattr *nl_proto_down_reason, struct netlink_ext_ack *extack) { struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; unsigned long mask = 0; u32 value; bool proto_down; int err; if (!dev->change_proto_down) { NL_SET_ERR_MSG(extack, "Protodown not supported by device"); return -EOPNOTSUPP; } if (nl_proto_down_reason) { err = nla_parse_nested_deprecated(pdreason, IFLA_PROTO_DOWN_REASON_MAX, nl_proto_down_reason, ifla_proto_down_reason_policy, NULL); if (err < 0) return err; if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); return -EINVAL; } value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); dev_change_proto_down_reason(dev, mask, value); } if (nl_proto_down) { proto_down = nla_get_u8(nl_proto_down); /* Don't turn off protodown if there are active reasons */ if (!proto_down && dev->proto_down_reason) { NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); return -EBUSY; } err = dev_change_proto_down(dev, proto_down); if (err) return err; } return 0; } #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct net *tgt_net, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb, int status) { const struct net_device_ops *ops = dev->netdev_ops; char ifname[IFNAMSIZ]; int err; err = validate_linkmsg(dev, tb, extack); if (err < 0) goto errout; if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; if (!net_eq(tgt_net, dev_net(dev))) { const char *pat = ifname[0] ? ifname : NULL; int new_ifindex; new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0); err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MAP]) { struct rtnl_link_ifmap *u_map; struct ifmap k_map; if (!ops->ndo_set_config) { err = -EOPNOTSUPP; goto errout; } if (!netif_device_present(dev)) { err = -ENODEV; goto errout; } u_map = nla_data(tb[IFLA_MAP]); k_map.mem_start = (unsigned long) u_map->mem_start; k_map.mem_end = (unsigned long) u_map->mem_end; k_map.base_addr = (unsigned short) u_map->base_addr; k_map.irq = (unsigned char) u_map->irq; k_map.dma = (unsigned char) u_map->dma; k_map.port = (unsigned char) u_map->port; err = ops->ndo_set_config(dev, &k_map); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_ADDRESS]) { struct sockaddr *sa; int len; len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, sizeof(*sa)); sa = kmalloc(len, GFP_KERNEL); if (!sa) { err = -ENOMEM; goto errout; } sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); err = dev_set_mac_address_user(dev, sa, extack); kfree(sa); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MTU]) { err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); status |= DO_SETLINK_NOTIFY; } /* * Interface selected by interface index but interface * name provided implies that a name change has been * requested. */ if (ifm->ifi_index > 0 && ifname[0]) { err = dev_change_name(dev, ifname); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_BROADCAST]) { nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } if (ifm->ifi_flags || ifm->ifi_change) { err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), extack); if (err < 0) goto errout; } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_TXQLEN]) { unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); err = dev_change_tx_queue_len(dev, value); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); if (dev->gso_max_size ^ max_size) { netif_set_gso_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); if (dev->gso_max_segs ^ max_segs) { netif_set_gso_max_segs(dev, max_segs); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GRO_MAX_SIZE]) { u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); if (dev->gro_max_size ^ gro_max_size) { netif_set_gro_max_size(dev, gro_max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GSO_IPV4_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]); if (dev->gso_ipv4_max_size ^ max_size) { netif_set_gso_ipv4_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GRO_IPV4_MAX_SIZE]) { u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]); if (dev->gro_ipv4_max_size ^ gro_max_size) { netif_set_gro_ipv4_max_size(dev, gro_max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; WRITE_ONCE(dev->link_mode, value); } if (tb[IFLA_VFINFO_LIST]) { struct nlattr *vfinfo[IFLA_VF_MAX + 1]; struct nlattr *attr; int rem; nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { if (nla_type(attr) != IFLA_VF_INFO || nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX, attr, ifla_vf_policy, NULL); if (err < 0) goto errout; err = do_setvfinfo(dev, vfinfo); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_VF_PORTS]) { struct nlattr *port[IFLA_PORT_MAX+1]; struct nlattr *attr; int vf; int rem; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_port) goto errout; nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { if (nla_type(attr) != IFLA_VF_PORT || nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, attr, ifla_port_policy, NULL); if (err < 0) goto errout; if (!port[IFLA_PORT_VF]) { err = -EOPNOTSUPP; goto errout; } vf = nla_get_u32(port[IFLA_PORT_VF]); err = ops->ndo_set_vf_port(dev, vf, port); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_PORT_SELF]) { struct nlattr *port[IFLA_PORT_MAX+1]; err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, tb[IFLA_PORT_SELF], ifla_port_policy, NULL); if (err < 0) goto errout; err = -EOPNOTSUPP; if (ops->ndo_set_vf_port) err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { struct rtnl_af_ops *af_ops; int af_ops_srcu_index; af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) { err = -EAFNOSUPPORT; goto errout; } err = af_ops->set_link_af(dev, af, extack); rtnl_af_put(af_ops, af_ops_srcu_index); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], tb[IFLA_PROTO_DOWN_REASON], extack); if (err) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_XDP]) { struct nlattr *xdp[IFLA_XDP_MAX + 1]; u32 xdp_flags = 0; err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], ifla_xdp_policy, NULL); if (err < 0) goto errout; if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) { err = -EINVAL; goto errout; } if (xdp[IFLA_XDP_FLAGS]) { xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); if (xdp_flags & ~XDP_FLAGS_MASK) { err = -EINVAL; goto errout; } if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) { err = -EINVAL; goto errout; } } if (xdp[IFLA_XDP_FD]) { int expected_fd = -1; if (xdp_flags & XDP_FLAGS_REPLACE) { if (!xdp[IFLA_XDP_EXPECTED_FD]) { err = -EINVAL; goto errout; } expected_fd = nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]); } err = dev_change_xdp_fd(dev, extack, nla_get_s32(xdp[IFLA_XDP_FD]), expected_fd, xdp_flags); if (err) goto errout; status |= DO_SETLINK_NOTIFY; } } errout: if (status & DO_SETLINK_MODIFIED) { if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY) netdev_state_change(dev); if (err < 0) net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", dev->name); } return err; } static struct net_device *rtnl_dev_get(struct net *net, struct nlattr *tb[]) { char ifname[ALTIFNAMSIZ]; if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else if (tb[IFLA_ALT_IFNAME]) nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ); else return NULL; return __dev_get_by_name(net, ifname); } static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct rtnl_nets rtnl_nets; struct net *tgt_net; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) goto errout; err = rtnl_ensure_unique_netns(tb, extack, false); if (err < 0) goto errout; tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); if (IS_ERR(tgt_net)) { err = PTR_ERR(tgt_net); goto errout; } rtnl_nets_init(&rtnl_nets); rtnl_nets_add(&rtnl_nets, get_net(net)); rtnl_nets_add(&rtnl_nets, tgt_net); rtnl_nets_lock(&rtnl_nets); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else err = -EINVAL; if (dev) err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); else if (!err) err = -ENODEV; rtnl_nets_unlock(&rtnl_nets); errout: return err; } static int rtnl_group_dellink(const struct net *net, int group) { struct net_device *dev, *aux; LIST_HEAD(list_kill); bool found = false; if (!group) return -EPERM; for_each_netdev(net, dev) { if (dev->group == group) { const struct rtnl_link_ops *ops; found = true; ops = dev->rtnl_link_ops; if (!ops || !ops->dellink) return -EOPNOTSUPP; } } if (!found) return -ENODEV; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { const struct rtnl_link_ops *ops; ops = dev->rtnl_link_ops; ops->dellink(dev, &list_kill); } } unregister_netdevice_many(&list_kill); return 0; } int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh) { const struct rtnl_link_ops *ops; LIST_HEAD(list_kill); ops = dev->rtnl_link_ops; if (!ops || !ops->dellink) return -EOPNOTSUPP; ops->dellink(dev, &list_kill); unregister_netdevice_many_notify(&list_kill, portid, nlh); return 0; } EXPORT_SYMBOL_GPL(rtnl_delete_link); static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct net *tgt_net = net; int netnsid = -1; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err < 0) return err; if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } rtnl_net_lock(tgt_net); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); if (dev) err = rtnl_delete_link(dev, portid, nlh); else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) err = -ENODEV; else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else err = -EINVAL; rtnl_net_unlock(tgt_net); if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh) { unsigned int old_flags; int err; old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), NULL); if (err < 0) return err; } if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh); } else { dev->rtnl_link_state = RTNL_LINK_INITIALIZED; __dev_notify_flags(dev, old_flags, ~0U, portid, nlh); } return 0; } EXPORT_SYMBOL(rtnl_configure_link); struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_device *dev; unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; int err; if (tb[IFLA_NUM_TX_QUEUES]) num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); else if (ops->get_num_tx_queues) num_tx_queues = ops->get_num_tx_queues(); if (tb[IFLA_NUM_RX_QUEUES]) num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]); else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); if (num_tx_queues < 1 || num_tx_queues > 4096) { NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); return ERR_PTR(-EINVAL); } if (num_rx_queues < 1 || num_rx_queues > 4096) { NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); return ERR_PTR(-EINVAL); } if (ops->alloc) { dev = ops->alloc(tb, ifname, name_assign_type, num_tx_queues, num_rx_queues); if (IS_ERR(dev)) return dev; } else { dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); } if (!dev) return ERR_PTR(-ENOMEM); err = validate_linkmsg(dev, tb, extack); if (err < 0) { free_netdev(dev); return ERR_PTR(err); } dev_net_set(dev, net); dev->rtnl_link_ops = ops; dev->rtnl_link_state = RTNL_LINK_INITIALIZING; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); err = dev_validate_mtu(dev, mtu, extack); if (err) { free_netdev(dev); return ERR_PTR(err); } dev->mtu = mtu; } if (tb[IFLA_ADDRESS]) { __dev_addr_set(dev, nla_data(tb[IFLA_ADDRESS]), nla_len(tb[IFLA_ADDRESS])); dev->addr_assign_type = NET_ADDR_SET; } if (tb[IFLA_BROADCAST]) memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), nla_len(tb[IFLA_BROADCAST])); if (tb[IFLA_TXQLEN]) dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); if (tb[IFLA_GRO_MAX_SIZE]) netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); if (tb[IFLA_GSO_IPV4_MAX_SIZE]) netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE])); if (tb[IFLA_GRO_IPV4_MAX_SIZE]) netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE])); return dev; } EXPORT_SYMBOL(rtnl_create_link); struct rtnl_newlink_tbs { struct nlattr *tb[IFLA_MAX + 1]; struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; struct nlattr *attr[RTNL_MAX_TYPE + 1]; struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; }; static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct net_device *dev, struct net *tgt_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) { struct nlattr ** const linkinfo = tbs->linkinfo; struct nlattr ** const tb = tbs->tb; int status = 0; int err; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (linkinfo[IFLA_INFO_DATA]) { if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) return -EOPNOTSUPP; err = ops->changelink(dev, tb, data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } if (linkinfo[IFLA_INFO_SLAVE_DATA]) { const struct rtnl_link_ops *m_ops = NULL; struct nlattr **slave_data = NULL; struct net_device *master_dev; master_dev = netdev_master_upper_dev_get(dev); if (master_dev) m_ops = master_dev->rtnl_link_ops; if (!m_ops || !m_ops->slave_changelink) return -EOPNOTSUPP; if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) return -EINVAL; if (m_ops->slave_maxtype) { err = nla_parse_nested_deprecated(tbs->slave_attr, m_ops->slave_maxtype, linkinfo[IFLA_INFO_SLAVE_DATA], m_ops->slave_policy, extack); if (err < 0) return err; slave_data = tbs->slave_attr; } err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status); } static int rtnl_group_changelink(const struct sk_buff *skb, struct net *net, struct net *tgt_net, int group, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb) { struct net_device *dev, *aux; int err; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); if (err < 0) return err; } } return 0; } static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, struct net *peer_net, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) { unsigned char name_assign_type = NET_NAME_USER; struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; struct net_device *dev; char ifname[IFNAMSIZ]; int err; if (!ops->alloc && !ops->setup) return -EOPNOTSUPP; if (tb[IFLA_IFNAME]) { nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); } else { snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); name_assign_type = NET_NAME_ENUM; } dev = rtnl_create_link(link_net ? : tgt_net, ifname, name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; } dev->ifindex = ifm->ifi_index; if (link_net) net = link_net; if (peer_net) net = peer_net; if (ops->newlink) err = ops->newlink(net, dev, tb, data, extack); else err = register_netdevice(dev); if (err < 0) { free_netdev(dev); goto out; } err = rtnl_configure_link(dev, ifm, portid, nlh); if (err < 0) goto out_unregister; if (link_net) { err = dev_change_net_namespace(dev, tgt_net, ifname); if (err < 0) goto out_unregister; } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto out_unregister; } out: return err; out_unregister: if (ops->newlink) { LIST_HEAD(list_kill); ops->dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); } else { unregister_netdevice(dev); } goto out; } static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, struct nlattr *tbp[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX + 1]; int err; if (!data || !data[ops->peer_type]) return rtnl_link_get_net_ifla(tbp); err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); if (err < 0) return ERR_PTR(err); if (ops->validate) { err = ops->validate(tb, NULL, extack); if (err < 0) return ERR_PTR(err); } return rtnl_link_get_net_ifla(tb); } static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, struct net *peer_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) { struct nlattr ** const tb = tbs->tb; struct net *net = sock_net(skb->sk); struct net_device *dev; struct ifinfomsg *ifm; bool link_specified; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { link_specified = true; dev = __dev_get_by_index(net, ifm->ifi_index); } else if (ifm->ifi_index < 0) { NL_SET_ERR_MSG(extack, "ifindex can't be negative"); return -EINVAL; } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { link_specified = true; dev = rtnl_dev_get(net, tb); } else { link_specified = false; dev = NULL; } if (dev) return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, * or it's for a group */ if (link_specified || !tb[IFLA_GROUP]) return -ENODEV; return rtnl_group_changelink(skb, net, tgt_net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); } if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) return -EOPNOTSUPP; if (!ops) { NL_SET_ERR_MSG(extack, "Unknown device type"); return -EOPNOTSUPP; } return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh, tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *tgt_net, *link_net = NULL, *peer_net = NULL; struct nlattr **tb, **linkinfo, **data = NULL; struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; struct rtnl_nets rtnl_nets; int ops_srcu_index; int ret; tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); if (!tbs) return -ENOMEM; tb = tbs->tb; ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); if (ret < 0) goto free; ret = rtnl_ensure_unique_netns(tb, extack, false); if (ret < 0) goto free; linkinfo = tbs->linkinfo; if (tb[IFLA_LINKINFO]) { ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], ifla_info_policy, NULL); if (ret < 0) goto free; } else { memset(linkinfo, 0, sizeof(tbs->linkinfo)); } if (linkinfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind, &ops_srcu_index); #ifdef CONFIG_MODULES if (!ops) { request_module("rtnl-link-%s", kind); ops = rtnl_link_ops_get(kind, &ops_srcu_index); } #endif } rtnl_nets_init(&rtnl_nets); if (ops) { if (ops->maxtype > RTNL_MAX_TYPE) { ret = -EINVAL; goto put_ops; } if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], ops->policy, extack); if (ret < 0) goto put_ops; data = tbs->attr; } if (ops->validate) { ret = ops->validate(tb, data, extack); if (ret < 0) goto put_ops; } if (ops->peer_type) { peer_net = rtnl_get_peer_net(ops, tb, data, extack); if (IS_ERR(peer_net)) { ret = PTR_ERR(peer_net); goto put_ops; } if (peer_net) rtnl_nets_add(&rtnl_nets, peer_net); } } tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN); if (IS_ERR(tgt_net)) { ret = PTR_ERR(tgt_net); goto put_net; } rtnl_nets_add(&rtnl_nets, tgt_net); if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); link_net = get_net_ns_by_id(tgt_net, id); if (!link_net) { NL_SET_ERR_MSG(extack, "Unknown network namespace id"); ret = -EINVAL; goto put_net; } rtnl_nets_add(&rtnl_nets, link_net); if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; goto put_net; } } rtnl_nets_lock(&rtnl_nets); ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack); rtnl_nets_unlock(&rtnl_nets); put_net: rtnl_nets_destroy(&rtnl_nets); put_ops: if (ops) rtnl_link_ops_put(ops, ops_srcu_index); free: kfree(tbs); return ret; } static int rtnl_valid_getlink_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm; int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for get link"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err) return err; for (i = 0; i <= IFLA_MAX; i++) { if (!tb[i]) continue; switch (i) { case IFLA_IFNAME: case IFLA_ALT_IFNAME: case IFLA_EXT_MASK: case IFLA_TARGET_NETNSID: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request"); return -EINVAL; } } return 0; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; int netnsid = -1; int err; u32 ext_filter_mask = 0; err = rtnl_valid_getlink_req(skb, nlh, tb, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err < 0) return err; if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); else goto out; err = -ENODEV; if (dev == NULL) goto out; err = -ENOBUFS; nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask)); if (nskb == NULL) goto out; /* Synchronize the carrier state so we don't report a state * that we're not actually going to honour immediately; if * the driver just did a carrier off->on transition, we can * only TX if link watch work has run, but without this we'd * already report carrier on, even if it doesn't work yet. */ linkwatch_sync_dev(dev); err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); out: if (netnsid >= 0) put_net(tgt_net); return err; } static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, bool *changed, struct netlink_ext_ack *extack) { char *alt_ifname; size_t size; int err; err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); if (err) return err; if (cmd == RTM_NEWLINKPROP) { size = rtnl_prop_list_size(dev); size += nla_total_size(ALTIFNAMSIZ); if (size >= U16_MAX) { NL_SET_ERR_MSG(extack, "effective property list too long"); return -EINVAL; } } alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (!alt_ifname) return -ENOMEM; if (cmd == RTM_NEWLINKPROP) { err = netdev_name_node_alt_create(dev, alt_ifname); if (!err) alt_ifname = NULL; } else if (cmd == RTM_DELLINKPROP) { err = netdev_name_node_alt_destroy(dev, alt_ifname); } else { WARN_ON_ONCE(1); err = -EINVAL; } kfree(alt_ifname); if (!err) *changed = true; return err; } static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; struct ifinfomsg *ifm; bool changed = false; struct nlattr *attr; int err, rem; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err) return err; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else return -EINVAL; if (!dev) return -ENODEV; if (!tb[IFLA_PROP_LIST]) return 0; nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { switch (nla_type(attr)) { case IFLA_ALT_IFNAME: err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); if (err) return err; break; } } if (changed) netdev_state_change(dev); return 0; } static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); } static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); } static noinline_for_stack u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); size_t min_ifinfo_dump_size = 0; u32 ext_filter_mask = 0; struct net_device *dev; struct nlattr *nla; int hdrlen, rem; /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return NLMSG_GOODSIZE; nla_for_each_attr_type(nla, IFLA_EXT_MASK, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), rem) { if (nla_len(nla) == sizeof(u32)) ext_filter_mask = nla_get_u32(nla); } if (!ext_filter_mask) return NLMSG_GOODSIZE; /* * traverse the list of net devices and compute the minimum * buffer size based upon the filter mask. */ rcu_read_lock(); for_each_netdev_rcu(net, dev) { min_ifinfo_dump_size = max(min_ifinfo_dump_size, if_nlmsg_size(dev, ext_filter_mask)); } rcu_read_unlock(); return nlmsg_total_size(min_ifinfo_dump_size); } static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) { int idx; int s_idx = cb->family; int type = cb->nlh->nlmsg_type - RTM_BASE; int ret = 0; if (s_idx == 0) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { struct rtnl_link __rcu **tab; struct rtnl_link *link; rtnl_dumpit_func dumpit; if (idx < s_idx || idx == PF_PACKET) continue; if (type < 0 || type >= RTM_NR_MSGTYPES) continue; tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); if (!tab) continue; link = rcu_dereference_rtnl(tab[type]); if (!link) continue; dumpit = link->dumpit; if (!dumpit) continue; if (idx > s_idx) { memset(&cb->args[0], 0, sizeof(cb->args)); cb->prev_seq = 0; cb->seq = 0; } ret = dumpit(skb, cb); if (ret) break; } cb->family = idx; return skb->len ? : ret; } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex, u32 portid, const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; u32 seq = 0; skb = nlmsg_new(if_nlmsg_size(dev, 0), flags); if (skb == NULL) goto errout; if (nlmsg_report(nlh)) seq = nlmsg_seq(nlh); else portid = 0; err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, portid, seq, change, 0, 0, event, new_nsid, new_ifindex, -1, flags); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } return skb; errout: rtnl_set_sk_err(net, RTNLGRP_LINK, err); return NULL; } void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, u32 portid, const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); rtnl_notify(skb, net, portid, RTNLGRP_LINK, nlh, flags); } static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex, u32 portid, const struct nlmsghdr *nlh) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, new_ifindex, portid, nlh); if (skb) rtmsg_ifinfo_send(skb, dev, flags, portid, nlh); } void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, u32 portid, const struct nlmsghdr *nlh) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL, 0, portid, nlh); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, new_nsid, new_ifindex, 0, NULL); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, int type, unsigned int flags, int nlflags, u16 ndm_state) { struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags); if (!nlh) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = flags; ndm->ndm_type = 0; ndm->ndm_ifindex = dev->ifindex; ndm->ndm_state = ndm_state; if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr)) goto nla_put_failure; if (vid) if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(dev->addr_len) + /* NDA_LLADDR */ nla_total_size(sizeof(u16)) + /* NDA_VLAN */ 0; } static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, u16 ndm_state) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_fdb_fill(skb, dev, addr, vid, 0, 0, type, NTF_SELF, 0, ndm_state); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } /* * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry */ int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags) { int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (tb[NDA_FLAGS_EXT]) { netdev_info(dev, "invalid flags given to default FDB implementation\n"); return err; } if (vid) { netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_add_excl(dev, addr); /* Only return duplicate errors if NLM_F_EXCL is set */ if (err == -EEXIST && !(flags & NLM_F_EXCL)) err = 0; return err; } EXPORT_SYMBOL(ndo_dflt_fdb_add); static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid, struct netlink_ext_ack *extack) { u16 vid = 0; if (vlan_attr) { if (nla_len(vlan_attr) != sizeof(u16)) { NL_SET_ERR_MSG(extack, "invalid vlan attribute size"); return -EINVAL; } vid = nla_get_u16(vlan_attr); if (!vid || vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "invalid vlan id"); return -EINVAL; } } *p_vid = vid; return 0; } static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; u8 *addr; u16 vid; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } if (dev->type != ARPHRD_ETHER) { NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; bool notified = false; err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, ¬ified, extack); if (err) goto out; else ndm->ndm_flags &= ~NTF_MASTER; } /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF)) { bool notified = false; if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, ¬ified, extack); else err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); if (!err && !notified) { rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } out: return err; } /* * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry */ int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); return err; } EXPORT_SYMBOL(ndo_dflt_fdb_del); static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); struct net *net = sock_net(skb->sk); const struct net_device_ops *ops; struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; __u8 *addr = NULL; int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!del_bulk) { err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); } else { /* For bulk delete, the drivers will parse the message with * policy. */ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); } if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!del_bulk) { if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; } if (dev->type != ARPHRD_ETHER) { NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices"); return -EINVAL; } err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); bool notified = false; ops = br_dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, ¬ified, extack); } else { if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(nlh, dev, extack); } if (err) goto out; else ndm->ndm_flags &= ~NTF_MASTER; } /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { bool notified = false; ops = dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, ¬ified, extack); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); } else { /* in case err was cleared by NTF_MASTER call */ err = -EOPNOTSUPP; if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(nlh, dev, extack); } if (!err) { if (!del_bulk && !notified) rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } out: return err; } static int nlmsg_populate_fdb(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, int *idx, struct netdev_hw_addr_list *list) { struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct netdev_hw_addr *ha; u32 portid, seq; int err; portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { if (*idx < ctx->fdb_idx) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, portid, seq, RTM_NEWNEIGH, NTF_SELF, NLM_F_MULTI, NUD_PERMANENT); if (err < 0) return err; skip: *idx += 1; } return 0; } /** * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table. * @skb: socket buffer to store message in * @cb: netlink callback * @dev: netdevice * @filter_dev: ignored * @idx: the number of FDB table entries dumped is added to *@idx * * Default netdevice operation to dump the existing unicast address list. * Returns number of addresses from list put in skb. */ int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { int err; if (dev->type != ARPHRD_ETHER) return -EINVAL; netif_addr_lock_bh(dev); err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) goto out; err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); out: netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, int *br_idx, int *brport_idx, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); return -EINVAL; } ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, extack); if (err < 0) return err; *brport_idx = ndm->ndm_ifindex; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_IFINDEX: if (nla_len(tb[i]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request"); return -EINVAL; } *brport_idx = nla_get_u32(tb[NDA_IFINDEX]); break; case NDA_MASTER: if (nla_len(tb[i]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request"); return -EINVAL; } *br_idx = nla_get_u32(tb[NDA_MASTER]); break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request"); return -EINVAL; } } return 0; } static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, int *br_idx, int *brport_idx, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX+1]; int err; /* A hack to preserve kernel<->userspace interface. * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails. * So, check for ndmsg with an optional u32 attribute (not used here). * Fortunately these sizes don't conflict with the size of ifinfomsg * with an optional attribute. */ if (nlmsg_len(nlh) != sizeof(struct ndmsg) && (nlmsg_len(nlh) != sizeof(struct ndmsg) + nla_attr_size(sizeof(u32)))) { struct ifinfomsg *ifm; err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) { return -EINVAL; } else if (err == 0) { if (tb[IFLA_MASTER]) *br_idx = nla_get_u32(tb[IFLA_MASTER]); } ifm = nlmsg_data(nlh); *brport_idx = ifm->ifi_index; } return 0; } static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { const struct net_device_ops *ops = NULL, *cops = NULL; struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct net_device *dev, *br_dev = NULL; struct net *net = sock_net(skb->sk); int brport_idx = 0; int br_idx = 0; int fidx = 0; int err; NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context); if (cb->strict_check) err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, cb->extack); else err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx, cb->extack); if (err < 0) return err; if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) return -ENODEV; ops = br_dev->netdev_ops; } for_each_netdev_dump(net, dev, ctx->ifindex) { if (brport_idx && (dev->ifindex != brport_idx)) continue; if (!br_idx) { /* user did not specify a specific bridge */ if (netif_is_bridge_port(dev)) { br_dev = netdev_master_upper_dev_get(dev); cops = br_dev->netdev_ops; } } else { if (dev != br_dev && !netif_is_bridge_port(dev)) continue; if (br_dev != netdev_master_upper_dev_get(dev) && !netif_is_bridge_master(dev)) continue; cops = ops; } if (netif_is_bridge_port(dev)) { if (cops && cops->ndo_fdb_dump) { err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, &fidx); if (err == -EMSGSIZE) break; } } if (dev->netdev_ops->ndo_fdb_dump) err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, &fidx); else err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx); if (err == -EMSGSIZE) break; cops = NULL; /* reset fdb offset to 0 for rest of the interfaces */ ctx->fdb_idx = 0; fidx = 0; } ctx->fdb_idx = fidx; return skb->len; } static int valid_fdb_get_strict(const struct nlmsghdr *nlh, struct nlattr **tb, u8 *ndm_flags, int *br_idx, int *brport_idx, u8 **addr, u16 *vid, struct netlink_ext_ack *extack) { struct ndmsg *ndm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); return -EINVAL; } ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); return -EINVAL; } if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) return err; *ndm_flags = ndm->ndm_flags; *brport_idx = ndm->ndm_ifindex; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_MASTER: *br_idx = nla_get_u32(tb[i]); break; case NDA_LLADDR: if (nla_len(tb[i]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); return -EINVAL; } *addr = nla_data(tb[i]); break; case NDA_VLAN: err = fdb_vid_parse(tb[i], vid, extack); if (err) return err; break; case NDA_VNI: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request"); return -EINVAL; } } return 0; } static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net_device *dev = NULL, *br_dev = NULL; const struct net_device_ops *ops = NULL; struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NDA_MAX + 1]; struct sk_buff *skb; int brport_idx = 0; u8 ndm_flags = 0; int br_idx = 0; u8 *addr = NULL; u16 vid = 0; int err; err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, &brport_idx, &addr, &vid, extack); if (err < 0) return err; if (!addr) { NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request"); return -EINVAL; } if (brport_idx) { dev = __dev_get_by_index(net, brport_idx); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); return -ENODEV; } } if (br_idx) { if (dev) { NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); return -EINVAL; } br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) { NL_SET_ERR_MSG(extack, "Invalid master ifindex"); return -EINVAL; } ops = br_dev->netdev_ops; } if (dev) { if (!ndm_flags || (ndm_flags & NTF_MASTER)) { if (!netif_is_bridge_port(dev)) { NL_SET_ERR_MSG(extack, "Device is not a bridge port"); return -EINVAL; } br_dev = netdev_master_upper_dev_get(dev); if (!br_dev) { NL_SET_ERR_MSG(extack, "Master of device not found"); return -EINVAL; } ops = br_dev->netdev_ops; } else { if (!(ndm_flags & NTF_SELF)) { NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); return -EINVAL; } ops = dev->netdev_ops; } } if (!br_dev && !dev) { NL_SET_ERR_MSG(extack, "No device specified"); return -ENODEV; } if (!ops || !ops->ndo_fdb_get) { NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); return -EOPNOTSUPP; } skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (br_dev) dev = br_dev; err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, extack); if (err) goto out; return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: kfree_skb(skb); return err; } static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, unsigned int attrnum, unsigned int flag) { if (mask & flag) return nla_put_u8(skb, attrnum, !!(flags & flag)); return 0; } int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, u32 flags, u32 mask, int nlflags, u32 filter_mask, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; struct nlattr *br_afspec; struct nlattr *protinfo; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); int err = 0; nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_BRIDGE; ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = 0; if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u8(skb, IFLA_OPERSTATE, operstate) || (br_dev && nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || (dev->ifindex != dev_get_iflink(dev) && nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!br_afspec) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } if (mode != BRIDGE_MODE_UNDEF) { if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } } if (vlan_fill) { err = vlan_fill(skb, dev, filter_mask); if (err) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } } nla_nest_end(skb, br_afspec); protinfo = nla_nest_start(skb, IFLA_PROTINFO); if (!protinfo) goto nla_put_failure; if (brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_GUARD, BR_BPDU_GUARD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_LEARNING, BR_LEARNING) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_PROXYARP, BR_PROXYARP) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) { nla_nest_cancel(skb, protinfo); goto nla_put_failure; } nla_nest_end(skb, protinfo); nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return err ? err : -EMSGSIZE; } EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, bool strict_check, u32 *filter_mask, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX+1]; int err, i; if (strict_check) { struct ifinfomsg *ifm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); } else { err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); } if (err < 0) return err; /* new attributes should only be added with strict checking */ for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case IFLA_EXT_MASK: *filter_mask = nla_get_u32(tb[i]); break; default: if (strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request"); return -EINVAL; } } } return 0; } static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = nlh->nlmsg_seq; u32 filter_mask = 0; int err; err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask, cb->extack); if (err < 0 && cb->strict_check) return err; rcu_read_lock(); for_each_netdev_rcu(net, dev) { const struct net_device_ops *ops = dev->netdev_ops; struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { if (idx >= cb->args[0]) { err = br_dev->netdev_ops->ndo_bridge_getlink( skb, portid, seq, dev, filter_mask, NLM_F_MULTI); if (err < 0 && err != -EOPNOTSUPP) { if (likely(skb->len)) break; goto out_err; } } idx++; } if (ops->ndo_bridge_getlink) { if (idx >= cb->args[0]) { err = ops->ndo_bridge_getlink(skb, portid, seq, dev, filter_mask, NLM_F_MULTI); if (err < 0 && err != -EOPNOTSUPP) { if (likely(skb->len)) break; goto out_err; } } idx++; } } err = skb->len; out_err: rcu_read_unlock(); cb->args[0] = idx; return err; } static inline size_t bridge_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(sizeof(u32)) /* IFLA_MASTER */ + nla_total_size(sizeof(u32)) /* IFLA_MTU */ + nla_total_size(sizeof(u32)) /* IFLA_LINK */ + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */ + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */ + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */ + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */ + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */ } static int rtnl_bridge_notify(struct net_device *dev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -EOPNOTSUPP; if (!dev->netdev_ops->ndo_bridge_getlink) return 0; skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC); if (!skb) { err = -ENOMEM; goto errout; } err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0); if (err < 0) goto errout; /* Notification info is only filled for bridge ports, not the bridge * device itself. Therefore, a zero notification length is valid and * should not result in an error. */ if (!skb->len) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0; errout: WARN_ON(err == -EMSGSIZE); kfree_skb(skb); if (err) rtnl_set_sk_err(net, RTNLGRP_LINK, err); return err; } static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr, *br_flags_attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_family != AF_BRIDGE) return -EPFNOSUPPORT; dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; br_flags_attr = attr; flags = nla_get_u16(attr); } if (nla_type(attr) == IFLA_BRIDGE_MODE) { if (nla_len(attr) < sizeof(u16)) return -EINVAL; } } } if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) { err = -EOPNOTSUPP; goto out; } err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, extack); if (err) goto out; flags &= ~BRIDGE_FLAGS_MASTER; } if ((flags & BRIDGE_FLAGS_SELF)) { if (!dev->netdev_ops->ndo_bridge_setlink) err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, extack); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; /* Generate event to notify upper layer of bridge * change */ err = rtnl_bridge_notify(dev); } } if (br_flags_attr) memcpy(nla_data(br_flags_attr), &flags, sizeof(flags)); out: return err; } static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; bool have_flags = false; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_family != AF_BRIDGE) return -EPFNOSUPPORT; dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec, rem) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; have_flags = true; flags = nla_get_u16(attr); break; } } if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) { err = -EOPNOTSUPP; goto out; } err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (err) goto out; flags &= ~BRIDGE_FLAGS_MASTER; } if ((flags & BRIDGE_FLAGS_SELF)) { if (!dev->netdev_ops->ndo_bridge_dellink) err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; /* Generate event to notify upper layer of bridge * change */ err = rtnl_bridge_notify(dev); } } if (have_flags) memcpy(nla_data(attr), &flags, sizeof(flags)); out: return err; } static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) { return (mask & IFLA_STATS_FILTER_BIT(attrid)) && (!idxattr || idxattr == attrid); } static bool rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id) { return dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && dev->netdev_ops->ndo_get_offload_stats && dev->netdev_ops->ndo_has_offload_stats(dev, attr_id); } static unsigned int rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id) { return rtnl_offload_xstats_have_ndo(dev, attr_id) ? sizeof(struct rtnl_link_stats64) : 0; } static int rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id, struct sk_buff *skb) { unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id); struct nlattr *attr = NULL; void *attr_data; int err; if (!size) return -ENODATA; attr = nla_reserve_64bit(skb, attr_id, size, IFLA_OFFLOAD_XSTATS_UNSPEC); if (!attr) return -EMSGSIZE; attr_data = nla_data(attr); memset(attr_data, 0, size); err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data); if (err) return err; return 0; } static unsigned int rtnl_offload_xstats_get_size_stats(const struct net_device *dev, enum netdev_offload_xstats_type type) { bool enabled = netdev_offload_xstats_enabled(dev, type); return enabled ? sizeof(struct rtnl_hw_stats64) : 0; } struct rtnl_offload_xstats_request_used { bool request; bool used; }; static int rtnl_offload_xstats_get_stats(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_offload_xstats_request_used *ru, struct rtnl_hw_stats64 *stats, struct netlink_ext_ack *extack) { bool request; bool used; int err; request = netdev_offload_xstats_enabled(dev, type); if (!request) { used = false; goto out; } err = netdev_offload_xstats_get(dev, type, stats, &used, extack); if (err) return err; out: if (ru) { ru->request = request; ru->used = used; } return 0; } static int rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id, struct rtnl_offload_xstats_request_used *ru) { struct nlattr *nest; nest = nla_nest_start(skb, attr_id); if (!nest) return -EMSGSIZE; if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev, struct netlink_ext_ack *extack) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; struct rtnl_offload_xstats_request_used ru_l3; struct nlattr *nest; int err; err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack); if (err) return err; nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO); if (!nest) return -EMSGSIZE; if (rtnl_offload_xstats_fill_hw_s_info_one(skb, IFLA_OFFLOAD_XSTATS_L3_STATS, &ru_l3)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, int *prividx, u32 off_filter_mask, struct netlink_ext_ack *extack) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO; int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS; int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; bool have_data = false; int err; if (*prividx <= attr_id_cpu_hit && (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) { err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb); if (!err) { have_data = true; } else if (err != -ENODATA) { *prividx = attr_id_cpu_hit; return err; } } if (*prividx <= attr_id_hw_s_info && (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) { *prividx = attr_id_hw_s_info; err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack); if (err) return err; have_data = true; *prividx = 0; } if (*prividx <= attr_id_l3_stats && (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) { unsigned int size_l3; struct nlattr *attr; *prividx = attr_id_l3_stats; size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3); if (!size_l3) goto skip_l3_stats; attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3, IFLA_OFFLOAD_XSTATS_UNSPEC); if (!attr) return -EMSGSIZE; err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL, nla_data(attr), extack); if (err) return err; have_data = true; skip_l3_stats: *prividx = 0; } if (!have_data) return -ENODATA; *prividx = 0; return 0; } static unsigned int rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev, enum netdev_offload_xstats_type type) { return nla_total_size(0) + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */ nla_total_size(sizeof(u8)) + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */ nla_total_size(sizeof(u8)) + 0; } static unsigned int rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; return nla_total_size(0) + /* IFLA_OFFLOAD_XSTATS_L3_STATS */ rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) + 0; } static int rtnl_offload_xstats_get_size(const struct net_device *dev, u32 off_filter_mask) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; int nla_size = 0; int size; if (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) { size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit); nla_size += nla_total_size_64bit(size); } if (off_filter_mask & IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO)) nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev); if (off_filter_mask & IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) { size = rtnl_offload_xstats_get_size_stats(dev, t_l3); nla_size += nla_total_size_64bit(size); } if (nla_size != 0) nla_size += nla_total_size(0); return nla_size; } struct rtnl_stats_dump_filters { /* mask[0] filters outer attributes. Then individual nests have their * filtering mask at the index of the nested attribute. */ u32 mask[IFLA_STATS_MAX + 1]; }; static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, const struct rtnl_stats_dump_filters *filters, int *idxattr, int *prividx, struct netlink_ext_ack *extack) { unsigned int filter_mask = filters->mask[0]; struct if_stats_msg *ifsm; struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; int err; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags); if (!nlh) return -EMSGSIZE; ifsm = nlmsg_data(nlh); ifsm->family = PF_UNSPEC; ifsm->pad1 = 0; ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) { struct rtnl_link_stats64 *sp; attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, sizeof(struct rtnl_link_stats64), IFLA_STATS_UNSPEC); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } sp = nla_data(attr); dev_get_stats(dev, sp); } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; *idxattr = 0; } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, *idxattr)) { const struct rtnl_link_ops *ops = NULL; const struct net_device *master; master = netdev_master_upper_dev_get(dev); if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS_SLAVE); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; *idxattr = 0; } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, *idxattr)) { u32 off_filter_mask; off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } err = rtnl_offload_xstats_fill(skb, dev, prividx, off_filter_mask, extack); if (err == -ENODATA) nla_nest_cancel(skb, attr); else nla_nest_end(skb, attr); if (err && err != -ENODATA) goto nla_put_failure; *idxattr = 0; } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) { struct rtnl_af_ops *af_ops; *idxattr = IFLA_STATS_AF_SPEC; attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->fill_stats_af) { struct nlattr *af; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) { rcu_read_unlock(); err = -EMSGSIZE; goto nla_put_failure; } err = af_ops->fill_stats_af(skb, dev); if (err == -ENODATA) { nla_nest_cancel(skb, af); } else if (err < 0) { rcu_read_unlock(); goto nla_put_failure; } nla_nest_end(skb, af); } } rcu_read_unlock(); nla_nest_end(skb, attr); *idxattr = 0; } nlmsg_end(skb, nlh); return 0; nla_put_failure: /* not a multi message or no progress mean a real error */ if (!(flags & NLM_F_MULTI) || s_prividx == *prividx) nlmsg_cancel(skb, nlh); else nlmsg_end(skb, nlh); return err; } static size_t if_nlmsg_stats_size(const struct net_device *dev, const struct rtnl_stats_dump_filters *filters) { size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); unsigned int filter_mask = filters->mask[0]; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; int attr = IFLA_STATS_LINK_XSTATS; if (ops && ops->get_linkxstats_size) { size += nla_total_size(ops->get_linkxstats_size(dev, attr)); /* for IFLA_STATS_LINK_XSTATS */ size += nla_total_size(0); } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) { struct net_device *_dev = (struct net_device *)dev; const struct rtnl_link_ops *ops = NULL; const struct net_device *master; /* netdev_master_upper_dev_get can't take const */ master = netdev_master_upper_dev_get(_dev); if (master) ops = master->rtnl_link_ops; if (ops && ops->get_linkxstats_size) { int attr = IFLA_STATS_LINK_XSTATS_SLAVE; size += nla_total_size(ops->get_linkxstats_size(dev, attr)); /* for IFLA_STATS_LINK_XSTATS_SLAVE */ size += nla_total_size(0); } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) { u32 off_filter_mask; off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; size += rtnl_offload_xstats_get_size(dev, off_filter_mask); } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { struct rtnl_af_ops *af_ops; /* for IFLA_STATS_AF_SPEC */ size += nla_total_size(0); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_stats_af_size) { size += nla_total_size( af_ops->get_stats_af_size(dev)); /* for AF_* */ size += nla_total_size(0); } } rcu_read_unlock(); } return size; } #define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1) static const struct nla_policy rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = { [IFLA_STATS_LINK_OFFLOAD_XSTATS] = NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID), }; static const struct nla_policy rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = { [IFLA_STATS_GET_FILTERS] = NLA_POLICY_NESTED(rtnl_stats_get_policy_filters), }; static const struct nla_policy ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = { [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1), }; static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters, struct rtnl_stats_dump_filters *filters, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_STATS_MAX + 1]; int err; int at; err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters, rtnl_stats_get_policy_filters, extack); if (err < 0) return err; for (at = 1; at <= IFLA_STATS_MAX; at++) { if (tb[at]) { if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) { NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask"); return -EINVAL; } filters->mask[at] = nla_get_u32(tb[at]); } } return 0; } static int rtnl_stats_get_parse(const struct nlmsghdr *nlh, u32 filter_mask, struct rtnl_stats_dump_filters *filters, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; int err; int i; filters->mask[0] = filter_mask; for (i = 1; i < ARRAY_SIZE(filters->mask); i++) filters->mask[i] = -1U; err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb, IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack); if (err < 0) return err; if (tb[IFLA_STATS_GET_FILTERS]) { err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS], filters, extack); if (err) return err; } return 0; } static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, bool is_dump, struct netlink_ext_ack *extack) { struct if_stats_msg *ifsm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } if (!strict_check) return 0; ifsm = nlmsg_data(nlh); /* only requests using strict checks can pass data to influence * the dump. The legacy exception is filter_mask. */ if (ifsm->pad1 || ifsm->pad2 || (is_dump && ifsm->ifindex)) { NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); return -EINVAL; } if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); return -EINVAL; } return 0; } static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; int idxattr = 0, prividx = 0; struct if_stats_msg *ifsm; struct sk_buff *nskb; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), false, extack); if (err) return err; ifsm = nlmsg_data(nlh); if (ifsm->ifindex > 0) dev = __dev_get_by_index(net, ifsm->ifindex); else return -EINVAL; if (!dev) return -ENODEV; if (!ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get"); return -EINVAL; } err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack); if (err) return err; nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL); if (!nskb) return -ENOBUFS; err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, &filters, &idxattr, &prividx, extack); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else { err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); } return err; } static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; struct { unsigned long ifindex; int idxattr; int prividx; } *ctx = (void *)cb->ctx; struct net_device *dev; int err; cb->seq = net->dev_base_seq; err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack); if (err) return err; ifsm = nlmsg_data(cb->nlh); if (!ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); return -EINVAL; } err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters, extack); if (err) return err; for_each_netdev_dump(net, dev, ctx->ifindex) { err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, &filters, &ctx->idxattr, &ctx->prividx, extack); /* If we ran out of room on the first message, * we're in trouble. */ WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); if (err < 0) break; ctx->prividx = 0; ctx->idxattr = 0; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } return err; } void rtnl_offload_xstats_notify(struct net_device *dev) { struct rtnl_stats_dump_filters response_filters = {}; struct net *net = dev_net(dev); int idxattr = 0, prividx = 0; struct sk_buff *skb; int err = -ENOBUFS; ASSERT_RTNL(); response_filters.mask[0] |= IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), GFP_KERNEL); if (!skb) goto errout; err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0, &response_filters, &idxattr, &prividx, NULL); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_STATS, err); } EXPORT_SYMBOL(rtnl_offload_xstats_notify); static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; struct rtnl_stats_dump_filters response_filters = {}; struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; struct if_stats_msg *ifsm; bool notify = false; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), false, extack); if (err) return err; ifsm = nlmsg_data(nlh); if (ifsm->family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC"); return -EINVAL; } if (ifsm->ifindex > 0) dev = __dev_get_by_index(net, ifsm->ifindex); else return -EINVAL; if (!dev) return -ENODEV; if (ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set"); return -EINVAL; } err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX, ifla_stats_set_policy, extack); if (err < 0) return err; if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) { u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]); if (req) err = netdev_offload_xstats_enable(dev, t_l3, extack); else err = netdev_offload_xstats_disable(dev, t_l3); if (!err) notify = true; else if (err != -EALREADY) return err; response_filters.mask[0] |= IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); } if (notify) rtnl_offload_xstats_notify(dev); return 0; } static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct br_port_msg *bpm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) { NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request"); return -EINVAL; } bpm = nlmsg_data(nlh); if (bpm->ifindex) { NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*bpm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request"); return -EINVAL; } return 0; } struct rtnl_mdb_dump_ctx { long idx; }; static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct rtnl_mdb_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx, s_idx; int err; NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx); if (cb->strict_check) { err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack); if (err) return err; } s_idx = ctx->idx; idx = 0; for_each_netdev(net, dev) { if (idx < s_idx) goto skip; if (!dev->netdev_ops->ndo_mdb_dump) goto skip; err = dev->netdev_ops->ndo_mdb_dump(dev, skb, cb); if (err == -EMSGSIZE) goto out; /* Moving on to next device, reset markers and sequence * counters since they are all maintained per-device. */ memset(cb->ctx, 0, sizeof(cb->ctx)); cb->prev_seq = 0; cb->seq = 0; skip: idx++; } out: ctx->idx = idx; return skb->len; } static int rtnl_validate_mdb_entry_get(const struct nlattr *attr, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(attr); if (nla_len(attr) != sizeof(struct br_mdb_entry)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); return -EINVAL; } if (entry->ifindex) { NL_SET_ERR_MSG(extack, "Entry ifindex cannot be specified"); return -EINVAL; } if (entry->state) { NL_SET_ERR_MSG(extack, "Entry state cannot be specified"); return -EINVAL; } if (entry->flags) { NL_SET_ERR_MSG(extack, "Entry flags cannot be specified"); return -EINVAL; } if (entry->vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); return -EINVAL; } if (entry->addr.proto != htons(ETH_P_IP) && entry->addr.proto != htons(ETH_P_IPV6) && entry->addr.proto != 0) { NL_SET_ERR_MSG(extack, "Unknown entry protocol"); return -EINVAL; } return 0; } static const struct nla_policy mdba_get_policy[MDBA_GET_ENTRY_MAX + 1] = { [MDBA_GET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, rtnl_validate_mdb_entry_get, sizeof(struct br_mdb_entry)), [MDBA_GET_ENTRY_ATTRS] = { .type = NLA_NESTED }, }; static int rtnl_mdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBA_GET_ENTRY_MAX + 1]; struct net *net = sock_net(in_skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; err = nlmsg_parse(nlh, sizeof(struct br_port_msg), tb, MDBA_GET_ENTRY_MAX, mdba_get_policy, extack); if (err) return err; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, bpm->ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Device doesn't exist"); return -ENODEV; } if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_GET_ENTRY)) { NL_SET_ERR_MSG(extack, "Missing MDBA_GET_ENTRY attribute"); return -EINVAL; } if (!dev->netdev_ops->ndo_mdb_get) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; } return dev->netdev_ops->ndo_mdb_get(dev, tb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, extack); } static int rtnl_validate_mdb_entry(const struct nlattr *attr, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(attr); if (nla_len(attr) != sizeof(struct br_mdb_entry)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); return -EINVAL; } if (entry->ifindex == 0) { NL_SET_ERR_MSG(extack, "Zero entry ifindex is not allowed"); return -EINVAL; } if (entry->addr.proto == htons(ETH_P_IP)) { if (!ipv4_is_multicast(entry->addr.u.ip4) && !ipv4_is_zeronet(entry->addr.u.ip4)) { NL_SET_ERR_MSG(extack, "IPv4 entry group address is not multicast or 0.0.0.0"); return -EINVAL; } if (ipv4_is_local_multicast(entry->addr.u.ip4)) { NL_SET_ERR_MSG(extack, "IPv4 entry group address is local multicast"); return -EINVAL; } #if IS_ENABLED(CONFIG_IPV6) } else if (entry->addr.proto == htons(ETH_P_IPV6)) { if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) { NL_SET_ERR_MSG(extack, "IPv6 entry group address is link-local all nodes"); return -EINVAL; } #endif } else if (entry->addr.proto == 0) { /* L2 mdb */ if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) { NL_SET_ERR_MSG(extack, "L2 entry group is not multicast"); return -EINVAL; } } else { NL_SET_ERR_MSG(extack, "Unknown entry protocol"); return -EINVAL; } if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { NL_SET_ERR_MSG(extack, "Unknown entry state"); return -EINVAL; } if (entry->vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); return -EINVAL; } return 0; } static const struct nla_policy mdba_policy[MDBA_SET_ENTRY_MAX + 1] = { [MDBA_SET_ENTRY_UNSPEC] = { .strict_start_type = MDBA_SET_ENTRY_ATTRS + 1 }, [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, rtnl_validate_mdb_entry, sizeof(struct br_mdb_entry)), [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, }; static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; struct net *net = sock_net(skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, mdba_policy, extack); if (err) return err; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, bpm->ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Device doesn't exist"); return -ENODEV; } if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) { NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute"); return -EINVAL; } if (!dev->netdev_ops->ndo_mdb_add) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; } return dev->netdev_ops->ndo_mdb_add(dev, tb, nlh->nlmsg_flags, extack); } static int rtnl_validate_mdb_entry_del_bulk(const struct nlattr *attr, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(attr); struct br_mdb_entry zero_entry = {}; if (nla_len(attr) != sizeof(struct br_mdb_entry)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); return -EINVAL; } if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { NL_SET_ERR_MSG(extack, "Unknown entry state"); return -EINVAL; } if (entry->flags) { NL_SET_ERR_MSG(extack, "Entry flags cannot be set"); return -EINVAL; } if (entry->vid >= VLAN_N_VID - 1) { NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); return -EINVAL; } if (memcmp(&entry->addr, &zero_entry.addr, sizeof(entry->addr))) { NL_SET_ERR_MSG(extack, "Entry address cannot be set"); return -EINVAL; } return 0; } static const struct nla_policy mdba_del_bulk_policy[MDBA_SET_ENTRY_MAX + 1] = { [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, rtnl_validate_mdb_entry_del_bulk, sizeof(struct br_mdb_entry)), [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, }; static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; struct net *net = sock_net(skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; if (!del_bulk) err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, mdba_policy, extack); else err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, mdba_del_bulk_policy, extack); if (err) return err; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, bpm->ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Device doesn't exist"); return -ENODEV; } if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) { NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute"); return -EINVAL; } if (del_bulk) { if (!dev->netdev_ops->ndo_mdb_del_bulk) { NL_SET_ERR_MSG(extack, "Device does not support MDB bulk deletion"); return -EOPNOTSUPP; } return dev->netdev_ops->ndo_mdb_del_bulk(dev, tb, extack); } if (!dev->netdev_ops->ndo_mdb_del) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; } return dev->netdev_ops->ndo_mdb_del(dev, tb, extack); } /* Process one rtnetlink message. */ static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED); rtnl_dumpit_func dumpit = cb->data; int err; /* Previous iteration have already finished, avoid calling->dumpit() * again, it may not expect to be called after it reached the end. */ if (!dumpit) return 0; if (needs_lock) rtnl_lock(); err = dumpit(skb, cb); if (needs_lock) rtnl_unlock(); /* Old dump handlers used to send NLM_DONE as in a separate recvmsg(). * Some applications which parse netlink manually depend on this. */ if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) { if (err < 0 && err != -EMSGSIZE) return err; if (!err) cb->data = NULL; return skb->len; } return err; } static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE || !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) { WARN_ON(control->data); control->data = control->dump; control->dump = rtnl_dumpit; } return netlink_dump_start(ssk, skb, nlh, control); } static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct rtnl_link *link; enum rtnl_kinds kind; struct module *owner; int err = -EOPNOTSUPP; rtnl_doit_func doit; unsigned int flags; int family; int type; type = nlh->nlmsg_type; if (type > RTM_MAX) return -EOPNOTSUPP; type -= RTM_BASE; /* All the messages must have at least 1 byte length */ if (nlmsg_len(nlh) < sizeof(struct rtgenmsg)) return 0; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; kind = rtnl_msgtype_kind(type); if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; rcu_read_lock(); if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) { struct sock *rtnl; rtnl_dumpit_func dumpit; u32 min_dump_alloc = 0; link = rtnl_get_link(family, type); if (!link || !link->dumpit) { family = PF_UNSPEC; link = rtnl_get_link(family, type); if (!link || !link->dumpit) goto err_unlock; } owner = link->owner; dumpit = link->dumpit; flags = link->flags; if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); err = 0; /* need to do this before rcu_read_unlock() */ if (!try_module_get(owner)) err = -EPROTONOSUPPORT; rcu_read_unlock(); rtnl = net->rtnl; if (err == 0) { struct netlink_dump_control c = { .dump = dumpit, .min_dump_alloc = min_dump_alloc, .module = owner, .flags = flags, }; err = rtnetlink_dump_start(rtnl, skb, nlh, &c); /* netlink_dump_start() will keep a reference on * module if dump is still in progress. */ module_put(owner); } return err; } link = rtnl_get_link(family, type); if (!link || !link->doit) { family = PF_UNSPEC; link = rtnl_get_link(PF_UNSPEC, type); if (!link || !link->doit) goto out_unlock; } owner = link->owner; if (!try_module_get(owner)) { err = -EPROTONOSUPPORT; goto out_unlock; } flags = link->flags; if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) && !(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) { NL_SET_ERR_MSG(extack, "Bulk delete is not supported"); module_put(owner); goto err_unlock; } if (flags & RTNL_FLAG_DOIT_UNLOCKED) { doit = link->doit; rcu_read_unlock(); if (doit) err = doit(skb, nlh, extack); module_put(owner); return err; } rcu_read_unlock(); rtnl_lock(); link = rtnl_get_link(family, type); if (link && link->doit) err = link->doit(skb, nlh, extack); rtnl_unlock(); module_put(owner); return err; out_unlock: rcu_read_unlock(); return err; err_unlock: rcu_read_unlock(); return -EOPNOTSUPP; } static void rtnetlink_rcv(struct sk_buff *skb) { netlink_rcv_skb(skb, &rtnetlink_rcv_msg); } static int rtnetlink_bind(struct net *net, int group) { switch (group) { case RTNLGRP_IPV4_MROUTE_R: case RTNLGRP_IPV6_MROUTE_R: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; break; } return 0; } static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REBOOT: case NETDEV_CHANGEMTU: case NETDEV_CHANGEADDR: case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: case NETDEV_BONDING_FAILOVER: case NETDEV_POST_TYPE_CHANGE: case NETDEV_NOTIFY_PEERS: case NETDEV_CHANGEUPPER: case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), GFP_KERNEL, NULL, 0, 0, NULL); break; default: break; } return NOTIFY_DONE; } static struct notifier_block rtnetlink_dev_notifier = { .notifier_call = rtnetlink_event, }; static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .groups = RTNLGRP_MAX, .input = rtnetlink_rcv, .flags = NL_CFG_F_NONROOT_RECV, .bind = rtnetlink_bind, }; sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); if (!sk) return -ENOMEM; net->rtnl = sk; return 0; } static void __net_exit rtnetlink_net_exit(struct net *net) { netlink_kernel_release(net->rtnl); net->rtnl = NULL; } static struct pernet_operations rtnetlink_net_ops = { .init = rtnetlink_net_init, .exit = rtnetlink_net_exit, }; static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink, .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_DELLINK, .doit = rtnl_dellink, .flags = RTNL_FLAG_DOIT_PERNET_WIP}, {.msgtype = RTM_GETLINK, .doit = rtnl_getlink, .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, {.msgtype = RTM_SETLINK, .doit = rtnl_setlink, .flags = RTNL_FLAG_DOIT_PERNET_WIP}, {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all}, {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all}, {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all}, {.msgtype = RTM_GETSTATS, .doit = rtnl_stats_get, .dumpit = rtnl_stats_dump}, {.msgtype = RTM_SETSTATS, .doit = rtnl_stats_set}, {.msgtype = RTM_NEWLINKPROP, .doit = rtnl_newlinkprop}, {.msgtype = RTM_DELLINKPROP, .doit = rtnl_dellinkprop}, {.protocol = PF_BRIDGE, .msgtype = RTM_GETLINK, .dumpit = rtnl_bridge_getlink}, {.protocol = PF_BRIDGE, .msgtype = RTM_DELLINK, .doit = rtnl_bridge_dellink}, {.protocol = PF_BRIDGE, .msgtype = RTM_SETLINK, .doit = rtnl_bridge_setlink}, {.protocol = PF_BRIDGE, .msgtype = RTM_NEWNEIGH, .doit = rtnl_fdb_add}, {.protocol = PF_BRIDGE, .msgtype = RTM_DELNEIGH, .doit = rtnl_fdb_del, .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, {.protocol = PF_BRIDGE, .msgtype = RTM_GETNEIGH, .doit = rtnl_fdb_get, .dumpit = rtnl_fdb_dump}, {.protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB, .doit = rtnl_mdb_add}, {.protocol = PF_BRIDGE, .msgtype = RTM_DELMDB, .doit = rtnl_mdb_del, .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, {.protocol = PF_BRIDGE, .msgtype = RTM_GETMDB, .doit = rtnl_mdb_get, .dumpit = rtnl_mdb_dump}, }; void __init rtnetlink_init(void) { if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); register_netdevice_notifier(&rtnetlink_dev_notifier); rtnl_register_many(rtnetlink_rtnl_msg_handlers); } |
| 171 171 171 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | // SPDX-License-Identifier: GPL-2.0-only /* * Fixmap manipulation code */ #include <linux/bug.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/libfdt.h> #include <linux/memory.h> #include <linux/mm.h> #include <linux/sizes.h> #include <asm/fixmap.h> #include <asm/kernel-pgtable.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> /* ensure that the fixmap region does not grow down into the PCI I/O region */ static_assert(FIXADDR_TOT_START > PCI_IO_END); #define NR_BM_PTE_TABLES \ SPAN_NR_ENTRIES(FIXADDR_TOT_START, FIXADDR_TOP, PMD_SHIFT) #define NR_BM_PMD_TABLES \ SPAN_NR_ENTRIES(FIXADDR_TOT_START, FIXADDR_TOP, PUD_SHIFT) static_assert(NR_BM_PMD_TABLES == 1); #define __BM_TABLE_IDX(addr, shift) \ (((addr) >> (shift)) - (FIXADDR_TOT_START >> (shift))) #define BM_PTE_TABLE_IDX(addr) __BM_TABLE_IDX(addr, PMD_SHIFT) static pte_t bm_pte[NR_BM_PTE_TABLES][PTRS_PER_PTE] __page_aligned_bss; static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused; static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused; static inline pte_t *fixmap_pte(unsigned long addr) { return &bm_pte[BM_PTE_TABLE_IDX(addr)][pte_index(addr)]; } static void __init early_fixmap_init_pte(pmd_t *pmdp, unsigned long addr) { pmd_t pmd = READ_ONCE(*pmdp); pte_t *ptep; if (pmd_none(pmd)) { ptep = bm_pte[BM_PTE_TABLE_IDX(addr)]; __pmd_populate(pmdp, __pa_symbol(ptep), PMD_TYPE_TABLE | PMD_TABLE_AF); } } static void __init early_fixmap_init_pmd(pud_t *pudp, unsigned long addr, unsigned long end) { unsigned long next; pud_t pud = READ_ONCE(*pudp); pmd_t *pmdp; if (pud_none(pud)) __pud_populate(pudp, __pa_symbol(bm_pmd), PUD_TYPE_TABLE | PUD_TABLE_AF); pmdp = pmd_offset_kimg(pudp, addr); do { next = pmd_addr_end(addr, end); early_fixmap_init_pte(pmdp, addr); } while (pmdp++, addr = next, addr != end); } static void __init early_fixmap_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end) { p4d_t p4d = READ_ONCE(*p4dp); pud_t *pudp; if (CONFIG_PGTABLE_LEVELS > 3 && !p4d_none(p4d) && p4d_page_paddr(p4d) != __pa_symbol(bm_pud)) { /* * We only end up here if the kernel mapping and the fixmap * share the top level pgd entry, which should only happen on * 16k/4 levels configurations. */ BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); } if (p4d_none(p4d)) __p4d_populate(p4dp, __pa_symbol(bm_pud), P4D_TYPE_TABLE | P4D_TABLE_AF); pudp = pud_offset_kimg(p4dp, addr); early_fixmap_init_pmd(pudp, addr, end); } /* * The p*d_populate functions call virt_to_phys implicitly so they can't be used * directly on kernel symbols (bm_p*d). This function is called too early to use * lm_alias so __p*d_populate functions must be used to populate with the * physical address from __pa_symbol. */ void __init early_fixmap_init(void) { unsigned long addr = FIXADDR_TOT_START; unsigned long end = FIXADDR_TOP; pgd_t *pgdp = pgd_offset_k(addr); p4d_t *p4dp = p4d_offset_kimg(pgdp, addr); early_fixmap_init_pud(p4dp, addr, end); } /* * Unusually, this is also called in IRQ context (ghes_iounmap_irq) so if we * ever need to use IPIs for TLB broadcasting, then we're in trouble here. */ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { unsigned long addr = __fix_to_virt(idx); pte_t *ptep; BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses); ptep = fixmap_pte(addr); if (pgprot_val(flags)) { __set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); } else { __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr+PAGE_SIZE); } } void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) { const u64 dt_virt_base = __fix_to_virt(FIX_FDT); phys_addr_t dt_phys_base; int offset; void *dt_virt; /* * Check whether the physical FDT address is set and meets the minimum * alignment requirement. Since we are relying on MIN_FDT_ALIGN to be * at least 8 bytes so that we can always access the magic and size * fields of the FDT header after mapping the first chunk, double check * here if that is indeed the case. */ BUILD_BUG_ON(MIN_FDT_ALIGN < 8); if (!dt_phys || dt_phys % MIN_FDT_ALIGN) return NULL; dt_phys_base = round_down(dt_phys, PAGE_SIZE); offset = dt_phys % PAGE_SIZE; dt_virt = (void *)dt_virt_base + offset; /* map the first chunk so we can read the size from the header */ create_mapping_noalloc(dt_phys_base, dt_virt_base, PAGE_SIZE, prot); if (fdt_magic(dt_virt) != FDT_MAGIC) return NULL; *size = fdt_totalsize(dt_virt); if (*size > MAX_FDT_SIZE) return NULL; if (offset + *size > PAGE_SIZE) { create_mapping_noalloc(dt_phys_base, dt_virt_base, offset + *size, prot); } return dt_virt; } |
| 248 248 167 246 260 260 253 253 8 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 | // SPDX-License-Identifier: GPL-2.0-only /* * lib/bitmap.c * Helper functions for bitmap.h. */ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/ctype.h> #include <linux/device.h> #include <linux/export.h> #include <linux/slab.h> /** * DOC: bitmap introduction * * bitmaps provide an array of bits, implemented using an * array of unsigned longs. The number of valid bits in a * given bitmap does _not_ need to be an exact multiple of * BITS_PER_LONG. * * The possible unused bits in the last, partially used word * of a bitmap are 'don't care'. The implementation makes * no particular effort to keep them zero. It ensures that * their value will not affect the results of any operation. * The bitmap operations that return Boolean (bitmap_empty, * for example) or scalar (bitmap_weight, for example) results * carefully filter out these unused bits from impacting their * results. * * The byte ordering of bitmaps is more natural on little * endian architectures. See the big-endian headers * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h * for the best explanations of this ordering. */ bool __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] != bitmap2[k]) return false; if (bits % BITS_PER_LONG) if ((bitmap1[k] ^ bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return false; return true; } EXPORT_SYMBOL(__bitmap_equal); bool __bitmap_or_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, const unsigned long *bitmap3, unsigned int bits) { unsigned int k, lim = bits / BITS_PER_LONG; unsigned long tmp; for (k = 0; k < lim; ++k) { if ((bitmap1[k] | bitmap2[k]) != bitmap3[k]) return false; } if (!(bits % BITS_PER_LONG)) return true; tmp = (bitmap1[k] | bitmap2[k]) ^ bitmap3[k]; return (tmp & BITMAP_LAST_WORD_MASK(bits)) == 0; } void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { unsigned int k, lim = BITS_TO_LONGS(bits); for (k = 0; k < lim; ++k) dst[k] = ~src[k]; } EXPORT_SYMBOL(__bitmap_complement); /** * __bitmap_shift_right - logical right shift of the bits in a bitmap * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits * @nbits : bitmap size, in bits * * Shifting right (dividing) means moving bits in the MS -> LS bit * direction. Zeros are fed into the vacated MS positions and the * LS bits shifted off the bottom are lost. */ void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned shift, unsigned nbits) { unsigned k, lim = BITS_TO_LONGS(nbits); unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; unsigned long mask = BITMAP_LAST_WORD_MASK(nbits); for (k = 0; off + k < lim; ++k) { unsigned long upper, lower; /* * If shift is not word aligned, take lower rem bits of * word above and make them the top rem bits of result. */ if (!rem || off + k + 1 >= lim) upper = 0; else { upper = src[off + k + 1]; if (off + k + 1 == lim - 1) upper &= mask; upper <<= (BITS_PER_LONG - rem); } lower = src[off + k]; if (off + k == lim - 1) lower &= mask; lower >>= rem; dst[k] = lower | upper; } if (off) memset(&dst[lim - off], 0, off*sizeof(unsigned long)); } EXPORT_SYMBOL(__bitmap_shift_right); /** * __bitmap_shift_left - logical left shift of the bits in a bitmap * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits * @nbits : bitmap size, in bits * * Shifting left (multiplying) means moving bits in the LS -> MS * direction. Zeros are fed into the vacated LS bit positions * and those MS bits shifted off the top are lost. */ void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { int k; unsigned int lim = BITS_TO_LONGS(nbits); unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; for (k = lim - off - 1; k >= 0; --k) { unsigned long upper, lower; /* * If shift is not word aligned, take upper rem bits of * word below and make them the bottom rem bits of result. */ if (rem && k > 0) lower = src[k - 1] >> (BITS_PER_LONG - rem); else lower = 0; upper = src[k] << rem; dst[k + off] = lower | upper; } if (off) memset(dst, 0, off*sizeof(unsigned long)); } EXPORT_SYMBOL(__bitmap_shift_left); /** * bitmap_cut() - remove bit region from bitmap and right shift remaining bits * @dst: destination bitmap, might overlap with src * @src: source bitmap * @first: start bit of region to be removed * @cut: number of bits to remove * @nbits: bitmap size, in bits * * Set the n-th bit of @dst iff the n-th bit of @src is set and * n is less than @first, or the m-th bit of @src is set for any * m such that @first <= n < nbits, and m = n + @cut. * * In pictures, example for a big-endian 32-bit architecture: * * The @src bitmap is:: * * 31 63 * | | * 10000000 11000001 11110010 00010101 10000000 11000001 01110010 00010101 * | | | | * 16 14 0 32 * * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is:: * * 31 63 * | | * 10110000 00011000 00110010 00010101 00010000 00011000 00101110 01000010 * | | | * 14 (bit 17 0 32 * from @src) * * Note that @dst and @src might overlap partially or entirely. * * This is implemented in the obvious way, with a shift and carry * step for each moved bit. Optimisation is left as an exercise * for the compiler. */ void bitmap_cut(unsigned long *dst, const unsigned long *src, unsigned int first, unsigned int cut, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits); unsigned long keep = 0, carry; int i; if (first % BITS_PER_LONG) { keep = src[first / BITS_PER_LONG] & (~0UL >> (BITS_PER_LONG - first % BITS_PER_LONG)); } memmove(dst, src, len * sizeof(*dst)); while (cut--) { for (i = first / BITS_PER_LONG; i < len; i++) { if (i < len - 1) carry = dst[i + 1] & 1UL; else carry = 0; dst[i] = (dst[i] >> 1) | (carry << (BITS_PER_LONG - 1)); } } dst[first / BITS_PER_LONG] &= ~0UL << (first % BITS_PER_LONG); dst[first / BITS_PER_LONG] |= keep; } EXPORT_SYMBOL(bitmap_cut); bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & bitmap2[k]); if (bits % BITS_PER_LONG) result |= (dst[k] = bitmap1[k] & bitmap2[k] & BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_and); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] | bitmap2[k]; } EXPORT_SYMBOL(__bitmap_or); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] ^ bitmap2[k]; } EXPORT_SYMBOL(__bitmap_xor); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); if (bits % BITS_PER_LONG) result |= (dst[k] = bitmap1[k] & ~bitmap2[k] & BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_andnot); void __bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits) { unsigned int k; unsigned int nr = BITS_TO_LONGS(nbits); for (k = 0; k < nr; k++) dst[k] = (old[k] & ~mask[k]) | (new[k] & mask[k]); } EXPORT_SYMBOL(__bitmap_replace); bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & bitmap2[k]) return true; if (bits % BITS_PER_LONG) if ((bitmap1[k] & bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return true; return false; } EXPORT_SYMBOL(__bitmap_intersects); bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & ~bitmap2[k]) return false; if (bits % BITS_PER_LONG) if ((bitmap1[k] & ~bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return false; return true; } EXPORT_SYMBOL(__bitmap_subset); #define BITMAP_WEIGHT(FETCH, bits) \ ({ \ unsigned int __bits = (bits), idx, w = 0; \ \ for (idx = 0; idx < __bits / BITS_PER_LONG; idx++) \ w += hweight_long(FETCH); \ \ if (__bits % BITS_PER_LONG) \ w += hweight_long((FETCH) & BITMAP_LAST_WORD_MASK(__bits)); \ \ w; \ }) unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) { return BITMAP_WEIGHT(bitmap[idx], bits); } EXPORT_SYMBOL(__bitmap_weight); unsigned int __bitmap_weight_and(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { return BITMAP_WEIGHT(bitmap1[idx] & bitmap2[idx], bits); } EXPORT_SYMBOL(__bitmap_weight_and); unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { return BITMAP_WEIGHT(bitmap1[idx] & ~bitmap2[idx], bits); } EXPORT_SYMBOL(__bitmap_weight_andnot); void __bitmap_set(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); while (len - bits_to_set >= 0) { *p |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = ~0UL; p++; } if (len) { mask_to_set &= BITMAP_LAST_WORD_MASK(size); *p |= mask_to_set; } } EXPORT_SYMBOL(__bitmap_set); void __bitmap_clear(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); while (len - bits_to_clear >= 0) { *p &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = ~0UL; p++; } if (len) { mask_to_clear &= BITMAP_LAST_WORD_MASK(size); *p &= ~mask_to_clear; } } EXPORT_SYMBOL(__bitmap_clear); /** * bitmap_find_next_zero_area_off - find a contiguous aligned zero area * @map: The address to base the search on * @size: The bitmap size in bits * @start: The bitnumber to start searching at * @nr: The number of zeroed bits we're looking for * @align_mask: Alignment mask for zero area * @align_offset: Alignment offset for zero area. * * The @align_mask should be one less than a power of 2; the effect is that * the bit offset of all zero areas this function finds plus @align_offset * is multiple of that power of 2. */ unsigned long bitmap_find_next_zero_area_off(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask, unsigned long align_offset) { unsigned long index, end, i; again: index = find_next_zero_bit(map, size, start); /* Align allocation */ index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; end = index + nr; if (end > size) return end; i = find_next_bit(map, end, index); if (i < end) { start = i + 1; goto again; } return index; } EXPORT_SYMBOL(bitmap_find_next_zero_area_off); /** * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap * @buf: pointer to a bitmap * @pos: a bit position in @buf (0 <= @pos < @nbits) * @nbits: number of valid bit positions in @buf * * Map the bit at position @pos in @buf (of length @nbits) to the * ordinal of which set bit it is. If it is not set or if @pos * is not a valid bit position, map to -1. * * If for example, just bits 4 through 7 are set in @buf, then @pos * values 4 through 7 will get mapped to 0 through 3, respectively, * and other @pos values will get mapped to -1. When @pos value 7 * gets mapped to (returns) @ord value 3 in this example, that means * that bit 7 is the 3rd (starting with 0th) set bit in @buf. * * The bit positions 0 through @bits are valid positions in @buf. */ static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits) { if (pos >= nbits || !test_bit(pos, buf)) return -1; return bitmap_weight(buf, pos); } /** * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap * @dst: remapped result * @src: subset to be remapped * @old: defines domain of map * @new: defines range of map * @nbits: number of bits in each of these bitmaps * * Let @old and @new define a mapping of bit positions, such that * whatever position is held by the n-th set bit in @old is mapped * to the n-th set bit in @new. In the more general case, allowing * for the possibility that the weight 'w' of @new is less than the * weight of @old, map the position of the n-th set bit in @old to * the position of the m-th set bit in @new, where m == n % w. * * If either of the @old and @new bitmaps are empty, or if @src and * @dst point to the same location, then this routine copies @src * to @dst. * * The positions of unset bits in @old are mapped to themselves * (the identity map). * * Apply the above specified mapping to @src, placing the result in * @dst, clearing any bits previously set in @dst. * * For example, lets say that @old has bits 4 through 7 set, and * @new has bits 12 through 15 set. This defines the mapping of bit * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other * bit positions unchanged. So if say @src comes into this routine * with bits 1, 5 and 7 set, then @dst should leave with bits 1, * 13 and 15 set. */ void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, unsigned int nbits) { unsigned int oldbit, w; if (dst == src) /* following doesn't handle inplace remaps */ return; bitmap_zero(dst, nbits); w = bitmap_weight(new, nbits); for_each_set_bit(oldbit, src, nbits) { int n = bitmap_pos_to_ord(old, oldbit, nbits); if (n < 0 || w == 0) set_bit(oldbit, dst); /* identity map */ else set_bit(find_nth_bit(new, nbits, n % w), dst); } } EXPORT_SYMBOL(bitmap_remap); /** * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit * @oldbit: bit position to be mapped * @old: defines domain of map * @new: defines range of map * @bits: number of bits in each of these bitmaps * * Let @old and @new define a mapping of bit positions, such that * whatever position is held by the n-th set bit in @old is mapped * to the n-th set bit in @new. In the more general case, allowing * for the possibility that the weight 'w' of @new is less than the * weight of @old, map the position of the n-th set bit in @old to * the position of the m-th set bit in @new, where m == n % w. * * The positions of unset bits in @old are mapped to themselves * (the identity map). * * Apply the above specified mapping to bit position @oldbit, returning * the new bit position. * * For example, lets say that @old has bits 4 through 7 set, and * @new has bits 12 through 15 set. This defines the mapping of bit * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other * bit positions unchanged. So if say @oldbit is 5, then this routine * returns 13. */ int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits) { int w = bitmap_weight(new, bits); int n = bitmap_pos_to_ord(old, oldbit, bits); if (n < 0 || w == 0) return oldbit; else return find_nth_bit(new, bits, n % w); } EXPORT_SYMBOL(bitmap_bitremap); #ifdef CONFIG_NUMA /** * bitmap_onto - translate one bitmap relative to another * @dst: resulting translated bitmap * @orig: original untranslated bitmap * @relmap: bitmap relative to which translated * @bits: number of bits in each of these bitmaps * * Set the n-th bit of @dst iff there exists some m such that the * n-th bit of @relmap is set, the m-th bit of @orig is set, and * the n-th bit of @relmap is also the m-th _set_ bit of @relmap. * (If you understood the previous sentence the first time your * read it, you're overqualified for your current job.) * * In other words, @orig is mapped onto (surjectively) @dst, * using the map { <n, m> | the n-th bit of @relmap is the * m-th set bit of @relmap }. * * Any set bits in @orig above bit number W, where W is the * weight of (number of set bits in) @relmap are mapped nowhere. * In particular, if for all bits m set in @orig, m >= W, then * @dst will end up empty. In situations where the possibility * of such an empty result is not desired, one way to avoid it is * to use the bitmap_fold() operator, below, to first fold the * @orig bitmap over itself so that all its set bits x are in the * range 0 <= x < W. The bitmap_fold() operator does this by * setting the bit (m % W) in @dst, for each bit (m) set in @orig. * * Example [1] for bitmap_onto(): * Let's say @relmap has bits 30-39 set, and @orig has bits * 1, 3, 5, 7, 9 and 11 set. Then on return from this routine, * @dst will have bits 31, 33, 35, 37 and 39 set. * * When bit 0 is set in @orig, it means turn on the bit in * @dst corresponding to whatever is the first bit (if any) * that is turned on in @relmap. Since bit 0 was off in the * above example, we leave off that bit (bit 30) in @dst. * * When bit 1 is set in @orig (as in the above example), it * means turn on the bit in @dst corresponding to whatever * is the second bit that is turned on in @relmap. The second * bit in @relmap that was turned on in the above example was * bit 31, so we turned on bit 31 in @dst. * * Similarly, we turned on bits 33, 35, 37 and 39 in @dst, * because they were the 4th, 6th, 8th and 10th set bits * set in @relmap, and the 4th, 6th, 8th and 10th bits of * @orig (i.e. bits 3, 5, 7 and 9) were also set. * * When bit 11 is set in @orig, it means turn on the bit in * @dst corresponding to whatever is the twelfth bit that is * turned on in @relmap. In the above example, there were * only ten bits turned on in @relmap (30..39), so that bit * 11 was set in @orig had no affect on @dst. * * Example [2] for bitmap_fold() + bitmap_onto(): * Let's say @relmap has these ten bits set:: * * 40 41 42 43 45 48 53 61 74 95 * * (for the curious, that's 40 plus the first ten terms of the * Fibonacci sequence.) * * Further lets say we use the following code, invoking * bitmap_fold() then bitmap_onto, as suggested above to * avoid the possibility of an empty @dst result:: * * unsigned long *tmp; // a temporary bitmap's bits * * bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits); * bitmap_onto(dst, tmp, relmap, bits); * * Then this table shows what various values of @dst would be, for * various @orig's. I list the zero-based positions of each set bit. * The tmp column shows the intermediate result, as computed by * using bitmap_fold() to fold the @orig bitmap modulo ten * (the weight of @relmap): * * =============== ============== ================= * @orig tmp @dst * 0 0 40 * 1 1 41 * 9 9 95 * 10 0 40 [#f1]_ * 1 3 5 7 1 3 5 7 41 43 48 61 * 0 1 2 3 4 0 1 2 3 4 40 41 42 43 45 * 0 9 18 27 0 9 8 7 40 61 74 95 * 0 10 20 30 0 40 * 0 11 22 33 0 1 2 3 40 41 42 43 * 0 12 24 36 0 2 4 6 40 42 45 53 * 78 102 211 1 2 8 41 42 74 [#f1]_ * =============== ============== ================= * * .. [#f1] * * For these marked lines, if we hadn't first done bitmap_fold() * into tmp, then the @dst result would have been empty. * * If either of @orig or @relmap is empty (no set bits), then @dst * will be returned empty. * * If (as explained above) the only set bits in @orig are in positions * m where m >= W, (where W is the weight of @relmap) then @dst will * once again be returned empty. * * All bits in @dst not set by the above rule are cleared. */ void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits) { unsigned int n, m; /* same meaning as in above comment */ if (dst == orig) /* following doesn't handle inplace mappings */ return; bitmap_zero(dst, bits); /* * The following code is a more efficient, but less * obvious, equivalent to the loop: * for (m = 0; m < bitmap_weight(relmap, bits); m++) { * n = find_nth_bit(orig, bits, m); * if (test_bit(m, orig)) * set_bit(n, dst); * } */ m = 0; for_each_set_bit(n, relmap, bits) { /* m == bitmap_pos_to_ord(relmap, n, bits) */ if (test_bit(m, orig)) set_bit(n, dst); m++; } } /** * bitmap_fold - fold larger bitmap into smaller, modulo specified size * @dst: resulting smaller bitmap * @orig: original larger bitmap * @sz: specified size * @nbits: number of bits in each of these bitmaps * * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst. * Clear all other bits in @dst. See further the comment and * Example [2] for bitmap_onto() for why and how to use this. */ void bitmap_fold(unsigned long *dst, const unsigned long *orig, unsigned int sz, unsigned int nbits) { unsigned int oldbit; if (dst == orig) /* following doesn't handle inplace mappings */ return; bitmap_zero(dst, nbits); for_each_set_bit(oldbit, orig, nbits) set_bit(oldbit % sz, dst); } #endif /* CONFIG_NUMA */ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags) { return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long), flags); } EXPORT_SYMBOL(bitmap_alloc); unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags) { return bitmap_alloc(nbits, flags | __GFP_ZERO); } EXPORT_SYMBOL(bitmap_zalloc); unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node) { return kmalloc_array_node(BITS_TO_LONGS(nbits), sizeof(unsigned long), flags, node); } EXPORT_SYMBOL(bitmap_alloc_node); unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node) { return bitmap_alloc_node(nbits, flags | __GFP_ZERO, node); } EXPORT_SYMBOL(bitmap_zalloc_node); void bitmap_free(const unsigned long *bitmap) { kfree(bitmap); } EXPORT_SYMBOL(bitmap_free); static void devm_bitmap_free(void *data) { unsigned long *bitmap = data; bitmap_free(bitmap); } unsigned long *devm_bitmap_alloc(struct device *dev, unsigned int nbits, gfp_t flags) { unsigned long *bitmap; int ret; bitmap = bitmap_alloc(nbits, flags); if (!bitmap) return NULL; ret = devm_add_action_or_reset(dev, devm_bitmap_free, bitmap); if (ret) return NULL; return bitmap; } EXPORT_SYMBOL_GPL(devm_bitmap_alloc); unsigned long *devm_bitmap_zalloc(struct device *dev, unsigned int nbits, gfp_t flags) { return devm_bitmap_alloc(dev, nbits, flags | __GFP_ZERO); } EXPORT_SYMBOL_GPL(devm_bitmap_zalloc); #if BITS_PER_LONG == 64 /** * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap * @bitmap: array of unsigned longs, the destination bitmap * @buf: array of u32 (in host byte order), the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits) { unsigned int i, halfwords; halfwords = DIV_ROUND_UP(nbits, 32); for (i = 0; i < halfwords; i++) { bitmap[i/2] = (unsigned long) buf[i]; if (++i < halfwords) bitmap[i/2] |= ((unsigned long) buf[i]) << 32; } /* Clear tail bits in last word beyond nbits. */ if (nbits % BITS_PER_LONG) bitmap[(halfwords - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits); } EXPORT_SYMBOL(bitmap_from_arr32); /** * bitmap_to_arr32 - copy the contents of bitmap to a u32 array of bits * @buf: array of u32 (in host byte order), the dest bitmap * @bitmap: array of unsigned longs, the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits) { unsigned int i, halfwords; halfwords = DIV_ROUND_UP(nbits, 32); for (i = 0; i < halfwords; i++) { buf[i] = (u32) (bitmap[i/2] & UINT_MAX); if (++i < halfwords) buf[i] = (u32) (bitmap[i/2] >> 32); } /* Clear tail bits in last element of array beyond nbits. */ if (nbits % BITS_PER_LONG) buf[halfwords - 1] &= (u32) (UINT_MAX >> ((-nbits) & 31)); } EXPORT_SYMBOL(bitmap_to_arr32); #endif #if BITS_PER_LONG == 32 /** * bitmap_from_arr64 - copy the contents of u64 array of bits to bitmap * @bitmap: array of unsigned longs, the destination bitmap * @buf: array of u64 (in host byte order), the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits) { int n; for (n = nbits; n > 0; n -= 64) { u64 val = *buf++; *bitmap++ = val; if (n > 32) *bitmap++ = val >> 32; } /* * Clear tail bits in the last word beyond nbits. * * Negative index is OK because here we point to the word next * to the last word of the bitmap, except for nbits == 0, which * is tested implicitly. */ if (nbits % BITS_PER_LONG) bitmap[-1] &= BITMAP_LAST_WORD_MASK(nbits); } EXPORT_SYMBOL(bitmap_from_arr64); /** * bitmap_to_arr64 - copy the contents of bitmap to a u64 array of bits * @buf: array of u64 (in host byte order), the dest bitmap * @bitmap: array of unsigned longs, the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits) { const unsigned long *end = bitmap + BITS_TO_LONGS(nbits); while (bitmap < end) { *buf = *bitmap++; if (bitmap < end) *buf |= (u64)(*bitmap++) << 32; buf++; } /* Clear tail bits in the last element of array beyond nbits. */ if (nbits % 64) buf[-1] &= GENMASK_ULL((nbits - 1) % 64, 0); } EXPORT_SYMBOL(bitmap_to_arr64); #endif |
| 45 4 42 45 45 46 45 242 4 242 241 242 242 4 4 4 4 22 4 3 38 10 45 46 45 45 46 45 46 46 46 4 4 4 4 4 22 22 22 22 22 22 22 22 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 | // SPDX-License-Identifier: GPL-2.0 /* * Kernel internal timers * * Copyright (C) 1991, 1992 Linus Torvalds * * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. * * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 * "A Kernel Model for Precision Timekeeping" by Dave Mills * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to * serialize accesses to xtime/lost_ticks). * Copyright (C) 1998 Andrea Arcangeli * 1999-03-10 Improved NTP compatibility by Ulrich Windl * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love * 2000-10-05 Implemented scalable SMP per-CPU timer handling. * Copyright (C) 2000, 2001, 2002 Ingo Molnar * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar */ #include <linux/kernel_stat.h> #include <linux/export.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/pid_namespace.h> #include <linux/notifier.h> #include <linux/thread_info.h> #include <linux/time.h> #include <linux/jiffies.h> #include <linux/posix-timers.h> #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> #include <linux/irq_work.h> #include <linux/sched/sysctl.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/slab.h> #include <linux/compat.h> #include <linux/random.h> #include <linux/sysctl.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/div64.h> #include <asm/timex.h> #include <asm/io.h> #include "tick-internal.h" #include "timer_migration.h" #define CREATE_TRACE_POINTS #include <trace/events/timer.h> __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); /* * The timer wheel has LVL_DEPTH array levels. Each level provides an array of * LVL_SIZE buckets. Each level is driven by its own clock and therefore each * level has a different granularity. * * The level granularity is: LVL_CLK_DIV ^ level * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) * * The array level of a newly armed timer depends on the relative expiry * time. The farther the expiry time is away the higher the array level and * therefore the granularity becomes. * * Contrary to the original timer wheel implementation, which aims for 'exact' * expiry of the timers, this implementation removes the need for recascading * the timers into the lower array levels. The previous 'classic' timer wheel * implementation of the kernel already violated the 'exact' expiry by adding * slack to the expiry time to provide batched expiration. The granularity * levels provide implicit batching. * * This is an optimization of the original timer wheel implementation for the * majority of the timer wheel use cases: timeouts. The vast majority of * timeout timers (networking, disk I/O ...) are canceled before expiry. If * the timeout expires it indicates that normal operation is disturbed, so it * does not matter much whether the timeout comes with a slight delay. * * The only exception to this are networking timers with a small expiry * time. They rely on the granularity. Those fit into the first wheel level, * which has HZ granularity. * * We don't have cascading anymore. timers with a expiry time above the * capacity of the last wheel level are force expired at the maximum timeout * value of the last wheel level. From data sampling we know that the maximum * value observed is 5 days (network connection tracking), so this should not * be an issue. * * The currently chosen array constants values are a good compromise between * array size and granularity. * * This results in the following granularity and range levels: * * HZ 1000 steps * Level Offset Granularity Range * 0 0 1 ms 0 ms - 63 ms * 1 64 8 ms 64 ms - 511 ms * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) * * HZ 300 * Level Offset Granularity Range * 0 0 3 ms 0 ms - 210 ms * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) * * HZ 250 * Level Offset Granularity Range * 0 0 4 ms 0 ms - 255 ms * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) * * HZ 100 * Level Offset Granularity Range * 0 0 10 ms 0 ms - 630 ms * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) */ /* Clock divisor for the next level */ #define LVL_CLK_SHIFT 3 #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) #define LVL_CLK_MASK (LVL_CLK_DIV - 1) #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) #define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) /* * The time start value for each level to select the bucket at enqueue * time. We start from the last possible delta of the previous level * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). */ #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) /* Size of each clock level */ #define LVL_BITS 6 #define LVL_SIZE (1UL << LVL_BITS) #define LVL_MASK (LVL_SIZE - 1) #define LVL_OFFS(n) ((n) * LVL_SIZE) /* Level depth */ #if HZ > 100 # define LVL_DEPTH 9 # else # define LVL_DEPTH 8 #endif /* The cutoff (max. capacity of the wheel) */ #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) /* * The resulting wheel size. If NOHZ is configured we allocate two * wheels so we have a separate storage for the deferrable timers. */ #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) #ifdef CONFIG_NO_HZ_COMMON /* * If multiple bases need to be locked, use the base ordering for lock * nesting, i.e. lowest number first. */ # define NR_BASES 3 # define BASE_LOCAL 0 # define BASE_GLOBAL 1 # define BASE_DEF 2 #else # define NR_BASES 1 # define BASE_LOCAL 0 # define BASE_GLOBAL 0 # define BASE_DEF 0 #endif /** * struct timer_base - Per CPU timer base (number of base depends on config) * @lock: Lock protecting the timer_base * @running_timer: When expiring timers, the lock is dropped. To make * sure not to race against deleting/modifying a * currently running timer, the pointer is set to the * timer, which expires at the moment. If no timer is * running, the pointer is NULL. * @expiry_lock: PREEMPT_RT only: Lock is taken in softirq around * timer expiry callback execution and when trying to * delete a running timer and it wasn't successful in * the first glance. It prevents priority inversion * when callback was preempted on a remote CPU and a * caller tries to delete the running timer. It also * prevents a life lock, when the task which tries to * delete a timer preempted the softirq thread which * is running the timer callback function. * @timer_waiters: PREEMPT_RT only: Tells, if there is a waiter * waiting for the end of the timer callback function * execution. * @clk: clock of the timer base; is updated before enqueue * of a timer; during expiry, it is 1 offset ahead of * jiffies to avoid endless requeuing to current * jiffies * @next_expiry: expiry value of the first timer; it is updated when * finding the next timer and during enqueue; the * value is not valid, when next_expiry_recalc is set * @cpu: Number of CPU the timer base belongs to * @next_expiry_recalc: States, whether a recalculation of next_expiry is * required. Value is set true, when a timer was * deleted. * @is_idle: Is set, when timer_base is idle. It is triggered by NOHZ * code. This state is only used in standard * base. Deferrable timers, which are enqueued remotely * never wake up an idle CPU. So no matter of supporting it * for this base. * @timers_pending: Is set, when a timer is pending in the base. It is only * reliable when next_expiry_recalc is not set. * @pending_map: bitmap of the timer wheel; each bit reflects a * bucket of the wheel. When a bit is set, at least a * single timer is enqueued in the related bucket. * @vectors: Array of lists; Each array member reflects a bucket * of the timer wheel. The list contains all timers * which are enqueued into a specific bucket. */ struct timer_base { raw_spinlock_t lock; struct timer_list *running_timer; #ifdef CONFIG_PREEMPT_RT spinlock_t expiry_lock; atomic_t timer_waiters; #endif unsigned long clk; unsigned long next_expiry; unsigned int cpu; bool next_expiry_recalc; bool is_idle; bool timers_pending; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); #ifdef CONFIG_NO_HZ_COMMON static DEFINE_STATIC_KEY_FALSE(timers_nohz_active); static DEFINE_MUTEX(timer_keys_mutex); static void timer_update_keys(struct work_struct *work); static DECLARE_WORK(timer_update_work, timer_update_keys); #ifdef CONFIG_SMP static unsigned int sysctl_timer_migration = 1; DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); static void timers_update_migration(void) { if (sysctl_timer_migration && tick_nohz_active) static_branch_enable(&timers_migration_enabled); else static_branch_disable(&timers_migration_enabled); } #ifdef CONFIG_SYSCTL static int timer_migration_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&timer_keys_mutex); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) timers_update_migration(); mutex_unlock(&timer_keys_mutex); return ret; } static const struct ctl_table timer_sysctl[] = { { .procname = "timer_migration", .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static int __init timer_sysctl_init(void) { register_sysctl("kernel", timer_sysctl); return 0; } device_initcall(timer_sysctl_init); #endif /* CONFIG_SYSCTL */ #else /* CONFIG_SMP */ static inline void timers_update_migration(void) { } #endif /* !CONFIG_SMP */ static void timer_update_keys(struct work_struct *work) { mutex_lock(&timer_keys_mutex); timers_update_migration(); static_branch_enable(&timers_nohz_active); mutex_unlock(&timer_keys_mutex); } void timers_update_nohz(void) { schedule_work(&timer_update_work); } static inline bool is_timers_nohz_active(void) { return static_branch_unlikely(&timers_nohz_active); } #else static inline bool is_timers_nohz_active(void) { return false; } #endif /* NO_HZ_COMMON */ static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) { int rem; unsigned long original = j; /* * We don't want all cpus firing their timers at once hitting the * same lock or cachelines, so we skew each extra cpu with an extra * 3 jiffies. This 3 jiffies came originally from the mm/ code which * already did this. * The skew is done by adding 3*cpunr, then round, then subtract this * extra offset again. */ j += cpu * 3; rem = j % HZ; /* * If the target jiffy is just after a whole second (which can happen * due to delays of the timer irq, long irq off times etc etc) then * we should round down to the whole second, not up. Use 1/4th second * as cutoff for this rounding as an extreme upper bound for this. * But never round down if @force_up is set. */ if (rem < HZ/4 && !force_up) /* round down */ j = j - rem; else /* round up */ j = j - rem + HZ; /* now that we have rounded, subtract the extra skew again */ j -= cpu * 3; /* * Make sure j is still in the future. Otherwise return the * unmodified value. */ return time_is_after_jiffies(j) ? j : original; } /** * __round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * __round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The exact rounding is skewed for each processor to avoid all * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. * * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies(unsigned long j, int cpu) { return round_jiffies_common(j, cpu, false); } EXPORT_SYMBOL_GPL(__round_jiffies); /** * __round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * __round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The exact rounding is skewed for each processor to avoid all * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. * * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies_relative(unsigned long j, int cpu) { unsigned long j0 = jiffies; /* Use j0 because jiffies might change while we run */ return round_jiffies_common(j + j0, cpu, false) - j0; } EXPORT_SYMBOL_GPL(__round_jiffies_relative); /** * round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded * * round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies(unsigned long j) { return round_jiffies_common(j, raw_smp_processor_id(), false); } EXPORT_SYMBOL_GPL(round_jiffies); /** * round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * * round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies_relative(unsigned long j) { return __round_jiffies_relative(j, raw_smp_processor_id()); } EXPORT_SYMBOL_GPL(round_jiffies_relative); /** * __round_jiffies_up - function to round jiffies up to a full second * @j: the time in (absolute) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * This is the same as __round_jiffies() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long __round_jiffies_up(unsigned long j, int cpu) { return round_jiffies_common(j, cpu, true); } EXPORT_SYMBOL_GPL(__round_jiffies_up); /** * __round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * This is the same as __round_jiffies_relative() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) { unsigned long j0 = jiffies; /* Use j0 because jiffies might change while we run */ return round_jiffies_common(j + j0, cpu, true) - j0; } EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); /** * round_jiffies_up - function to round jiffies up to a full second * @j: the time in (absolute) jiffies that should be rounded * * This is the same as round_jiffies() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long round_jiffies_up(unsigned long j) { return round_jiffies_common(j, raw_smp_processor_id(), true); } EXPORT_SYMBOL_GPL(round_jiffies_up); /** * round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * * This is the same as round_jiffies_relative() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long round_jiffies_up_relative(unsigned long j) { return __round_jiffies_up_relative(j, raw_smp_processor_id()); } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); static inline unsigned int timer_get_idx(struct timer_list *timer) { return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; } static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) { timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | idx << TIMER_ARRAYSHIFT; } /* * Helper function to calculate the array index for a given expiry * time. */ static inline unsigned calc_index(unsigned long expires, unsigned lvl, unsigned long *bucket_expiry) { /* * The timer wheel has to guarantee that a timer does not fire * early. Early expiry can happen due to: * - Timer is armed at the edge of a tick * - Truncation of the expiry time in the outer wheel levels * * Round up with level granularity to prevent this. */ expires = (expires >> LVL_SHIFT(lvl)) + 1; *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } static int calc_wheel_index(unsigned long expires, unsigned long clk, unsigned long *bucket_expiry) { unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { idx = calc_index(expires, 0, bucket_expiry); } else if (delta < LVL_START(2)) { idx = calc_index(expires, 1, bucket_expiry); } else if (delta < LVL_START(3)) { idx = calc_index(expires, 2, bucket_expiry); } else if (delta < LVL_START(4)) { idx = calc_index(expires, 3, bucket_expiry); } else if (delta < LVL_START(5)) { idx = calc_index(expires, 4, bucket_expiry); } else if (delta < LVL_START(6)) { idx = calc_index(expires, 5, bucket_expiry); } else if (delta < LVL_START(7)) { idx = calc_index(expires, 6, bucket_expiry); } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { idx = calc_index(expires, 7, bucket_expiry); } else if ((long) delta < 0) { idx = clk & LVL_MASK; *bucket_expiry = clk; } else { /* * Force expire obscene large timeouts to expire at the * capacity limit of the wheel. */ if (delta >= WHEEL_TIMEOUT_CUTOFF) expires = clk + WHEEL_TIMEOUT_MAX; idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); } return idx; } static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { /* * Deferrable timers do not prevent the CPU from entering dynticks and * are not taken into account on the idle/nohz_full path. An IPI when a * new deferrable timer is enqueued will wake up the remote CPU but * nothing will be done with the deferrable timer base. Therefore skip * the remote IPI for deferrable timers completely. */ if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE) return; /* * We might have to IPI the remote CPU if the base is idle and the * timer is pinned. If it is a non pinned timer, it is only queued * on the remote CPU, when timer was running during queueing. Then * everything is handled by remote CPU anyway. If the other CPU is * on the way to idle then it can't set base->is_idle as we hold * the base lock: */ if (base->is_idle) { WARN_ON_ONCE(!(timer->flags & TIMER_PINNED || tick_nohz_full_cpu(base->cpu))); wake_up_nohz_cpu(base->cpu); } } /* * Enqueue the timer into the hash bucket, mark it pending in * the bitmap, store the index in the timer flags then wake up * the target CPU if needed. */ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, unsigned int idx, unsigned long bucket_expiry) { hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); trace_timer_start(timer, bucket_expiry); /* * Check whether this is the new first expiring timer. The * effective expiry time of the timer is required here * (bucket_expiry) instead of timer->expires. */ if (time_before(bucket_expiry, base->next_expiry)) { /* * Set the next expiry time and kick the CPU so it * can reevaluate the wheel: */ WRITE_ONCE(base->next_expiry, bucket_expiry); base->timers_pending = true; base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); } } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { unsigned long bucket_expiry; unsigned int idx; idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); enqueue_timer(base, timer, idx, bucket_expiry); } #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr timer_debug_descr; struct timer_hint { void (*function)(struct timer_list *t); long offset; }; #define TIMER_HINT(fn, container, timr, hintfn) \ { \ .function = fn, \ .offset = offsetof(container, hintfn) - \ offsetof(container, timr) \ } static const struct timer_hint timer_hints[] = { TIMER_HINT(delayed_work_timer_fn, struct delayed_work, timer, work.func), TIMER_HINT(kthread_delayed_work_timer_fn, struct kthread_delayed_work, timer, work.func), }; static void *timer_debug_hint(void *addr) { struct timer_list *timer = addr; int i; for (i = 0; i < ARRAY_SIZE(timer_hints); i++) { if (timer_hints[i].function == timer->function) { void (**fn)(void) = addr + timer_hints[i].offset; return *fn; } } return timer->function; } static bool timer_is_static_object(void *addr) { struct timer_list *timer = addr; return (timer->entry.pprev == NULL && timer->entry.next == TIMER_ENTRY_STATIC); } /* * timer_fixup_init is called when: * - an active object is initialized */ static bool timer_fixup_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_init(timer, &timer_debug_descr); return true; default: return false; } } /* Stub timer callback for improperly used timers. */ static void stub_timer(struct timer_list *unused) { WARN_ON(1); } /* * timer_fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: timer_setup(timer, stub_timer, 0); return true; case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * timer_fixup_free is called when: * - an active object is freed */ static bool timer_fixup_free(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_free(timer, &timer_debug_descr); return true; default: return false; } } /* * timer_fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: timer_setup(timer, stub_timer, 0); return true; default: return false; } } static const struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, .is_static_object = timer_is_static_object, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, .fixup_assert_init = timer_fixup_assert_init, }; static inline void debug_timer_init(struct timer_list *timer) { debug_object_init(timer, &timer_debug_descr); } static inline void debug_timer_activate(struct timer_list *timer) { debug_object_activate(timer, &timer_debug_descr); } static inline void debug_timer_deactivate(struct timer_list *timer) { debug_object_deactivate(timer, &timer_debug_descr); } static inline void debug_timer_assert_init(struct timer_list *timer) { debug_object_assert_init(timer, &timer_debug_descr); } static void do_init_timer(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); void init_timer_on_stack_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL_GPL(init_timer_on_stack_key); void destroy_timer_on_stack(struct timer_list *timer) { debug_object_free(timer, &timer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_timer_on_stack); #else static inline void debug_timer_init(struct timer_list *timer) { } static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } static inline void debug_timer_assert_init(struct timer_list *timer) { } #endif static inline void debug_init(struct timer_list *timer) { debug_timer_init(timer); trace_timer_init(timer); } static inline void debug_deactivate(struct timer_list *timer) { debug_timer_deactivate(timer); trace_timer_cancel(timer); } static inline void debug_assert_init(struct timer_list *timer) { debug_timer_assert_init(timer); } static void do_init_timer(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { timer->entry.pprev = NULL; timer->function = func; if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS)) flags &= TIMER_INIT_FLAGS; timer->flags = flags | raw_smp_processor_id(); lockdep_init_map(&timer->lockdep_map, name, key, 0); } /** * init_timer_key - initialize a timer * @timer: the timer to be initialized * @func: timer callback function * @flags: timer flags * @name: name of the timer * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * * init_timer_key() must be done to a timer prior to calling *any* of the * other timer functions. */ void init_timer_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL(init_timer_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct hlist_node *entry = &timer->entry; debug_deactivate(timer); __hlist_del(entry); if (clear_pending) entry->pprev = NULL; entry->next = LIST_POISON2; } static int detach_if_pending(struct timer_list *timer, struct timer_base *base, bool clear_pending) { unsigned idx = timer_get_idx(timer); if (!timer_pending(timer)) return 0; if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { __clear_bit(idx, base->pending_map); base->next_expiry_recalc = true; } detach_timer(timer, clear_pending); return 1; } static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) index = BASE_DEF; return per_cpu_ptr(&timer_bases[index], cpu); } static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) index = BASE_DEF; return this_cpu_ptr(&timer_bases[index]); } static inline struct timer_base *get_timer_base(u32 tflags) { return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); } static inline void __forward_timer_base(struct timer_base *base, unsigned long basej) { /* * Check whether we can forward the base. We can only do that when * @basej is past base->clk otherwise we might rewind base->clk. */ if (time_before_eq(basej, base->clk)) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ if (time_after(base->next_expiry, basej)) { base->clk = basej; } else { if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) return; base->clk = base->next_expiry; } } static inline void forward_timer_base(struct timer_base *base) { __forward_timer_base(base, READ_ONCE(jiffies)); } /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means * that all timers which are tied to this base are locked, and the base itself * is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found in the base->vectors array. * * When a timer is migrating then the TIMER_MIGRATING flag is set and we need * to wait until the migration is done. */ static struct timer_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { for (;;) { struct timer_base *base; u32 tf; /* * We need to use READ_ONCE() here, otherwise the compiler * might re-read @tf between the check for TIMER_MIGRATING * and spin_lock(). */ tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); raw_spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; raw_spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); } } #define MOD_TIMER_PENDING_ONLY 0x01 #define MOD_TIMER_REDUCE 0x02 #define MOD_TIMER_NOTPENDING 0x04 static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { unsigned long clk = 0, flags, bucket_expiry; struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; int ret = 0; debug_assert_init(timer); /* * This is a common optimization triggered by the networking code - if * the timer is re-modified to have the same timeout or ends up in the * same array bucket then just return: */ if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) { /* * The downside of this optimization is that it can result in * larger granularity than you would get from adding a new * timer with this expiry. */ long diff = timer->expires - expires; if (!diff) return 1; if (options & MOD_TIMER_REDUCE && diff <= 0) return 1; /* * We lock timer base and calculate the bucket index right * here. If the timer ends up in the same bucket, then we * just update the expiry time and avoid the whole * dequeue/enqueue dance. */ base = lock_timer_base(timer, &flags); /* * Has @timer been shutdown? This needs to be evaluated * while holding base lock to prevent a race against the * shutdown code. */ if (!timer->function) goto out_unlock; forward_timer_base(base); if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && time_before_eq(timer->expires, expires)) { ret = 1; goto out_unlock; } clk = base->clk; idx = calc_wheel_index(expires, clk, &bucket_expiry); /* * Retrieve and compare the array index of the pending * timer. If it matches set the expiry to the new value so a * subsequent call will exit in the expires check above. */ if (idx == timer_get_idx(timer)) { if (!(options & MOD_TIMER_REDUCE)) timer->expires = expires; else if (time_after(timer->expires, expires)) timer->expires = expires; ret = 1; goto out_unlock; } } else { base = lock_timer_base(timer, &flags); /* * Has @timer been shutdown? This needs to be evaluated * while holding base lock to prevent a race against the * shutdown code. */ if (!timer->function) goto out_unlock; forward_timer_base(base); } ret = detach_if_pending(timer, base, false); if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; new_base = get_timer_this_cpu_base(timer->flags); if (base != new_base) { /* * We are trying to schedule the timer on the new base. * However we can't change timer's base while it is running, * otherwise timer_delete_sync() can't detect that the timer's * handler yet has not finished. This also guarantees that the * timer is serialized wrt itself. */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ timer->flags |= TIMER_MIGRATING; raw_spin_unlock(&base->lock); base = new_base; raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); forward_timer_base(base); } } debug_timer_activate(timer); timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance * between calculating 'idx' and possibly switching the base, only * enqueue_timer() is required. Otherwise we need to (re)calculate * the wheel index via internal_add_timer(). */ if (idx != UINT_MAX && clk == base->clk) enqueue_timer(base, timer, idx, bucket_expiry); else internal_add_timer(base, timer); out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } /** * mod_timer_pending - Modify a pending timer's timeout * @timer: The pending timer to be modified * @expires: New absolute timeout in jiffies * * mod_timer_pending() is the same for pending timers as mod_timer(), but * will not activate inactive timers. * * If @timer->function == NULL then the start operation is silently * discarded. * * Return: * * %0 - The timer was inactive and not modified or was in * shutdown state and the operation was discarded * * %1 - The timer was active and requeued to expire at @expires */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY); } EXPORT_SYMBOL(mod_timer_pending); /** * mod_timer - Modify a timer's timeout * @timer: The timer to be modified * @expires: New absolute timeout in jiffies * * mod_timer(timer, expires) is equivalent to: * * del_timer(timer); timer->expires = expires; add_timer(timer); * * mod_timer() is more efficient than the above open coded sequence. In * case that the timer is inactive, the del_timer() part is a NOP. The * timer is in any case activated with the new expiry time @expires. * * Note that if there are multiple unserialized concurrent users of the * same timer, then mod_timer() is the only safe way to modify the timeout, * since add_timer() cannot modify an already running timer. * * If @timer->function == NULL then the start operation is silently * discarded. In this case the return value is 0 and meaningless. * * Return: * * %0 - The timer was inactive and started or was in shutdown * state and the operation was discarded * * %1 - The timer was active and requeued to expire at @expires or * the timer was active and not modified because @expires did * not change the effective expiry time */ int mod_timer(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, 0); } EXPORT_SYMBOL(mod_timer); /** * timer_reduce - Modify a timer's timeout if it would reduce the timeout * @timer: The timer to be modified * @expires: New absolute timeout in jiffies * * timer_reduce() is very similar to mod_timer(), except that it will only * modify an enqueued timer if that would reduce the expiration time. If * @timer is not enqueued it starts the timer. * * If @timer->function == NULL then the start operation is silently * discarded. * * Return: * * %0 - The timer was inactive and started or was in shutdown * state and the operation was discarded * * %1 - The timer was active and requeued to expire at @expires or * the timer was active and not modified because @expires * did not change the effective expiry time such that the * timer would expire earlier than already scheduled */ int timer_reduce(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, MOD_TIMER_REDUCE); } EXPORT_SYMBOL(timer_reduce); /** * add_timer - Start a timer * @timer: The timer to be started * * Start @timer to expire at @timer->expires in the future. @timer->expires * is the absolute expiry time measured in 'jiffies'. When the timer expires * timer->function(timer) will be invoked from soft interrupt context. * * The @timer->expires and @timer->function fields must be set prior * to calling this function. * * If @timer->function == NULL then the start operation is silently * discarded. * * If @timer->expires is already in the past @timer will be queued to * expire at the next timer tick. * * This can only operate on an inactive timer. Attempts to invoke this on * an active timer are rejected with a warning. */ void add_timer(struct timer_list *timer) { if (WARN_ON_ONCE(timer_pending(timer))) return; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer); /** * add_timer_local() - Start a timer on the local CPU * @timer: The timer to be started * * Same as add_timer() except that the timer flag TIMER_PINNED is set. * * See add_timer() for further details. */ void add_timer_local(struct timer_list *timer) { if (WARN_ON_ONCE(timer_pending(timer))) return; timer->flags |= TIMER_PINNED; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer_local); /** * add_timer_global() - Start a timer without TIMER_PINNED flag set * @timer: The timer to be started * * Same as add_timer() except that the timer flag TIMER_PINNED is unset. * * See add_timer() for further details. */ void add_timer_global(struct timer_list *timer) { if (WARN_ON_ONCE(timer_pending(timer))) return; timer->flags &= ~TIMER_PINNED; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer_global); /** * add_timer_on - Start a timer on a particular CPU * @timer: The timer to be started * @cpu: The CPU to start it on * * Same as add_timer() except that it starts the timer on the given CPU and * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in * the next round, add_timer_global() should be used instead as it unsets * the TIMER_PINNED flag. * * See add_timer() for further details. */ void add_timer_on(struct timer_list *timer, int cpu) { struct timer_base *new_base, *base; unsigned long flags; debug_assert_init(timer); if (WARN_ON_ONCE(timer_pending(timer))) return; /* Make sure timer flags have TIMER_PINNED flag set */ timer->flags |= TIMER_PINNED; new_base = get_timer_cpu_base(timer->flags, cpu); /* * If @timer was on a different CPU, it should be migrated with the * old base locked to prevent other operations proceeding with the * wrong base locked. See lock_timer_base(). */ base = lock_timer_base(timer, &flags); /* * Has @timer been shutdown? This needs to be evaluated while * holding base lock to prevent a race against the shutdown code. */ if (!timer->function) goto out_unlock; if (base != new_base) { timer->flags |= TIMER_MIGRATING; raw_spin_unlock(&base->lock); base = new_base; raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } forward_timer_base(base); debug_timer_activate(timer); internal_add_timer(base, timer); out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); /** * __timer_delete - Internal function: Deactivate a timer * @timer: The timer to be deactivated * @shutdown: If true, this indicates that the timer is about to be * shutdown permanently. * * If @shutdown is true then @timer->function is set to NULL under the * timer base lock which prevents further rearming of the time. In that * case any attempt to rearm @timer after this function returns will be * silently ignored. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ static int __timer_delete(struct timer_list *timer, bool shutdown) { struct timer_base *base; unsigned long flags; int ret = 0; debug_assert_init(timer); /* * If @shutdown is set then the lock has to be taken whether the * timer is pending or not to protect against a concurrent rearm * which might hit between the lockless pending check and the lock * acquisition. By taking the lock it is ensured that such a newly * enqueued timer is dequeued and cannot end up with * timer->function == NULL in the expiry code. * * If timer->function is currently executed, then this makes sure * that the callback cannot requeue the timer. */ if (timer_pending(timer) || shutdown) { base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, true); if (shutdown) timer->function = NULL; raw_spin_unlock_irqrestore(&base->lock, flags); } return ret; } /** * timer_delete - Deactivate a timer * @timer: The timer to be deactivated * * The function only deactivates a pending timer, but contrary to * timer_delete_sync() it does not take into account whether the timer's * callback function is concurrently executed on a different CPU or not. * It neither prevents rearming of the timer. If @timer can be rearmed * concurrently then the return value of this function is meaningless. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ int timer_delete(struct timer_list *timer) { return __timer_delete(timer, false); } EXPORT_SYMBOL(timer_delete); /** * timer_shutdown - Deactivate a timer and prevent rearming * @timer: The timer to be deactivated * * The function does not wait for an eventually running timer callback on a * different CPU but it prevents rearming of the timer. Any attempt to arm * @timer after this function returns will be silently ignored. * * This function is useful for teardown code and should only be used when * timer_shutdown_sync() cannot be invoked due to locking or context constraints. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending */ int timer_shutdown(struct timer_list *timer) { return __timer_delete(timer, true); } EXPORT_SYMBOL_GPL(timer_shutdown); /** * __try_to_del_timer_sync - Internal function: Try to deactivate a timer * @timer: Timer to deactivate * @shutdown: If true, this indicates that the timer is about to be * shutdown permanently. * * If @shutdown is true then @timer->function is set to NULL under the * timer base lock which prevents further rearming of the timer. Any * attempt to rearm @timer after this function returns will be silently * ignored. * * This function cannot guarantee that the timer cannot be rearmed * right after dropping the base lock if @shutdown is false. That * needs to be prevented by the calling code if necessary. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) { struct timer_base *base; unsigned long flags; int ret = -1; debug_assert_init(timer); base = lock_timer_base(timer, &flags); if (base->running_timer != timer) ret = detach_if_pending(timer, base, true); if (shutdown) timer->function = NULL; raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: Timer to deactivate * * This function tries to deactivate a timer. On success the timer is not * queued and the timer callback function is not running on any CPU. * * This function does not guarantee that the timer cannot be rearmed right * after dropping the base lock. That needs to be prevented by the calling * code if necessary. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ int try_to_del_timer_sync(struct timer_list *timer) { return __try_to_del_timer_sync(timer, false); } EXPORT_SYMBOL(try_to_del_timer_sync); #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) { spin_lock_init(&base->expiry_lock); } static inline void timer_base_lock_expiry(struct timer_base *base) { spin_lock(&base->expiry_lock); } static inline void timer_base_unlock_expiry(struct timer_base *base) { spin_unlock(&base->expiry_lock); } /* * The counterpart to del_timer_wait_running(). * * If there is a waiter for base->expiry_lock, then it was waiting for the * timer callback to finish. Drop expiry_lock and reacquire it. That allows * the waiter to acquire the lock and make progress. */ static void timer_sync_wait_running(struct timer_base *base) __releases(&base->lock) __releases(&base->expiry_lock) __acquires(&base->expiry_lock) __acquires(&base->lock) { if (atomic_read(&base->timer_waiters)) { raw_spin_unlock_irq(&base->lock); spin_unlock(&base->expiry_lock); spin_lock(&base->expiry_lock); raw_spin_lock_irq(&base->lock); } } /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion, if the softirq thread on a remote CPU * got preempted, and it prevents a life lock when the task which tries to * delete a timer preempted the softirq thread running the timer callback * function. */ static void del_timer_wait_running(struct timer_list *timer) { u32 tf; tf = READ_ONCE(timer->flags); if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { struct timer_base *base = get_timer_base(tf); /* * Mark the base as contended and grab the expiry lock, * which is held by the softirq across the timer * callback. Drop the lock immediately so the softirq can * expire the next timer. In theory the timer could already * be running again, but that's more than unlikely and just * causes another wait loop. */ atomic_inc(&base->timer_waiters); spin_lock_bh(&base->expiry_lock); atomic_dec(&base->timer_waiters); spin_unlock_bh(&base->expiry_lock); } } #else static inline void timer_base_init_expiry_lock(struct timer_base *base) { } static inline void timer_base_lock_expiry(struct timer_base *base) { } static inline void timer_base_unlock_expiry(struct timer_base *base) { } static inline void timer_sync_wait_running(struct timer_base *base) { } static inline void del_timer_wait_running(struct timer_list *timer) { } #endif /** * __timer_delete_sync - Internal function: Deactivate a timer and wait * for the handler to finish. * @timer: The timer to be deactivated * @shutdown: If true, @timer->function will be set to NULL under the * timer base lock which prevents rearming of @timer * * If @shutdown is not set the timer can be rearmed later. If the timer can * be rearmed concurrently, i.e. after dropping the base lock then the * return value is meaningless. * * If @shutdown is set then @timer->function is set to NULL under timer * base lock which prevents rearming of the timer. Any attempt to rearm * a shutdown timer is silently ignored. * * If the timer should be reused after shutdown it has to be initialized * again. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ static int __timer_delete_sync(struct timer_list *timer, bool shutdown) { int ret; #ifdef CONFIG_LOCKDEP unsigned long flags; /* * If lockdep gives a backtrace here, please reference * the synchronization rules above. */ local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); local_irq_restore(flags); #endif /* * don't use it in hardirq context, because it * could lead to deadlock. */ WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE)); /* * Must be able to sleep on PREEMPT_RT because of the slowpath in * del_timer_wait_running(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) lockdep_assert_preemption_enabled(); do { ret = __try_to_del_timer_sync(timer, shutdown); if (unlikely(ret < 0)) { del_timer_wait_running(timer); cpu_relax(); } } while (ret < 0); return ret; } /** * timer_delete_sync - Deactivate a timer and wait for the handler to finish. * @timer: The timer to be deactivated * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from * interrupt contexts unless the timer is an irqsafe one. The caller must * not hold locks which would prevent completion of the timer's callback * function. The timer's handler must not call add_timer_on(). Upon exit * the timer is not queued and the handler is not running on any CPU. * * For !irqsafe timers, the caller must not hold locks that are held in * interrupt context. Even if the lock has nothing to do with the timer in * question. Here's why:: * * CPU0 CPU1 * ---- ---- * <SOFTIRQ> * call_timer_fn(); * base->running_timer = mytimer; * spin_lock_irq(somelock); * <IRQ> * spin_lock(somelock); * timer_delete_sync(mytimer); * while (base->running_timer == mytimer); * * Now timer_delete_sync() will never return and never release somelock. * The interrupt on the other CPU is waiting to grab somelock but it has * interrupted the softirq that CPU0 is waiting to finish. * * This function cannot guarantee that the timer is not rearmed again by * some concurrent or preempting code, right after it dropped the base * lock. If there is the possibility of a concurrent rearm then the return * value of the function is meaningless. * * If such a guarantee is needed, e.g. for teardown situations then use * timer_shutdown_sync() instead. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ int timer_delete_sync(struct timer_list *timer) { return __timer_delete_sync(timer, false); } EXPORT_SYMBOL(timer_delete_sync); /** * timer_shutdown_sync - Shutdown a timer and prevent rearming * @timer: The timer to be shutdown * * When the function returns it is guaranteed that: * - @timer is not queued * - The callback function of @timer is not running * - @timer cannot be enqueued again. Any attempt to rearm * @timer is silently ignored. * * See timer_delete_sync() for synchronization rules. * * This function is useful for final teardown of an infrastructure where * the timer is subject to a circular dependency problem. * * A common pattern for this is a timer and a workqueue where the timer can * schedule work and work can arm the timer. On shutdown the workqueue must * be destroyed and the timer must be prevented from rearming. Unless the * code has conditionals like 'if (mything->in_shutdown)' to prevent that * there is no way to get this correct with timer_delete_sync(). * * timer_shutdown_sync() is solving the problem. The correct ordering of * calls in this case is: * * timer_shutdown_sync(&mything->timer); * workqueue_destroy(&mything->workqueue); * * After this 'mything' can be safely freed. * * This obviously implies that the timer is not required to be functional * for the rest of the shutdown operation. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending */ int timer_shutdown_sync(struct timer_list *timer) { return __timer_delete_sync(timer, true); } EXPORT_SYMBOL_GPL(timer_shutdown_sync); static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *), unsigned long baseclk) { int count = preempt_count(); #ifdef CONFIG_LOCKDEP /* * It is permissible to free the timer from inside the * function that is called from it, this we need to take into * account for lockdep too. To avoid bogus "held lock freed" * warnings as well as problems when looking into * timer->lockdep_map, make a copy and use that here. */ struct lockdep_map lockdep_map; lockdep_copy_map(&lockdep_map, &timer->lockdep_map); #endif /* * Couple the lock chain with the lock chain at * timer_delete_sync() by acquiring the lock_map around the fn() * call here and in timer_delete_sync(). */ lock_map_acquire(&lockdep_map); trace_timer_expire_entry(timer, baseclk); fn(timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); if (count != preempt_count()) { WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent * chance to survive and extract information. If the * callback kept a lock held, bad luck, but not worse * than the BUG() we had. */ preempt_count_set(count); } } static void expire_timers(struct timer_base *base, struct hlist_head *head) { /* * This value is required only for tracing. base->clk was * incremented directly before expire_timers was called. But expiry * is related to the old base->clk value. */ unsigned long baseclk = base->clk - 1; while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(struct timer_list *); timer = hlist_entry(head->first, struct timer_list, entry); base->running_timer = timer; detach_timer(timer, true); fn = timer->function; if (WARN_ON_ONCE(!fn)) { /* Should never happen. Emphasis on should! */ base->running_timer = NULL; continue; } if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); call_timer_fn(timer, fn, baseclk); raw_spin_lock(&base->lock); base->running_timer = NULL; } else { raw_spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, baseclk); raw_spin_lock_irq(&base->lock); base->running_timer = NULL; timer_sync_wait_running(base); } } } static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { unsigned long clk = base->clk = base->next_expiry; struct hlist_head *vec; int i, levels = 0; unsigned int idx; for (i = 0; i < LVL_DEPTH; i++) { idx = (clk & LVL_MASK) + i * LVL_SIZE; if (__test_and_clear_bit(idx, base->pending_map)) { vec = base->vectors + idx; hlist_move_list(vec, heads++); levels++; } /* Is it time to look at the next level? */ if (clk & LVL_CLK_MASK) break; /* Shift clock for the next level granularity */ clk >>= LVL_CLK_SHIFT; } return levels; } /* * Find the next pending bucket of a level. Search from level start (@offset) * + @clk upwards and if nothing there, search from start of the level * (@offset) up to @offset + clk. */ static int next_pending_bucket(struct timer_base *base, unsigned offset, unsigned clk) { unsigned pos, start = offset + clk; unsigned end = offset + LVL_SIZE; pos = find_next_bit(base->pending_map, end, start); if (pos < end) return pos - start; pos = find_next_bit(base->pending_map, start, offset); return pos < start ? pos + LVL_SIZE - start : -1; } /* * Search the first expiring timer in the various clock levels. Caller must * hold base->lock. * * Store next expiry time in base->next_expiry. */ static void timer_recalc_next_expiry(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; next = base->clk + NEXT_TIMER_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); unsigned long lvl_clk = clk & LVL_CLK_MASK; if (pos >= 0) { unsigned long tmp = clk + (unsigned long) pos; tmp <<= LVL_SHIFT(lvl); if (time_before(tmp, next)) next = tmp; /* * If the next expiration happens before we reach * the next level, no need to check further. */ if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) break; } /* * Clock for the next level. If the current level clock lower * bits are zero, we look at the next level as is. If not we * need to advance it by one because that's going to be the * next expiring bucket in that level. base->clk is the next * expiring jiffy. So in case of: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 0 * * we have to look at all levels @index 0. With * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 2 * * LVL0 has the next expiring bucket @index 2. The upper * levels have the next expiring bucket @index 1. * * In case that the propagation wraps the next level the same * rules apply: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 F 2 * * So after looking at LVL0 we get: * * LVL5 LVL4 LVL3 LVL2 LVL1 * 0 0 0 1 0 * * So no propagation from LVL1 to LVL2 because that happened * with the add already, but then we need to propagate further * from LVL2 to LVL3. * * So the simple check whether the lower bits of the current * level are 0 or not is sufficient for all cases. */ adj = lvl_clk ? 1 : 0; clk >>= LVL_CLK_SHIFT; clk += adj; } WRITE_ONCE(base->next_expiry, next); base->next_expiry_recalc = false; base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); } #ifdef CONFIG_NO_HZ_COMMON /* * Check, if the next hrtimer event is before the next timer wheel * event: */ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { u64 nextevt = hrtimer_get_next_event(); /* * If high resolution timers are enabled * hrtimer_get_next_event() returns KTIME_MAX. */ if (expires <= nextevt) return expires; /* * If the next timer is already expired, return the tick base * time so the tick is fired immediately. */ if (nextevt <= basem) return basem; /* * Round up to the next jiffy. High resolution timers are * off, so the hrtimers are expired in the tick and we need to * make sure that this tick really expires the timer to avoid * a ping pong of the nohz stop code. * * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 */ return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; } static unsigned long next_timer_interrupt(struct timer_base *base, unsigned long basej) { if (base->next_expiry_recalc) timer_recalc_next_expiry(base); /* * Move next_expiry for the empty base into the future to prevent an * unnecessary raise of the timer softirq when the next_expiry value * will be reached even if there is no timer pending. * * This update is also required to make timer_base::next_expiry values * easy comparable to find out which base holds the first pending timer. */ if (!base->timers_pending) WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA); return base->next_expiry; } static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem, struct timer_base *base_local, struct timer_base *base_global, struct timer_events *tevt) { unsigned long nextevt, nextevt_local, nextevt_global; bool local_first; nextevt_local = next_timer_interrupt(base_local, basej); nextevt_global = next_timer_interrupt(base_global, basej); local_first = time_before_eq(nextevt_local, nextevt_global); nextevt = local_first ? nextevt_local : nextevt_global; /* * If the @nextevt is at max. one tick away, use @nextevt and store * it in the local expiry value. The next global event is irrelevant in * this case and can be left as KTIME_MAX. */ if (time_before_eq(nextevt, basej + 1)) { /* If we missed a tick already, force 0 delta */ if (time_before(nextevt, basej)) nextevt = basej; tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC; /* * This is required for the remote check only but it doesn't * hurt, when it is done for both call sites: * * * The remote callers will only take care of the global timers * as local timers will be handled by CPU itself. When not * updating tevt->global with the already missed first global * timer, it is possible that it will be missed completely. * * * The local callers will ignore the tevt->global anyway, when * nextevt is max. one tick away. */ if (!local_first) tevt->global = tevt->local; return nextevt; } /* * Update tevt.* values: * * If the local queue expires first, then the global event can be * ignored. If the global queue is empty, nothing to do either. */ if (!local_first && base_global->timers_pending) tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC; if (base_local->timers_pending) tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC; return nextevt; } # ifdef CONFIG_SMP /** * fetch_next_timer_interrupt_remote() - Store next timers into @tevt * @basej: base time jiffies * @basem: base time clock monotonic * @tevt: Pointer to the storage for the expiry values * @cpu: Remote CPU * * Stores the next pending local and global timer expiry values in the * struct pointed to by @tevt. If a queue is empty the corresponding * field is set to KTIME_MAX. If local event expires before global * event, global event is set to KTIME_MAX as well. * * Caller needs to make sure timer base locks are held (use * timer_lock_remote_bases() for this purpose). */ void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem, struct timer_events *tevt, unsigned int cpu) { struct timer_base *base_local, *base_global; /* Preset local / global events */ tevt->local = tevt->global = KTIME_MAX; base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); lockdep_assert_held(&base_local->lock); lockdep_assert_held(&base_global->lock); fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt); } /** * timer_unlock_remote_bases - unlock timer bases of cpu * @cpu: Remote CPU * * Unlocks the remote timer bases. */ void timer_unlock_remote_bases(unsigned int cpu) __releases(timer_bases[BASE_LOCAL]->lock) __releases(timer_bases[BASE_GLOBAL]->lock) { struct timer_base *base_local, *base_global; base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); raw_spin_unlock(&base_global->lock); raw_spin_unlock(&base_local->lock); } /** * timer_lock_remote_bases - lock timer bases of cpu * @cpu: Remote CPU * * Locks the remote timer bases. */ void timer_lock_remote_bases(unsigned int cpu) __acquires(timer_bases[BASE_LOCAL]->lock) __acquires(timer_bases[BASE_GLOBAL]->lock) { struct timer_base *base_local, *base_global; base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); lockdep_assert_irqs_disabled(); raw_spin_lock(&base_local->lock); raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); } /** * timer_base_is_idle() - Return whether timer base is set idle * * Returns value of local timer base is_idle value. */ bool timer_base_is_idle(void) { return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle); } static void __run_timer_base(struct timer_base *base); /** * timer_expire_remote() - expire global timers of cpu * @cpu: Remote CPU * * Expire timers of global base of remote CPU. */ void timer_expire_remote(unsigned int cpu) { struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); __run_timer_base(base); } static void timer_use_tmigr(unsigned long basej, u64 basem, unsigned long *nextevt, bool *tick_stop_path, bool timer_base_idle, struct timer_events *tevt) { u64 next_tmigr; if (timer_base_idle) next_tmigr = tmigr_cpu_new_timer(tevt->global); else if (tick_stop_path) next_tmigr = tmigr_cpu_deactivate(tevt->global); else next_tmigr = tmigr_quick_check(tevt->global); /* * If the CPU is the last going idle in timer migration hierarchy, make * sure the CPU will wake up in time to handle remote timers. * next_tmigr == KTIME_MAX if other CPUs are still active. */ if (next_tmigr < tevt->local) { u64 tmp; /* If we missed a tick already, force 0 delta */ if (next_tmigr < basem) next_tmigr = basem; tmp = div_u64(next_tmigr - basem, TICK_NSEC); *nextevt = basej + (unsigned long)tmp; tevt->local = next_tmigr; } } # else static void timer_use_tmigr(unsigned long basej, u64 basem, unsigned long *nextevt, bool *tick_stop_path, bool timer_base_idle, struct timer_events *tevt) { /* * Make sure first event is written into tevt->local to not miss a * timer on !SMP systems. */ tevt->local = min_t(u64, tevt->local, tevt->global); } # endif /* CONFIG_SMP */ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, bool *idle) { struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX }; struct timer_base *base_local, *base_global; unsigned long nextevt; bool idle_is_possible; /* * When the CPU is offline, the tick is cancelled and nothing is supposed * to try to stop it. */ if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) { if (idle) *idle = true; return tevt.local; } base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]); base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]); raw_spin_lock(&base_local->lock); raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); nextevt = fetch_next_timer_interrupt(basej, basem, base_local, base_global, &tevt); /* * If the next event is only one jiffy ahead there is no need to call * timer migration hierarchy related functions. The value for the next * global timer in @tevt struct equals then KTIME_MAX. This is also * true, when the timer base is idle. * * The proper timer migration hierarchy function depends on the callsite * and whether timer base is idle or not. @nextevt will be updated when * this CPU needs to handle the first timer migration hierarchy * event. See timer_use_tmigr() for detailed information. */ idle_is_possible = time_after(nextevt, basej + 1); if (idle_is_possible) timer_use_tmigr(basej, basem, &nextevt, idle, base_local->is_idle, &tevt); /* * We have a fresh next event. Check whether we can forward the * base. */ __forward_timer_base(base_local, basej); __forward_timer_base(base_global, basej); /* * Set base->is_idle only when caller is timer_base_try_to_set_idle() */ if (idle) { /* * Bases are idle if the next event is more than a tick * away. Caution: @nextevt could have changed by enqueueing a * global timer into timer migration hierarchy. Therefore a new * check is required here. * * If the base is marked idle then any timer add operation must * forward the base clk itself to keep granularity small. This * idle logic is only maintained for the BASE_LOCAL and * BASE_GLOBAL base, deferrable timers may still see large * granularity skew (by design). */ if (!base_local->is_idle && time_after(nextevt, basej + 1)) { base_local->is_idle = true; /* * Global timers queued locally while running in a task * in nohz_full mode need a self-IPI to kick reprogramming * in IRQ tail. */ if (tick_nohz_full_cpu(base_local->cpu)) base_global->is_idle = true; trace_timer_base_idle(true, base_local->cpu); } *idle = base_local->is_idle; /* * When timer base is not set idle, undo the effect of * tmigr_cpu_deactivate() to prevent inconsistent states - active * timer base but inactive timer migration hierarchy. * * When timer base was already marked idle, nothing will be * changed here. */ if (!base_local->is_idle && idle_is_possible) tmigr_cpu_activate(); } raw_spin_unlock(&base_global->lock); raw_spin_unlock(&base_local->lock); return cmp_next_hrtimer_event(basem, tevt.local); } /** * get_next_timer_interrupt() - return the time (clock mono) of the next timer * @basej: base time jiffies * @basem: base time clock monotonic * * Returns the tick aligned clock monotonic time of the next pending timer or * KTIME_MAX if no timer is pending. If timer of global base was queued into * timer migration hierarchy, first global timer is not taken into account. If * it was the last CPU of timer migration hierarchy going idle, first global * event is taken into account. */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { return __get_next_timer_interrupt(basej, basem, NULL); } /** * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases * @basej: base time jiffies * @basem: base time clock monotonic * @idle: pointer to store the value of timer_base->is_idle on return; * *idle contains the information whether tick was already stopped * * Returns the tick aligned clock monotonic time of the next pending timer or * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is * returned as well. */ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle) { if (*idle) return KTIME_MAX; return __get_next_timer_interrupt(basej, basem, idle); } /** * timer_clear_idle - Clear the idle state of the timer base * * Called with interrupts disabled */ void timer_clear_idle(void) { /* * We do this unlocked. The worst outcome is a remote pinned timer * enqueue sending a pointless IPI, but taking the lock would just * make the window for sending the IPI a few instructions smaller * for the cost of taking the lock in the exit from idle * path. Required for BASE_LOCAL only. */ __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); if (tick_nohz_full_cpu(smp_processor_id())) __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false); trace_timer_base_idle(false, smp_processor_id()); /* Activate without holding the timer_base->lock */ tmigr_cpu_activate(); } #endif /** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. */ static inline void __run_timers(struct timer_base *base) { struct hlist_head heads[LVL_DEPTH]; int levels; lockdep_assert_held(&base->lock); if (base->running_timer) return; while (time_after_eq(jiffies, base->clk) && time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); /* * The two possible reasons for not finding any expired * timer at this clk are that all matching timers have been * dequeued or no timer has been queued since * base::next_expiry was set to base::clk + * NEXT_TIMER_MAX_DELTA. */ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); /* * While executing timers, base->clk is set 1 offset ahead of * jiffies to avoid endless requeuing to current jiffies. */ base->clk++; timer_recalc_next_expiry(base); while (levels--) expire_timers(base, heads + levels); } } static void __run_timer_base(struct timer_base *base) { /* Can race against a remote CPU updating next_expiry under the lock */ if (time_before(jiffies, READ_ONCE(base->next_expiry))) return; timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); __run_timers(base); raw_spin_unlock_irq(&base->lock); timer_base_unlock_expiry(base); } static void run_timer_base(int index) { struct timer_base *base = this_cpu_ptr(&timer_bases[index]); __run_timer_base(base); } /* * This function runs timers and the timer-tq in bottom half context. */ static __latent_entropy void run_timer_softirq(void) { run_timer_base(BASE_LOCAL); if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { run_timer_base(BASE_GLOBAL); run_timer_base(BASE_DEF); if (is_timers_nohz_active()) tmigr_handle_remote(); } } /* * Called by the local, per-CPU timer interrupt on SMP. */ static void run_local_timers(void) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]); hrtimer_run_queues(); for (int i = 0; i < NR_BASES; i++, base++) { /* * Raise the softirq only if required. * * timer_base::next_expiry can be written by a remote CPU while * holding the lock. If this write happens at the same time than * the lockless local read, sanity checker could complain about * data corruption. * * There are two possible situations where * timer_base::next_expiry is written by a remote CPU: * * 1. Remote CPU expires global timers of this CPU and updates * timer_base::next_expiry of BASE_GLOBAL afterwards in * next_timer_interrupt() or timer_recalc_next_expiry(). The * worst outcome is a superfluous raise of the timer softirq * when the not yet updated value is read. * * 2. A new first pinned timer is enqueued by a remote CPU * and therefore timer_base::next_expiry of BASE_LOCAL is * updated. When this update is missed, this isn't a * problem, as an IPI is executed nevertheless when the CPU * was idle before. When the CPU wasn't idle but the update * is missed, then the timer would expire one jiffy late - * bad luck. * * Those unlikely corner cases where the worst outcome is only a * one jiffy delay or a superfluous raise of the softirq are * not that expensive as doing the check always while holding * the lock. * * Possible remote writers are using WRITE_ONCE(). Local reader * uses therefore READ_ONCE(). */ if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) || (i == BASE_DEF && tmigr_requires_handle_remote())) { raise_timer_softirq(TIMER_SOFTIRQ); return; } } } /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. */ void update_process_times(int user_tick) { struct task_struct *p = current; /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_tick(); #endif sched_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); } #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { struct timer_list *timer; int cpu = new_base->cpu; while (!hlist_empty(head)) { timer = hlist_entry(head->first, struct timer_list, entry); detach_timer(timer, false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } int timers_prepare_cpu(unsigned int cpu) { struct timer_base *base; int b; for (b = 0; b < NR_BASES; b++) { base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; base->next_expiry_recalc = false; base->timers_pending = false; base->is_idle = false; } return 0; } int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; struct timer_base *new_base; int b, i; for (b = 0; b < NR_BASES; b++) { old_base = per_cpu_ptr(&timer_bases[b], cpu); new_base = get_cpu_ptr(&timer_bases[b]); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock_irq(&new_base->lock); raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); /* * The current CPUs base clock might be stale. Update it * before moving the timers over. */ forward_timer_base(new_base); WARN_ON_ONCE(old_base->running_timer); old_base->running_timer = NULL; for (i = 0; i < WHEEL_SIZE; i++) migrate_timer_list(new_base, old_base->vectors + i); raw_spin_unlock(&old_base->lock); raw_spin_unlock_irq(&new_base->lock); put_cpu_ptr(&timer_bases); } return 0; } #endif /* CONFIG_HOTPLUG_CPU */ static void __init init_timer_cpu(int cpu) { struct timer_base *base; int i; for (i = 0; i < NR_BASES; i++) { base = per_cpu_ptr(&timer_bases[i], cpu); base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; timer_base_init_expiry_lock(base); } } static void __init init_timer_cpus(void) { int cpu; for_each_possible_cpu(cpu) init_timer_cpu(cpu); } void __init init_timers(void) { init_timer_cpus(); posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } |
| 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2011, Intel Corporation. * * Description: Data Center Bridging netlink interface * Author: Lucy Liu <lucy.liu@intel.com> */ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/slab.h> #include <net/netlink.h> #include <net/rtnetlink.h> #include <linux/dcbnl.h> #include <net/dcbevent.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <net/sock.h> /* Data Center Bridging (DCB) is a collection of Ethernet enhancements * intended to allow network traffic with differing requirements * (highly reliable, no drops vs. best effort vs. low latency) to operate * and co-exist on Ethernet. Current DCB features are: * * Enhanced Transmission Selection (aka Priority Grouping [PG]) - provides a * framework for assigning bandwidth guarantees to traffic classes. * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. * * More information about the emerging standards for these Ethernet features * can be found at: http://www.ieee802.org/1/pages/dcbridges.html * * This file implements an rtnetlink interface to allow configuration of DCB * features for capable devices. */ /**************** DCB attribute policies *************************************/ /* DCB netlink attributes policy */ static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = { [DCB_ATTR_IFNAME] = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1}, [DCB_ATTR_STATE] = {.type = NLA_U8}, [DCB_ATTR_PFC_CFG] = {.type = NLA_NESTED}, [DCB_ATTR_PG_CFG] = {.type = NLA_NESTED}, [DCB_ATTR_SET_ALL] = {.type = NLA_U8}, [DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG}, [DCB_ATTR_CAP] = {.type = NLA_NESTED}, [DCB_ATTR_PFC_STATE] = {.type = NLA_U8}, [DCB_ATTR_BCN] = {.type = NLA_NESTED}, [DCB_ATTR_APP] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE] = {.type = NLA_NESTED}, [DCB_ATTR_DCBX] = {.type = NLA_U8}, [DCB_ATTR_FEATCFG] = {.type = NLA_NESTED}, }; /* DCB priority flow control to User Priority nested attributes */ static const struct nla_policy dcbnl_pfc_up_nest[DCB_PFC_UP_ATTR_MAX + 1] = { [DCB_PFC_UP_ATTR_0] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_1] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_2] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_3] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_4] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_5] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_6] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_7] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_ALL] = {.type = NLA_FLAG}, }; /* DCB priority grouping nested attributes */ static const struct nla_policy dcbnl_pg_nest[DCB_PG_ATTR_MAX + 1] = { [DCB_PG_ATTR_TC_0] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_1] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_2] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_3] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_4] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_5] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_6] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_7] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_ALL] = {.type = NLA_NESTED}, [DCB_PG_ATTR_BW_ID_0] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_1] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_2] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_3] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_4] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_5] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_6] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_7] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_ALL] = {.type = NLA_FLAG}, }; /* DCB traffic class nested attributes. */ static const struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = { [DCB_TC_ATTR_PARAM_PGID] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_UP_MAPPING] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_STRICT_PRIO] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_BW_PCT] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_ALL] = {.type = NLA_FLAG}, }; /* DCB capabilities nested attributes. */ static const struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = { [DCB_CAP_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_CAP_ATTR_PG] = {.type = NLA_U8}, [DCB_CAP_ATTR_PFC] = {.type = NLA_U8}, [DCB_CAP_ATTR_UP2TC] = {.type = NLA_U8}, [DCB_CAP_ATTR_PG_TCS] = {.type = NLA_U8}, [DCB_CAP_ATTR_PFC_TCS] = {.type = NLA_U8}, [DCB_CAP_ATTR_GSP] = {.type = NLA_U8}, [DCB_CAP_ATTR_BCN] = {.type = NLA_U8}, [DCB_CAP_ATTR_DCBX] = {.type = NLA_U8}, }; /* DCB capabilities nested attributes. */ static const struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = { [DCB_NUMTCS_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_NUMTCS_ATTR_PG] = {.type = NLA_U8}, [DCB_NUMTCS_ATTR_PFC] = {.type = NLA_U8}, }; /* DCB BCN nested attributes. */ static const struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = { [DCB_BCN_ATTR_RP_0] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_1] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_2] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_3] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_4] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_5] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_6] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_7] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_ALL] = {.type = NLA_FLAG}, [DCB_BCN_ATTR_BCNA_0] = {.type = NLA_U32}, [DCB_BCN_ATTR_BCNA_1] = {.type = NLA_U32}, [DCB_BCN_ATTR_ALPHA] = {.type = NLA_U32}, [DCB_BCN_ATTR_BETA] = {.type = NLA_U32}, [DCB_BCN_ATTR_GD] = {.type = NLA_U32}, [DCB_BCN_ATTR_GI] = {.type = NLA_U32}, [DCB_BCN_ATTR_TMAX] = {.type = NLA_U32}, [DCB_BCN_ATTR_TD] = {.type = NLA_U32}, [DCB_BCN_ATTR_RMIN] = {.type = NLA_U32}, [DCB_BCN_ATTR_W] = {.type = NLA_U32}, [DCB_BCN_ATTR_RD] = {.type = NLA_U32}, [DCB_BCN_ATTR_RU] = {.type = NLA_U32}, [DCB_BCN_ATTR_WRTT] = {.type = NLA_U32}, [DCB_BCN_ATTR_RI] = {.type = NLA_U32}, [DCB_BCN_ATTR_C] = {.type = NLA_U32}, [DCB_BCN_ATTR_ALL] = {.type = NLA_FLAG}, }; /* DCB APP nested attributes. */ static const struct nla_policy dcbnl_app_nest[DCB_APP_ATTR_MAX + 1] = { [DCB_APP_ATTR_IDTYPE] = {.type = NLA_U8}, [DCB_APP_ATTR_ID] = {.type = NLA_U16}, [DCB_APP_ATTR_PRIORITY] = {.type = NLA_U8}, }; /* IEEE 802.1Qaz nested attributes. */ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_ETS] = {.len = sizeof(struct ieee_ets)}, [DCB_ATTR_IEEE_PFC] = {.len = sizeof(struct ieee_pfc)}, [DCB_ATTR_IEEE_APP_TABLE] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)}, [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)}, [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED}, }; /* DCB number of traffic classes nested attributes. */ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { [DCB_FEATCFG_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_FEATCFG_ATTR_PG] = {.type = NLA_U8}, [DCB_FEATCFG_ATTR_PFC] = {.type = NLA_U8}, [DCB_FEATCFG_ATTR_APP] = {.type = NLA_U8}, }; static LIST_HEAD(dcb_app_list); static LIST_HEAD(dcb_rewr_list); static DEFINE_SPINLOCK(dcb_lock); static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector) { switch (selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: case IEEE_8021QAZ_APP_SEL_STREAM: case IEEE_8021QAZ_APP_SEL_DGRAM: case IEEE_8021QAZ_APP_SEL_ANY: case IEEE_8021QAZ_APP_SEL_DSCP: return DCB_ATTR_IEEE_APP; case DCB_APP_SEL_PCP: return DCB_ATTR_DCB_APP; default: return DCB_ATTR_IEEE_APP_UNSPEC; } } static bool dcbnl_app_attr_type_validate(enum ieee_attrs_app type) { switch (type) { case DCB_ATTR_IEEE_APP: case DCB_ATTR_DCB_APP: return true; default: return false; } } static bool dcbnl_app_selector_validate(enum ieee_attrs_app type, u8 selector) { return dcbnl_app_attr_type_get(selector) == type; } static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq, u32 flags, struct nlmsghdr **nlhp) { struct sk_buff *skb; struct dcbmsg *dcb; struct nlmsghdr *nlh; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return NULL; nlh = nlmsg_put(skb, port, seq, type, sizeof(*dcb), flags); BUG_ON(!nlh); dcb = nlmsg_data(nlh); dcb->dcb_family = AF_UNSPEC; dcb->cmd = cmd; dcb->dcb_pad = 0; if (nlhp) *nlhp = nlh; return skb; } static int dcbnl_getstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { /* if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->getstate) */ if (!netdev->dcbnl_ops->getstate) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_STATE, netdev->dcbnl_ops->getstate(netdev)); } static int dcbnl_getpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_PFC_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->getpfccfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_PFC_CFG); if (!nest) return -EMSGSIZE; if (data[DCB_PFC_UP_ATTR_ALL]) getall = 1; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { if (!getall && !data[i]) continue; netdev->dcbnl_ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value); ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } nla_nest_end(skb, nest); return 0; } static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 perm_addr[MAX_ADDR_LEN]; if (!netdev->dcbnl_ops->getpermhwaddr) return -EOPNOTSUPP; memset(perm_addr, 0, sizeof(perm_addr)); netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr); return nla_put(skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr), perm_addr); } static int dcbnl_getcap(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_CAP_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_CAP]) return -EINVAL; if (!netdev->dcbnl_ops->getcap) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP], dcbnl_cap_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_CAP); if (!nest) return -EMSGSIZE; if (data[DCB_CAP_ATTR_ALL]) getall = 1; for (i = DCB_CAP_ATTR_ALL+1; i <= DCB_CAP_ATTR_MAX; i++) { if (!getall && !data[i]) continue; if (!netdev->dcbnl_ops->getcap(netdev, i, &value)) { ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } } nla_nest_end(skb, nest); return 0; } static int dcbnl_getnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_NUMTCS]) return -EINVAL; if (!netdev->dcbnl_ops->getnumtcs) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], dcbnl_numtcs_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_NUMTCS); if (!nest) return -EMSGSIZE; if (data[DCB_NUMTCS_ATTR_ALL]) getall = 1; for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { if (!getall && !data[i]) continue; ret = netdev->dcbnl_ops->getnumtcs(netdev, i, &value); if (!ret) { ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } else return -EINVAL; } nla_nest_end(skb, nest); return 0; } static int dcbnl_setnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1]; int ret; u8 value; int i; if (!tb[DCB_ATTR_NUMTCS]) return -EINVAL; if (!netdev->dcbnl_ops->setnumtcs) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], dcbnl_numtcs_nest, NULL); if (ret) return ret; for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); ret = netdev->dcbnl_ops->setnumtcs(netdev, i, value); if (ret) break; } return nla_put_u8(skb, DCB_ATTR_NUMTCS, !!ret); } static int dcbnl_getpfcstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { if (!netdev->dcbnl_ops->getpfcstate) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_PFC_STATE, netdev->dcbnl_ops->getpfcstate(netdev)); } static int dcbnl_setpfcstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!tb[DCB_ATTR_PFC_STATE]) return -EINVAL; if (!netdev->dcbnl_ops->setpfcstate) return -EOPNOTSUPP; value = nla_get_u8(tb[DCB_ATTR_PFC_STATE]); netdev->dcbnl_ops->setpfcstate(netdev, value); return nla_put_u8(skb, DCB_ATTR_PFC_STATE, 0); } static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *app_nest; struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; u16 id; u8 up, idtype; int ret; if (!tb[DCB_ATTR_APP]) return -EINVAL; ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], dcbnl_app_nest, NULL); if (ret) return ret; /* all must be non-null */ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || (!app_tb[DCB_APP_ATTR_ID])) return -EINVAL; /* either by eth type or by socket number */ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && (idtype != DCB_APP_IDTYPE_PORTNUM)) return -EINVAL; id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); if (netdev->dcbnl_ops->getapp) { ret = netdev->dcbnl_ops->getapp(netdev, idtype, id); if (ret < 0) return ret; else up = ret; } else { struct dcb_app app = { .selector = idtype, .protocol = id, }; up = dcb_getapp(netdev, &app); } app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); if (!app_nest) return -EMSGSIZE; ret = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, idtype); if (ret) goto out_cancel; ret = nla_put_u16(skb, DCB_APP_ATTR_ID, id); if (ret) goto out_cancel; ret = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, up); if (ret) goto out_cancel; nla_nest_end(skb, app_nest); return 0; out_cancel: nla_nest_cancel(skb, app_nest); return ret; } static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { int ret; u16 id; u8 up, idtype; struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; if (!tb[DCB_ATTR_APP]) return -EINVAL; ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], dcbnl_app_nest, NULL); if (ret) return ret; /* all must be non-null */ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || (!app_tb[DCB_APP_ATTR_ID]) || (!app_tb[DCB_APP_ATTR_PRIORITY])) return -EINVAL; /* either by eth type or by socket number */ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && (idtype != DCB_APP_IDTYPE_PORTNUM)) return -EINVAL; id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); up = nla_get_u8(app_tb[DCB_APP_ATTR_PRIORITY]); if (netdev->dcbnl_ops->setapp) { ret = netdev->dcbnl_ops->setapp(netdev, idtype, id, up); if (ret < 0) return ret; } else { struct dcb_app app; app.selector = idtype; app.protocol = id; app.priority = up; ret = dcb_setapp(netdev, &app); } ret = nla_put_u8(skb, DCB_ATTR_APP, ret); dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SAPP, seq, 0); return ret; } static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, struct nlattr **tb, struct sk_buff *skb, int dir) { struct nlattr *pg_nest, *param_nest, *data; struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; u8 prio, pgid, tc_pct, up_map; int ret; int getall = 0; int i; if (!tb[DCB_ATTR_PG_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->getpgtccfgtx || !netdev->dcbnl_ops->getpgtccfgrx || !netdev->dcbnl_ops->getpgbwgcfgtx || !netdev->dcbnl_ops->getpgbwgcfgrx) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, NULL); if (ret) return ret; pg_nest = nla_nest_start_noflag(skb, DCB_ATTR_PG_CFG); if (!pg_nest) return -EMSGSIZE; if (pg_tb[DCB_PG_ATTR_TC_ALL]) getall = 1; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { if (!getall && !pg_tb[i]) continue; if (pg_tb[DCB_PG_ATTR_TC_ALL]) data = pg_tb[DCB_PG_ATTR_TC_ALL]; else data = pg_tb[i]; ret = nla_parse_nested_deprecated(param_tb, DCB_TC_ATTR_PARAM_MAX, data, dcbnl_tc_param_nest, NULL); if (ret) goto err_pg; param_nest = nla_nest_start_noflag(skb, i); if (!param_nest) goto err_pg; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (dir) { /* Rx */ netdev->dcbnl_ops->getpgtccfgrx(netdev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); } else { /* Tx */ netdev->dcbnl_ops->getpgtccfgtx(netdev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); } if (param_tb[DCB_TC_ATTR_PARAM_PGID] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct); if (ret) goto err_param; } nla_nest_end(skb, param_nest); } if (pg_tb[DCB_PG_ATTR_BW_ID_ALL]) getall = 1; else getall = 0; for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { if (!getall && !pg_tb[i]) continue; tc_pct = DCB_ATTR_VALUE_UNDEFINED; if (dir) { /* Rx */ netdev->dcbnl_ops->getpgbwgcfgrx(netdev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); } else { /* Tx */ netdev->dcbnl_ops->getpgbwgcfgtx(netdev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); } ret = nla_put_u8(skb, i, tc_pct); if (ret) goto err_pg; } nla_nest_end(skb, pg_nest); return 0; err_param: nla_nest_cancel(skb, param_nest); err_pg: nla_nest_cancel(skb, pg_nest); return -EMSGSIZE; } static int dcbnl_pgtx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 0); } static int dcbnl_pgrx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 1); } static int dcbnl_setstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!tb[DCB_ATTR_STATE]) return -EINVAL; if (!netdev->dcbnl_ops->setstate) return -EOPNOTSUPP; value = nla_get_u8(tb[DCB_ATTR_STATE]); return nla_put_u8(skb, DCB_ATTR_STATE, netdev->dcbnl_ops->setstate(netdev, value)); } static int dcbnl_setpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1]; int i; int ret; u8 value; if (!tb[DCB_ATTR_PFC_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->setpfccfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); netdev->dcbnl_ops->setpfccfg(netdev, data[i]->nla_type - DCB_PFC_UP_ATTR_0, value); } return nla_put_u8(skb, DCB_ATTR_PFC_CFG, 0); } static int dcbnl_setall(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { int ret; if (!tb[DCB_ATTR_SET_ALL]) return -EINVAL; if (!netdev->dcbnl_ops->setall) return -EOPNOTSUPP; ret = nla_put_u8(skb, DCB_ATTR_SET_ALL, netdev->dcbnl_ops->setall(netdev)); dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SET_ALL, seq, 0); return ret; } static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb, int dir) { struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; int ret; int i; u8 pgid; u8 up_map; u8 prio; u8 tc_pct; if (!tb[DCB_ATTR_PG_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->setpgtccfgtx || !netdev->dcbnl_ops->setpgtccfgrx || !netdev->dcbnl_ops->setpgbwgcfgtx || !netdev->dcbnl_ops->setpgbwgcfgrx) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, NULL); if (ret) return ret; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { if (!pg_tb[i]) continue; ret = nla_parse_nested_deprecated(param_tb, DCB_TC_ATTR_PARAM_MAX, pg_tb[i], dcbnl_tc_param_nest, NULL); if (ret) return ret; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]) prio = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]); if (param_tb[DCB_TC_ATTR_PARAM_PGID]) pgid = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_PGID]); if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT]) tc_pct = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_BW_PCT]); if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]) up_map = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]); /* dir: Tx = 0, Rx = 1 */ if (dir) { /* Rx */ netdev->dcbnl_ops->setpgtccfgrx(netdev, i - DCB_PG_ATTR_TC_0, prio, pgid, tc_pct, up_map); } else { /* Tx */ netdev->dcbnl_ops->setpgtccfgtx(netdev, i - DCB_PG_ATTR_TC_0, prio, pgid, tc_pct, up_map); } } for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { if (!pg_tb[i]) continue; tc_pct = nla_get_u8(pg_tb[i]); /* dir: Tx = 0, Rx = 1 */ if (dir) { /* Rx */ netdev->dcbnl_ops->setpgbwgcfgrx(netdev, i - DCB_PG_ATTR_BW_ID_0, tc_pct); } else { /* Tx */ netdev->dcbnl_ops->setpgbwgcfgtx(netdev, i - DCB_PG_ATTR_BW_ID_0, tc_pct); } } return nla_put_u8(skb, DCB_ATTR_PG_CFG, 0); } static int dcbnl_pgtx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 0); } static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 1); } static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *bcn_nest; struct nlattr *bcn_tb[DCB_BCN_ATTR_MAX + 1]; u8 value_byte; u32 value_integer; int ret; bool getall = false; int i; if (!tb[DCB_ATTR_BCN]) return -EINVAL; if (!netdev->dcbnl_ops->getbcnrp || !netdev->dcbnl_ops->getbcncfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(bcn_tb, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; bcn_nest = nla_nest_start_noflag(skb, DCB_ATTR_BCN); if (!bcn_nest) return -EMSGSIZE; if (bcn_tb[DCB_BCN_ATTR_ALL]) getall = true; for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { if (!getall && !bcn_tb[i]) continue; netdev->dcbnl_ops->getbcnrp(netdev, i - DCB_BCN_ATTR_RP_0, &value_byte); ret = nla_put_u8(skb, i, value_byte); if (ret) goto err_bcn; } for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { if (!getall && !bcn_tb[i]) continue; netdev->dcbnl_ops->getbcncfg(netdev, i, &value_integer); ret = nla_put_u32(skb, i, value_integer); if (ret) goto err_bcn; } nla_nest_end(skb, bcn_nest); return 0; err_bcn: nla_nest_cancel(skb, bcn_nest); return ret; } static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_BCN_ATTR_MAX + 1]; int i; int ret; u8 value_byte; u32 value_int; if (!tb[DCB_ATTR_BCN]) return -EINVAL; if (!netdev->dcbnl_ops->setbcncfg || !netdev->dcbnl_ops->setbcnrp) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { if (data[i] == NULL) continue; value_byte = nla_get_u8(data[i]); netdev->dcbnl_ops->setbcnrp(netdev, data[i]->nla_type - DCB_BCN_ATTR_RP_0, value_byte); } for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { if (data[i] == NULL) continue; value_int = nla_get_u32(data[i]); netdev->dcbnl_ops->setbcncfg(netdev, i, value_int); } return nla_put_u8(skb, DCB_ATTR_BCN, 0); } static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb, int app_nested_type, int app_info_type, int app_entry_type) { struct dcb_peer_app_info info; struct dcb_app *table = NULL; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; u16 app_count; int err; /** * retrieve the peer app configuration form the driver. If the driver * handlers fail exit without doing anything */ err = ops->peer_getappinfo(netdev, &info, &app_count); if (!err && app_count) { table = kmalloc_array(app_count, sizeof(struct dcb_app), GFP_KERNEL); if (!table) return -ENOMEM; err = ops->peer_getapptable(netdev, table); } if (!err) { u16 i; struct nlattr *app; /** * build the message, from here on the only possible failure * is due to the skb size */ err = -EMSGSIZE; app = nla_nest_start_noflag(skb, app_nested_type); if (!app) goto nla_put_failure; if (app_info_type && nla_put(skb, app_info_type, sizeof(info), &info)) goto nla_put_failure; for (i = 0; i < app_count; i++) { if (nla_put(skb, app_entry_type, sizeof(struct dcb_app), &table[i])) goto nla_put_failure; } nla_nest_end(skb, app); } err = 0; nla_put_failure: kfree(table); return err; } static int dcbnl_getapptrust(struct net_device *netdev, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; enum ieee_attrs_app type; struct nlattr *apptrust; int nselectors, err, i; u8 *selectors; selectors = kzalloc(IEEE_8021QAZ_APP_SEL_MAX + 1, GFP_KERNEL); if (!selectors) return -ENOMEM; err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors); if (err) { err = 0; goto out; } apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE); if (!apptrust) { err = -EMSGSIZE; goto out; } for (i = 0; i < nselectors; i++) { type = dcbnl_app_attr_type_get(selectors[i]); err = nla_put_u8(skb, type, selectors[i]); if (err) { nla_nest_cancel(skb, apptrust); goto out; } } nla_nest_end(skb, apptrust); out: kfree(selectors); return err; } /* Set or delete APP table or rewrite table entries. The APP struct is validated * and the appropriate callback function is called. */ static int dcbnl_app_table_setdel(struct nlattr *attr, struct net_device *netdev, int (*setdel)(struct net_device *dev, struct dcb_app *app)) { struct dcb_app *app_data; enum ieee_attrs_app type; struct nlattr *attr_itr; int rem, err; nla_for_each_nested(attr_itr, attr, rem) { type = nla_type(attr_itr); if (!dcbnl_app_attr_type_validate(type)) continue; if (nla_len(attr_itr) < sizeof(struct dcb_app)) return -ERANGE; app_data = nla_data(attr_itr); if (!dcbnl_app_selector_validate(type, app_data->selector)) return -EINVAL; err = setdel(netdev, app_data); if (err) return err; } return 0; } /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee, *app, *rewr; struct dcb_app_type *itr; int dcbx; int err; if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) return -EMSGSIZE; ieee = nla_nest_start_noflag(skb, DCB_ATTR_IEEE); if (!ieee) return -EMSGSIZE; if (ops->ieee_getets) { struct ieee_ets ets; memset(&ets, 0, sizeof(ets)); err = ops->ieee_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets)) return -EMSGSIZE; } if (ops->ieee_getmaxrate) { struct ieee_maxrate maxrate; memset(&maxrate, 0, sizeof(maxrate)); err = ops->ieee_getmaxrate(netdev, &maxrate); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_MAXRATE, sizeof(maxrate), &maxrate); if (err) return -EMSGSIZE; } } if (ops->ieee_getqcn) { struct ieee_qcn qcn; memset(&qcn, 0, sizeof(qcn)); err = ops->ieee_getqcn(netdev, &qcn); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_QCN, sizeof(qcn), &qcn); if (err) return -EMSGSIZE; } } if (ops->ieee_getqcnstats) { struct ieee_qcn_stats qcn_stats; memset(&qcn_stats, 0, sizeof(qcn_stats)); err = ops->ieee_getqcnstats(netdev, &qcn_stats); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_QCN_STATS, sizeof(qcn_stats), &qcn_stats); if (err) return -EMSGSIZE; } } if (ops->ieee_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc)) return -EMSGSIZE; } if (ops->dcbnl_getbuffer) { struct dcbnl_buffer buffer; memset(&buffer, 0, sizeof(buffer)); err = ops->dcbnl_getbuffer(netdev, &buffer); if (!err && nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer)) return -EMSGSIZE; } app = nla_nest_start_noflag(skb, DCB_ATTR_IEEE_APP_TABLE); if (!app) return -EMSGSIZE; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { enum ieee_attrs_app type = dcbnl_app_attr_type_get(itr->app.selector); err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); return -EMSGSIZE; } } } if (netdev->dcbnl_ops->getdcbx) dcbx = netdev->dcbnl_ops->getdcbx(netdev); else dcbx = -EOPNOTSUPP; spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); rewr = nla_nest_start(skb, DCB_ATTR_DCB_REWR_TABLE); if (!rewr) return -EMSGSIZE; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == netdev->ifindex) { enum ieee_attrs_app type = dcbnl_app_attr_type_get(itr->app.selector); err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); nla_nest_cancel(skb, rewr); return -EMSGSIZE; } } } spin_unlock_bh(&dcb_lock); nla_nest_end(skb, rewr); if (ops->dcbnl_getapptrust) { err = dcbnl_getapptrust(netdev, skb); if (err) return err; } /* get peer info if available */ if (ops->ieee_peer_getets) { struct ieee_ets ets; memset(&ets, 0, sizeof(ets)); err = ops->ieee_peer_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets)) return -EMSGSIZE; } if (ops->ieee_peer_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc)) return -EMSGSIZE; } if (ops->peer_getappinfo && ops->peer_getapptable) { err = dcbnl_build_peer_app(netdev, skb, DCB_ATTR_IEEE_PEER_APP, DCB_ATTR_IEEE_APP_UNSPEC, DCB_ATTR_IEEE_APP); if (err) return -EMSGSIZE; } nla_nest_end(skb, ieee); if (dcbx >= 0) { err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx); if (err) return -EMSGSIZE; } return 0; } static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev, int dir) { u8 pgid, up_map, prio, tc_pct; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int i = dir ? DCB_ATTR_CEE_TX_PG : DCB_ATTR_CEE_RX_PG; struct nlattr *pg = nla_nest_start_noflag(skb, i); if (!pg) return -EMSGSIZE; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { struct nlattr *tc_nest = nla_nest_start_noflag(skb, i); if (!tc_nest) return -EMSGSIZE; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (!dir) ops->getpgtccfgrx(dev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); else ops->getpgtccfgtx(dev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); if (nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct)) return -EMSGSIZE; nla_nest_end(skb, tc_nest); } for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { tc_pct = DCB_ATTR_VALUE_UNDEFINED; if (!dir) ops->getpgbwgcfgrx(dev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); else ops->getpgbwgcfgtx(dev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); if (nla_put_u8(skb, i, tc_pct)) return -EMSGSIZE; } nla_nest_end(skb, pg); return 0; } static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) { struct nlattr *cee, *app; struct dcb_app_type *itr; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; int dcbx, i, err = -EMSGSIZE; u8 value; if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) goto nla_put_failure; cee = nla_nest_start_noflag(skb, DCB_ATTR_CEE); if (!cee) goto nla_put_failure; /* local pg */ if (ops->getpgtccfgtx && ops->getpgbwgcfgtx) { err = dcbnl_cee_pg_fill(skb, netdev, 1); if (err) goto nla_put_failure; } if (ops->getpgtccfgrx && ops->getpgbwgcfgrx) { err = dcbnl_cee_pg_fill(skb, netdev, 0); if (err) goto nla_put_failure; } /* local pfc */ if (ops->getpfccfg) { struct nlattr *pfc_nest = nla_nest_start_noflag(skb, DCB_ATTR_CEE_PFC); if (!pfc_nest) goto nla_put_failure; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value); if (nla_put_u8(skb, i, value)) goto nla_put_failure; } nla_nest_end(skb, pfc_nest); } /* local app */ spin_lock_bh(&dcb_lock); app = nla_nest_start_noflag(skb, DCB_ATTR_CEE_APP_TABLE); if (!app) goto dcb_unlock; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { struct nlattr *app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); if (!app_nest) goto dcb_unlock; err = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, itr->app.selector); if (err) goto dcb_unlock; err = nla_put_u16(skb, DCB_APP_ATTR_ID, itr->app.protocol); if (err) goto dcb_unlock; err = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, itr->app.priority); if (err) goto dcb_unlock; nla_nest_end(skb, app_nest); } } nla_nest_end(skb, app); if (netdev->dcbnl_ops->getdcbx) dcbx = netdev->dcbnl_ops->getdcbx(netdev); else dcbx = -EOPNOTSUPP; spin_unlock_bh(&dcb_lock); /* features flags */ if (ops->getfeatcfg) { struct nlattr *feat = nla_nest_start_noflag(skb, DCB_ATTR_CEE_FEAT); if (!feat) goto nla_put_failure; for (i = DCB_FEATCFG_ATTR_ALL + 1; i <= DCB_FEATCFG_ATTR_MAX; i++) if (!ops->getfeatcfg(netdev, i, &value) && nla_put_u8(skb, i, value)) goto nla_put_failure; nla_nest_end(skb, feat); } /* peer info if available */ if (ops->cee_peer_getpg) { struct cee_pg pg; memset(&pg, 0, sizeof(pg)); err = ops->cee_peer_getpg(netdev, &pg); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg)) goto nla_put_failure; } if (ops->cee_peer_getpfc) { struct cee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->cee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc)) goto nla_put_failure; } if (ops->peer_getappinfo && ops->peer_getapptable) { err = dcbnl_build_peer_app(netdev, skb, DCB_ATTR_CEE_PEER_APP_TABLE, DCB_ATTR_CEE_PEER_APP_INFO, DCB_ATTR_CEE_PEER_APP); if (err) goto nla_put_failure; } nla_nest_end(skb, cee); /* DCBX state */ if (dcbx >= 0) { err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx); if (err) goto nla_put_failure; } return 0; dcb_unlock: spin_unlock_bh(&dcb_lock); nla_put_failure: err = -EMSGSIZE; return err; } static int dcbnl_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid, int dcbx_ver) { struct net *net = dev_net(dev); struct sk_buff *skb; struct nlmsghdr *nlh; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int err; if (!ops) return -EOPNOTSUPP; skb = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh); if (!skb) return -ENOMEM; if (dcbx_ver == DCB_CAP_DCBX_VER_IEEE) err = dcbnl_ieee_fill(skb, dev); else err = dcbnl_cee_fill(skb, dev); if (err < 0) { /* Report error to broadcast listeners */ nlmsg_free(skb); rtnl_set_sk_err(net, RTNLGRP_DCB, err); } else { /* End nlmsg and notify broadcast listeners */ nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_DCB, NULL, GFP_KERNEL); } return err; } int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid) { return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_IEEE); } EXPORT_SYMBOL(dcbnl_ieee_notify); int dcbnl_cee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid) { return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_CEE); } EXPORT_SYMBOL(dcbnl_cee_notify); /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb SET commands. * If any requested operation can not be completed * the entire msg is aborted and error value is returned. * No attempt is made to reconcile the case where only part of the * cmd can be completed. */ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; int prio; int err; if (!ops) return -EOPNOTSUPP; if (!tb[DCB_ATTR_IEEE]) return -EINVAL; err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], dcbnl_ieee_policy, NULL); if (err) return err; if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) { struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]); err = ops->ieee_setets(netdev, ets); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_MAXRATE] && ops->ieee_setmaxrate) { struct ieee_maxrate *maxrate = nla_data(ieee[DCB_ATTR_IEEE_MAXRATE]); err = ops->ieee_setmaxrate(netdev, maxrate); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_QCN] && ops->ieee_setqcn) { struct ieee_qcn *qcn = nla_data(ieee[DCB_ATTR_IEEE_QCN]); err = ops->ieee_setqcn(netdev, qcn); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) { struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]); err = ops->ieee_setpfc(netdev, pfc); if (err) goto err; } if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) { struct dcbnl_buffer *buffer = nla_data(ieee[DCB_ATTR_DCB_BUFFER]); for (prio = 0; prio < ARRAY_SIZE(buffer->prio2buffer); prio++) { if (buffer->prio2buffer[prio] >= DCBX_MAX_BUFFERS) { err = -EINVAL; goto err; } } err = ops->dcbnl_setbuffer(netdev, buffer); if (err) goto err; } if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], netdev, ops->dcbnl_setrewr ?: dcb_setrewr); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], netdev, ops->ieee_setapp ?: dcb_ieee_setapp); if (err) goto err; } if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) { u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; struct nlattr *attr; int nselectors = 0; int rem; if (!ops->dcbnl_setapptrust) { err = -EOPNOTSUPP; goto err; } nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE], rem) { enum ieee_attrs_app type = nla_type(attr); u8 selector; int i; if (!dcbnl_app_attr_type_validate(type) || nla_len(attr) != 1 || nselectors >= sizeof(selectors)) { err = -EINVAL; goto err; } selector = nla_get_u8(attr); if (!dcbnl_app_selector_validate(type, selector)) { err = -EINVAL; goto err; } /* Duplicate selector ? */ for (i = 0; i < nselectors; i++) { if (selectors[i] == selector) { err = -EINVAL; goto err; } } selectors[nselectors++] = selector; } err = ops->dcbnl_setapptrust(netdev, selectors, nselectors); if (err) goto err; } err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0); return err; } static int dcbnl_ieee_get(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; if (!ops) return -EOPNOTSUPP; return dcbnl_ieee_fill(skb, netdev); } static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; int err; if (!ops) return -EOPNOTSUPP; if (!tb[DCB_ATTR_IEEE]) return -EINVAL; err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], dcbnl_ieee_policy, NULL); if (err) return err; if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], netdev, ops->ieee_delapp ?: dcb_ieee_delapp); if (err) goto err; } if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], netdev, ops->dcbnl_delrewr ?: dcb_delrewr); if (err) goto err; } err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_DEL, seq, 0); return err; } /* DCBX configuration */ static int dcbnl_getdcbx(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { if (!netdev->dcbnl_ops->getdcbx) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_DCBX, netdev->dcbnl_ops->getdcbx(netdev)); } static int dcbnl_setdcbx(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!netdev->dcbnl_ops->setdcbx) return -EOPNOTSUPP; if (!tb[DCB_ATTR_DCBX]) return -EINVAL; value = nla_get_u8(tb[DCB_ATTR_DCBX]); return nla_put_u8(skb, DCB_ATTR_DCBX, netdev->dcbnl_ops->setdcbx(netdev, value)); } static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1], *nest; u8 value; int ret, i; int getall = 0; if (!netdev->dcbnl_ops->getfeatcfg) return -EOPNOTSUPP; if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_FEATCFG); if (!nest) return -EMSGSIZE; if (data[DCB_FEATCFG_ATTR_ALL]) getall = 1; for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { if (!getall && !data[i]) continue; ret = netdev->dcbnl_ops->getfeatcfg(netdev, i, &value); if (!ret) ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); goto nla_put_failure; } } nla_nest_end(skb, nest); nla_put_failure: return ret; } static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1]; int ret, i; u8 value; if (!netdev->dcbnl_ops->setfeatcfg) return -ENOTSUPP; if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) goto err; for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); ret = netdev->dcbnl_ops->setfeatcfg(netdev, i, value); if (ret) goto err; } err: ret = nla_put_u8(skb, DCB_ATTR_FEATCFG, ret); return ret; } /* Handle CEE DCBX GET commands. */ static int dcbnl_cee_get(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; if (!ops) return -EOPNOTSUPP; return dcbnl_cee_fill(skb, netdev); } struct reply_func { /* reply netlink message type */ int type; /* function to fill message contents */ int (*cb)(struct net_device *, struct nlmsghdr *, u32, struct nlattr **, struct sk_buff *); }; static const struct reply_func reply_funcs[DCB_CMD_MAX+1] = { [DCB_CMD_GSTATE] = { RTM_GETDCB, dcbnl_getstate }, [DCB_CMD_SSTATE] = { RTM_SETDCB, dcbnl_setstate }, [DCB_CMD_PFC_GCFG] = { RTM_GETDCB, dcbnl_getpfccfg }, [DCB_CMD_PFC_SCFG] = { RTM_SETDCB, dcbnl_setpfccfg }, [DCB_CMD_GPERM_HWADDR] = { RTM_GETDCB, dcbnl_getperm_hwaddr }, [DCB_CMD_GCAP] = { RTM_GETDCB, dcbnl_getcap }, [DCB_CMD_GNUMTCS] = { RTM_GETDCB, dcbnl_getnumtcs }, [DCB_CMD_SNUMTCS] = { RTM_SETDCB, dcbnl_setnumtcs }, [DCB_CMD_PFC_GSTATE] = { RTM_GETDCB, dcbnl_getpfcstate }, [DCB_CMD_PFC_SSTATE] = { RTM_SETDCB, dcbnl_setpfcstate }, [DCB_CMD_GAPP] = { RTM_GETDCB, dcbnl_getapp }, [DCB_CMD_SAPP] = { RTM_SETDCB, dcbnl_setapp }, [DCB_CMD_PGTX_GCFG] = { RTM_GETDCB, dcbnl_pgtx_getcfg }, [DCB_CMD_PGTX_SCFG] = { RTM_SETDCB, dcbnl_pgtx_setcfg }, [DCB_CMD_PGRX_GCFG] = { RTM_GETDCB, dcbnl_pgrx_getcfg }, [DCB_CMD_PGRX_SCFG] = { RTM_SETDCB, dcbnl_pgrx_setcfg }, [DCB_CMD_SET_ALL] = { RTM_SETDCB, dcbnl_setall }, [DCB_CMD_BCN_GCFG] = { RTM_GETDCB, dcbnl_bcn_getcfg }, [DCB_CMD_BCN_SCFG] = { RTM_SETDCB, dcbnl_bcn_setcfg }, [DCB_CMD_IEEE_GET] = { RTM_GETDCB, dcbnl_ieee_get }, [DCB_CMD_IEEE_SET] = { RTM_SETDCB, dcbnl_ieee_set }, [DCB_CMD_IEEE_DEL] = { RTM_SETDCB, dcbnl_ieee_del }, [DCB_CMD_GDCBX] = { RTM_GETDCB, dcbnl_getdcbx }, [DCB_CMD_SDCBX] = { RTM_SETDCB, dcbnl_setdcbx }, [DCB_CMD_GFEATCFG] = { RTM_GETDCB, dcbnl_getfeatcfg }, [DCB_CMD_SFEATCFG] = { RTM_SETDCB, dcbnl_setfeatcfg }, [DCB_CMD_CEE_GET] = { RTM_GETDCB, dcbnl_cee_get }, }; static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *netdev; struct dcbmsg *dcb = nlmsg_data(nlh); struct nlattr *tb[DCB_ATTR_MAX + 1]; u32 portid = NETLINK_CB(skb).portid; int ret = -EINVAL; struct sk_buff *reply_skb; struct nlmsghdr *reply_nlh = NULL; const struct reply_func *fn; if ((nlh->nlmsg_type == RTM_SETDCB) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse_deprecated(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX, dcbnl_rtnl_policy, extack); if (ret < 0) return ret; if (dcb->cmd > DCB_CMD_MAX) return -EINVAL; /* check if a reply function has been defined for the command */ fn = &reply_funcs[dcb->cmd]; if (!fn->cb) return -EOPNOTSUPP; if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; netdev = __dev_get_by_name(net, nla_data(tb[DCB_ATTR_IFNAME])); if (!netdev) return -ENODEV; if (!netdev->dcbnl_ops) return -EOPNOTSUPP; reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq, nlh->nlmsg_flags, &reply_nlh); if (!reply_skb) return -ENOMEM; ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb); if (ret < 0) { nlmsg_free(reply_skb); goto out; } nlmsg_end(reply_skb, reply_nlh); ret = rtnl_unicast(reply_skb, net, portid); out: return ret; } static struct dcb_app_type *dcb_rewr_lookup(const struct dcb_app *app, int ifindex, int proto) { struct dcb_app_type *itr; list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->app.selector == app->selector && itr->app.priority == app->priority && itr->ifindex == ifindex && ((proto == -1) || itr->app.protocol == proto)) return itr; } return NULL; } static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, int ifindex, int prio) { struct dcb_app_type *itr; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->app.selector == app->selector && itr->app.protocol == app->protocol && itr->ifindex == ifindex && ((prio == -1) || itr->app.priority == prio)) return itr; } return NULL; } static int dcb_app_add(struct list_head *list, const struct dcb_app *app, int ifindex) { struct dcb_app_type *entry; entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; memcpy(&entry->app, app, sizeof(*app)); entry->ifindex = ifindex; list_add(&entry->list, list); return 0; } /** * dcb_getapp - retrieve the DCBX application user priority * @dev: network interface * @app: application to get user priority of * * On success returns a non-zero 802.1p user priority bitmap * otherwise returns 0 as the invalid user priority bitmap to * indicate an error. */ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u8 prio = 0; spin_lock_bh(&dcb_lock); itr = dcb_app_lookup(app, dev->ifindex, -1); if (itr) prio = itr->app.priority; spin_unlock_bh(&dcb_lock); return prio; } EXPORT_SYMBOL(dcb_getapp); /** * dcb_setapp - add CEE dcb application data to app list * @dev: network interface * @new: application data to add * * Priority 0 is an invalid priority in CEE spec. This routine * removes applications from the app list if the priority is * set to zero. Priority is expected to be 8-bit 802.1p user priority bitmap */ int dcb_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type *itr; struct dcb_app_type event; int err = 0; event.ifindex = dev->ifindex; memcpy(&event.app, new, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and replace */ itr = dcb_app_lookup(new, dev->ifindex, -1); if (itr) { if (new->priority) itr->app.priority = new->priority; else { list_del(&itr->list); kfree(itr); } goto out; } /* App type does not exist add new application type */ if (new->priority) err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_setapp); /** * dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority * @dev: network interface * @app: where to store the retrieve application data * * Helper routine which on success returns a non-zero 802.1Qaz user * priority bitmap otherwise returns 0 to indicate the dcb_app was * not found in APP list. */ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u8 prio = 0; spin_lock_bh(&dcb_lock); itr = dcb_app_lookup(app, dev->ifindex, -1); if (itr) prio |= 1 << itr->app.priority; spin_unlock_bh(&dcb_lock); return prio; } EXPORT_SYMBOL(dcb_ieee_getapp_mask); /* Get protocol value from rewrite entry. */ u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u16 proto = 0; spin_lock_bh(&dcb_lock); itr = dcb_rewr_lookup(app, dev->ifindex, -1); if (itr) proto = itr->app.protocol; spin_unlock_bh(&dcb_lock); return proto; } EXPORT_SYMBOL(dcb_getrewr); /* Add rewrite entry to the rewrite list. */ int dcb_setrewr(struct net_device *dev, struct dcb_app *new) { int err; spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found. */ if (dcb_rewr_lookup(new, dev->ifindex, new->protocol)) { err = -EEXIST; goto out; } err = dcb_app_add(&dcb_rewr_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); return err; } EXPORT_SYMBOL(dcb_setrewr); /* Delete rewrite entry from the rewrite list. */ int dcb_delrewr(struct net_device *dev, struct dcb_app *del) { struct dcb_app_type *itr; int err = -ENOENT; spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ itr = dcb_rewr_lookup(del, dev->ifindex, del->protocol); if (itr) { list_del(&itr->list); kfree(itr); err = 0; } spin_unlock_bh(&dcb_lock); return err; } EXPORT_SYMBOL(dcb_delrewr); /** * dcb_ieee_setapp - add IEEE dcb application data to app list * @dev: network interface * @new: application data to add * * This adds Application data to the list. Multiple application * entries may exists for the same selector and protocol as long * as the priorities are different. Priority is expected to be a * 3-bit unsigned integer */ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type event; int err = 0; event.ifindex = dev->ifindex; memcpy(&event.app, new, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found */ if (dcb_app_lookup(new, dev->ifindex, new->priority)) { err = -EEXIST; goto out; } err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_ieee_setapp); /** * dcb_ieee_delapp - delete IEEE dcb application data from list * @dev: network interface * @del: application data to delete * * This removes a matching APP data from the APP list */ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) { struct dcb_app_type *itr; struct dcb_app_type event; int err = -ENOENT; event.ifindex = dev->ifindex; memcpy(&event.app, del, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) { list_del(&itr->list); kfree(itr); err = 0; } spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_ieee_delapp); /* dcb_getrewr_prio_pcp_mask_map - For a given device, find mapping from * priorities to the PCP and DEI values assigned to that priority. */ void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev, struct dcb_rewr_prio_pcp_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == ifindex && itr->app.selector == DCB_APP_SEL_PCP && itr->app.protocol < 16 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1 << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_getrewr_prio_pcp_mask_map); /* dcb_getrewr_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. */ void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1ULL << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_getrewr_prio_dscp_mask_map); /* * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. Initialize p_map * such that each map element holds a bit mask of DSCP values configured for * that priority by APP entries. */ void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1ULL << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map); /* * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from * DSCP values to the priorities assigned to that DSCP value. Initialize p_map * such that each map element holds a bit mask of priorities configured for a * given DSCP value by APP entries. */ void dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, struct dcb_ieee_app_dscp_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) p_map->map[itr->app.protocol] |= 1 << itr->app.priority; } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map); /* * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet * type, with valid PID values >= 1536. A special meaning is then assigned to * protocol value of 0: "default priority. For use when priority is not * otherwise specified". * * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default * priorities set by these entries. */ u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 mask = 0; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE && itr->app.protocol == 0 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) mask |= 1 << itr->app.priority; } spin_unlock_bh(&dcb_lock); return mask; } EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); static void dcbnl_flush_dev(struct net_device *dev) { struct dcb_app_type *itr, *tmp; spin_lock_bh(&dcb_lock); list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { if (itr->ifindex == dev->ifindex) { list_del(&itr->list); kfree(itr); } } spin_unlock_bh(&dcb_lock); } static int dcbnl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_UNREGISTER: if (!dev->dcbnl_ops) return NOTIFY_DONE; dcbnl_flush_dev(dev); return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block dcbnl_nb __read_mostly = { .notifier_call = dcbnl_netdevice_event, }; static const struct rtnl_msg_handler dcbnl_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_GETDCB, .doit = dcb_doit}, {.msgtype = RTM_SETDCB, .doit = dcb_doit}, }; static int __init dcbnl_init(void) { int err; err = register_netdevice_notifier(&dcbnl_nb); if (err) return err; rtnl_register_many(dcbnl_rtnl_msg_handlers); return 0; } device_initcall(dcbnl_init); |
| 89 530 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2021, Google LLC. * Pasha Tatashin <pasha.tatashin@soleen.com> */ #ifndef __LINUX_PAGE_TABLE_CHECK_H #define __LINUX_PAGE_TABLE_CHECK_H #ifdef CONFIG_PAGE_TABLE_CHECK #include <linux/jump_label.h> extern struct static_key_true page_table_check_disabled; extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd); static inline void page_table_check_alloc(struct page *page, unsigned int order) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_zero(page, order); } static inline void page_table_check_free(struct page *page, unsigned int order) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_zero(page, order); } static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pte_clear(mm, pte); } static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pmd_clear(mm, pmd); } static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pud_clear(mm, pud); } static inline void page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_ptes_set(mm, ptep, pte, nr); } static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pmd_set(mm, pmdp, pmd); } static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pud_set(mm, pudp, pud); } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pte_clear_range(mm, addr, pmd); } #else static inline void page_table_check_alloc(struct page *page, unsigned int order) { } static inline void page_table_check_free(struct page *page, unsigned int order) { } static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { } static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } static inline void page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { } static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { } static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd) { } #endif /* CONFIG_PAGE_TABLE_CHECK */ #endif /* __LINUX_PAGE_TABLE_CHECK_H */ |
| 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018, Intel Corporation. */ /* A common module to handle registrations and notifications for paravirtual * drivers to enable accelerated datapath and support VF live migration. * * The notifier and event handling code is based on netvsc driver. */ #include <linux/module.h> #include <linux/etherdevice.h> #include <uapi/linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> #include <net/failover.h> static LIST_HEAD(failover_list); static DEFINE_SPINLOCK(failover_lock); static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) { struct net_device *failover_dev; struct failover *failover; spin_lock(&failover_lock); list_for_each_entry(failover, &failover_list, list) { failover_dev = rtnl_dereference(failover->failover_dev); if (ether_addr_equal(failover_dev->perm_addr, mac)) { *ops = rtnl_dereference(failover->ops); spin_unlock(&failover_lock); return failover_dev; } } spin_unlock(&failover_lock); return NULL; } /** * failover_slave_register - Register a slave netdev * * @slave_dev: slave netdev that is being registered * * Registers a slave device to a failover instance. Only ethernet devices * are supported. */ static int failover_slave_register(struct net_device *slave_dev) { struct netdev_lag_upper_info lag_upper_info; struct net_device *failover_dev; struct failover_ops *fops; int err; if (slave_dev->type != ARPHRD_ETHER) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_register && fops->slave_pre_register(slave_dev, failover_dev)) goto done; err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, failover_dev); if (err) { netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", err); goto done; } lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, &lag_upper_info, NULL); if (err) { netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", failover_dev->name, err); goto err_upper_link; } slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: return NOTIFY_DONE; } /** * failover_slave_unregister - Unregister a slave netdev * * @slave_dev: slave netdev that is being unregistered * * Unregisters a slave device from a failover instance. */ int failover_slave_unregister(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_unregister && fops->slave_pre_unregister(slave_dev, failover_dev)) goto done; netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } EXPORT_SYMBOL_GPL(failover_slave_unregister); static int failover_slave_link_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_link_change && !fops->slave_link_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_slave_name_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_name_change && !fops->slave_name_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); /* Skip parent events */ if (netif_is_failover(event_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: return failover_slave_register(event_dev); case NETDEV_UNREGISTER: return failover_slave_unregister(event_dev); case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: return failover_slave_link_change(event_dev); case NETDEV_CHANGENAME: return failover_slave_name_change(event_dev); default: return NOTIFY_DONE; } } static struct notifier_block failover_notifier = { .notifier_call = failover_event, }; static void failover_existing_slave_register(struct net_device *failover_dev) { struct net *net = dev_net(failover_dev); struct net_device *dev; rtnl_lock(); for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) failover_slave_register(dev); } rtnl_unlock(); } /** * failover_register - Register a failover instance * * @dev: failover netdev * @ops: failover ops * * Allocate and register a failover instance for a failover netdev. ops * provides handlers for slave device register/unregister/link change/ * name change events. * * Return: pointer to failover instance */ struct failover *failover_register(struct net_device *dev, struct failover_ops *ops) { struct failover *failover; if (dev->type != ARPHRD_ETHER) return ERR_PTR(-EINVAL); failover = kzalloc(sizeof(*failover), GFP_KERNEL); if (!failover) return ERR_PTR(-ENOMEM); rcu_assign_pointer(failover->ops, ops); netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL); dev->priv_flags |= IFF_FAILOVER; rcu_assign_pointer(failover->failover_dev, dev); spin_lock(&failover_lock); list_add_tail(&failover->list, &failover_list); spin_unlock(&failover_lock); netdev_info(dev, "failover master:%s registered\n", dev->name); failover_existing_slave_register(dev); return failover; } EXPORT_SYMBOL_GPL(failover_register); /** * failover_unregister - Unregister a failover instance * * @failover: pointer to failover instance * * Unregisters and frees a failover instance. */ void failover_unregister(struct failover *failover) { struct net_device *failover_dev; failover_dev = rcu_dereference(failover->failover_dev); netdev_info(failover_dev, "failover master:%s unregistered\n", failover_dev->name); failover_dev->priv_flags &= ~IFF_FAILOVER; netdev_put(failover_dev, &failover->dev_tracker); spin_lock(&failover_lock); list_del(&failover->list); spin_unlock(&failover_lock); kfree(failover); } EXPORT_SYMBOL_GPL(failover_unregister); static __init int failover_init(void) { register_netdevice_notifier(&failover_notifier); return 0; } module_init(failover_init); static __exit void failover_exit(void) { unregister_netdevice_notifier(&failover_notifier); } module_exit(failover_exit); MODULE_DESCRIPTION("Generic failover infrastructure/interface"); MODULE_LICENSE("GPL v2"); |
| 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 | /* * llc_core.c - Minimum needed routines for sap handling and module init/exit * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/module.h> #include <linux/interrupt.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/init.h> #include <net/net_namespace.h> #include <net/llc.h> LIST_HEAD(llc_sap_list); static DEFINE_SPINLOCK(llc_sap_list_lock); /** * llc_sap_alloc - allocates and initializes sap. * * Allocates and initializes sap. */ static struct llc_sap *llc_sap_alloc(void) { struct llc_sap *sap = kzalloc(sizeof(*sap), GFP_ATOMIC); int i; if (sap) { /* sap->laddr.mac - leave as a null, it's filled by bind */ sap->state = LLC_SAP_STATE_ACTIVE; spin_lock_init(&sap->sk_lock); for (i = 0; i < LLC_SK_LADDR_HASH_ENTRIES; i++) INIT_HLIST_NULLS_HEAD(&sap->sk_laddr_hash[i], i); refcount_set(&sap->refcnt, 1); } return sap; } static struct llc_sap *__llc_sap_find(unsigned char sap_value) { struct llc_sap *sap; list_for_each_entry(sap, &llc_sap_list, node) if (sap->laddr.lsap == sap_value) goto out; sap = NULL; out: return sap; } /** * llc_sap_find - searches a SAP in station * @sap_value: sap to be found * * Searches for a sap in the sap list of the LLC's station upon the sap ID. * If the sap is found it will be refcounted and the user will have to do * a llc_sap_put after use. * Returns the sap or %NULL if not found. */ struct llc_sap *llc_sap_find(unsigned char sap_value) { struct llc_sap *sap; rcu_read_lock_bh(); sap = __llc_sap_find(sap_value); if (!sap || !llc_sap_hold_safe(sap)) sap = NULL; rcu_read_unlock_bh(); return sap; } /** * llc_sap_open - open interface to the upper layers. * @lsap: SAP number. * @func: rcv func for datalink protos * * Interface function to upper layer. Each one who wants to get a SAP * (for example NetBEUI) should call this function. Returns the opened * SAP for success, NULL for failure. */ struct llc_sap *llc_sap_open(unsigned char lsap, int (*func)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)) { struct llc_sap *sap = NULL; spin_lock_bh(&llc_sap_list_lock); if (__llc_sap_find(lsap)) /* SAP already exists */ goto out; sap = llc_sap_alloc(); if (!sap) goto out; sap->laddr.lsap = lsap; sap->rcv_func = func; list_add_tail_rcu(&sap->node, &llc_sap_list); out: spin_unlock_bh(&llc_sap_list_lock); return sap; } /** * llc_sap_close - close interface for upper layers. * @sap: SAP to be closed. * * Close interface function to upper layer. Each one who wants to * close an open SAP (for example NetBEUI) should call this function. * Removes this sap from the list of saps in the station and then * frees the memory for this sap. */ void llc_sap_close(struct llc_sap *sap) { WARN_ON(sap->sk_count); spin_lock_bh(&llc_sap_list_lock); list_del_rcu(&sap->node); spin_unlock_bh(&llc_sap_list_lock); kfree_rcu(sap, rcu); } static struct packet_type llc_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_802_2), .func = llc_rcv, }; static int __init llc_init(void) { dev_add_pack(&llc_packet_type); return 0; } static void __exit llc_exit(void) { dev_remove_pack(&llc_packet_type); } module_init(llc_init); module_exit(llc_exit); EXPORT_SYMBOL(llc_sap_list); EXPORT_SYMBOL(llc_sap_find); EXPORT_SYMBOL(llc_sap_open); EXPORT_SYMBOL(llc_sap_close); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003"); MODULE_DESCRIPTION("LLC IEEE 802.2 core support"); |
| 777 720 319 776 11 7 507 501 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SEQLOCK_H #define __LINUX_SEQLOCK_H /* * seqcount_t / seqlock_t - a reader-writer consistency mechanism with * lockless readers (read-only retry loops), and no writer starvation. * * See Documentation/locking/seqlock.rst * * Copyrights: * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH */ #include <linux/compiler.h> #include <linux/kcsan-checks.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/preempt.h> #include <linux/seqlock_types.h> #include <linux/spinlock.h> #include <asm/processor.h> /* * The seqlock seqcount_t interface does not prescribe a precise sequence of * read begin/retry/end. For readers, typically there is a call to * read_seqcount_begin() and read_seqcount_retry(), however, there are more * esoteric cases which do not follow this pattern. * * As a consequence, we take the following best-effort approach for raw usage * via seqcount_t under KCSAN: upon beginning a seq-reader critical section, * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as * atomics; if there is a matching read_seqcount_retry() call, no following * memory operations are considered atomic. Usage of the seqlock_t interface * is not affected. */ #define KCSAN_SEQLOCK_REGION_MAX 1000 static inline void __seqcount_init(seqcount_t *s, const char *name, struct lock_class_key *key) { /* * Make sure we are not reinitializing a held lock: */ lockdep_init_map(&s->dep_map, name, key, 0); s->sequence = 0; } #ifdef CONFIG_DEBUG_LOCK_ALLOC # define SEQCOUNT_DEP_MAP_INIT(lockname) \ .dep_map = { .name = #lockname } /** * seqcount_init() - runtime initializer for seqcount_t * @s: Pointer to the seqcount_t instance */ # define seqcount_init(s) \ do { \ static struct lock_class_key __key; \ __seqcount_init((s), #s, &__key); \ } while (0) static inline void seqcount_lockdep_reader_access(const seqcount_t *s) { seqcount_t *l = (seqcount_t *)s; unsigned long flags; local_irq_save(flags); seqcount_acquire_read(&l->dep_map, 0, 0, _RET_IP_); seqcount_release(&l->dep_map, _RET_IP_); local_irq_restore(flags); } #else # define SEQCOUNT_DEP_MAP_INIT(lockname) # define seqcount_init(s) __seqcount_init(s, NULL, NULL) # define seqcount_lockdep_reader_access(x) #endif /** * SEQCNT_ZERO() - static initializer for seqcount_t * @name: Name of the seqcount_t instance */ #define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) } /* * Sequence counters with associated locks (seqcount_LOCKNAME_t) * * A sequence counter which associates the lock used for writer * serialization at initialization time. This enables lockdep to validate * that the write side critical section is properly serialized. * * For associated locks which do not implicitly disable preemption, * preemption protection is enforced in the write side function. * * Lockdep is never used in any for the raw write variants. * * See Documentation/locking/seqlock.rst */ /* * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated * @seqcount: The real sequence counter * @lock: Pointer to the associated lock * * A plain sequence counter with external writer synchronization by * LOCKNAME @lock. The lock is associated to the sequence counter in the * static initializer or init function. This enables lockdep to validate * that the write side critical section is properly serialized. * * LOCKNAME: raw_spinlock, spinlock, rwlock or mutex */ /* * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t * @s: Pointer to the seqcount_LOCKNAME_t instance * @lock: Pointer to the associated lock */ #define seqcount_LOCKNAME_init(s, _lock, lockname) \ do { \ seqcount_##lockname##_t *____s = (s); \ seqcount_init(&____s->seqcount); \ __SEQ_LOCK(____s->lock = (_lock)); \ } while (0) #define seqcount_raw_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, raw_spinlock) #define seqcount_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, spinlock) #define seqcount_rwlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, rwlock) #define seqcount_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, mutex) /* * SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers * seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t * * @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t * @locktype: LOCKNAME canonical C data type * @preemptible: preemptibility of above locktype * @lockbase: prefix for associated lock/unlock */ #define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase) \ static __always_inline seqcount_t * \ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ { \ return &s->seqcount; \ } \ \ static __always_inline const seqcount_t * \ __seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s) \ { \ return &s->seqcount; \ } \ \ static __always_inline unsigned \ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ { \ unsigned seq = smp_load_acquire(&s->seqcount.sequence); \ \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ return seq; \ \ if (preemptible && unlikely(seq & 1)) { \ __SEQ_LOCK(lockbase##_lock(s->lock)); \ __SEQ_LOCK(lockbase##_unlock(s->lock)); \ \ /* \ * Re-read the sequence counter since the (possibly \ * preempted) writer made progress. \ */ \ seq = smp_load_acquire(&s->seqcount.sequence); \ } \ \ return seq; \ } \ \ static __always_inline bool \ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ { \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ return preemptible; \ \ /* PREEMPT_RT relies on the above LOCK+UNLOCK */ \ return false; \ } \ \ static __always_inline void \ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \ { \ __SEQ_LOCK(lockdep_assert_held(s->lock)); \ } /* * __seqprop() for seqcount_t */ static inline seqcount_t *__seqprop_ptr(seqcount_t *s) { return s; } static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s) { return s; } static inline unsigned __seqprop_sequence(const seqcount_t *s) { return smp_load_acquire(&s->sequence); } static inline bool __seqprop_preemptible(const seqcount_t *s) { return false; } static inline void __seqprop_assert(const seqcount_t *s) { lockdep_assert_preemption_disabled(); } #define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT) SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, raw_spin) SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, spin) SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, read) SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) #undef SEQCOUNT_LOCKNAME /* * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t * @name: Name of the seqcount_LOCKNAME_t instance * @lock: Pointer to the associated LOCKNAME */ #define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) { \ .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ __SEQ_LOCK(.lock = (assoc_lock)) \ } #define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define __seqprop_case(s, lockname, prop) \ seqcount_##lockname##_t: __seqprop_##lockname##_##prop #define __seqprop(s, prop) _Generic(*(s), \ seqcount_t: __seqprop_##prop, \ __seqprop_case((s), raw_spinlock, prop), \ __seqprop_case((s), spinlock, prop), \ __seqprop_case((s), rwlock, prop), \ __seqprop_case((s), mutex, prop)) #define seqprop_ptr(s) __seqprop(s, ptr)(s) #define seqprop_const_ptr(s) __se |