Total coverage: 159066 (9%)of 1909711
7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 
1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 
1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 // SPDX-License-Identifier: GPL-2.0-or-later /* * User-space Probes (UProbes) for x86 * * Copyright (C) IBM Corporation, 2008-2011 * Authors: * Srikar Dronamraju * Jim Keniston */ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/ptrace.h> #include <linux/uprobes.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/kdebug.h> #include <asm/processor.h> #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/mmu_context.h> #include <asm/nops.h> /* Post-execution fixups. */ /* Adjust IP back to vicinity of actual insn */ #define UPROBE_FIX_IP 0x01 /* Adjust the return address of a call insn */ #define UPROBE_FIX_CALL 0x02 /* Instruction will modify TF, don't change it */ #define UPROBE_FIX_SETF 0x04 #define UPROBE_FIX_RIP_SI 0x08 #define UPROBE_FIX_RIP_DI 0x10 #define UPROBE_FIX_RIP_BX 0x20 #define UPROBE_FIX_RIP_MASK \ (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX) #define UPROBE_TRAP_NR UINT_MAX /* Adaptations for mhiramat x86 decoder v14. */ #define OPCODE1(insn) ((insn)->opcode.bytes[0]) #define OPCODE2(insn) ((insn)->opcode.bytes[1]) #define OPCODE3(insn) ((insn)->opcode.bytes[2]) #define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ << (row % 32)) /* * Good-instruction tables for 32-bit apps. This is non-const and volatile * to keep gcc from statically optimizing it out, as variable_test_bit makes * some versions of gcc to think only *(unsigned long*) is used. 
 *
 * Opcodes we'll probably never support:
 * 6c-6f - ins,outs. SEGVs if used in userspace
 * e4-e7 - in,out imm. SEGVs if used in userspace
 * ec-ef - in,out acc. SEGVs if used in userspace
 * cc - int3. SIGTRAP if used in userspace
 * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs
 *	(why do we support bound (62) then? It's similar, and similarly unused...)
 * f1 - int1. SIGTRAP if used in userspace
 * f4 - hlt. SEGVs if used in userspace
 * fa - cli. SEGVs if used in userspace
 * fb - sti. SEGVs if used in userspace
 *
 * Opcodes which need some work to be supported:
 * 07,17,1f - pop es/ss/ds
 *	Normally not used in userspace, but would execute if used.
 *	Can cause a GP or stack exception if they try to load a wrong
 *	segment descriptor.
 *	We hesitate to run them under single step since the kernel's handling
 *	of userspace single-stepping (TF flag) is fragile.
 *	We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e)
 *	on the same grounds that they are never used.
 * cd - int N.
 *	Used by userspace for "int 80" syscall entry. (Other "int N"
 *	cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
 *	Not supported since the kernel's handling of userspace single-stepping
 *	(TF flag) is fragile.
 * cf - iret. Normally not used in userspace.
 Doesn't SEGV unless arguments are bad
 */
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
 * One bit per one-byte opcode; a set bit marks the opcode as acceptable.
 * uprobe_init_insn() tests bit OPCODE1(insn) of this table when its x86_64
 * argument is false.  Each W() row carries 16 opcodes and two consecutive
 * rows are OR'ed into one u32 element (0x00|0x10, 0x20|0x30, ...).
 */
static volatile u32 good_insns_32[256 / 32] = {
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
	/*      ----------------------------------------------         */
	W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */
	W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
	W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
	W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
	W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
	W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
	/*      ----------------------------------------------         */
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
};
#else
#define good_insns_32	NULL
#endif

/* Good-instruction tables for 64-bit apps.
 *
 * Genuinely invalid opcodes:
 * 06,07 - formerly push/pop es
 * 0e - formerly push cs
 * 16,17 - formerly push/pop ss
 * 1e,1f - formerly push/pop ds
 * 27,2f,37,3f - formerly daa/das/aaa/aas
 * 60,61 - formerly pusha/popa
 * 62 - formerly bound.
 EVEX prefix for AVX512 (not yet supported)
 * 82 - formerly redundant encoding of Group1
 * 9a - formerly call seg:ofs
 * ce - formerly into
 * d4,d5 - formerly aam/aad
 * d6 - formerly undocumented salc
 * ea - formerly jmp seg:ofs
 *
 * Opcodes we'll probably never support:
 * 6c-6f - ins,outs. SEGVs if used in userspace
 * e4-e7 - in,out imm. SEGVs if used in userspace
 * ec-ef - in,out acc. SEGVs if used in userspace
 * cc - int3. SIGTRAP if used in userspace
 * f1 - int1. SIGTRAP if used in userspace
 * f4 - hlt. SEGVs if used in userspace
 * fa - cli. SEGVs if used in userspace
 * fb - sti. SEGVs if used in userspace
 *
 * Opcodes which need some work to be supported:
 * cd - int N.
 *	Used by userspace for "int 80" syscall entry. (Other "int N"
 *	cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
 *	Not supported since kernel's handling of userspace single-stepping
 *	(TF flag) is fragile.
 * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
 */
#if defined(CONFIG_X86_64)
/*
 * One bit per one-byte opcode; a set bit marks the opcode as acceptable.
 * uprobe_init_insn() tests bit OPCODE1(insn) of this table when its x86_64
 * argument is true.  Each W() row carries 16 opcodes and two consecutive
 * rows are OR'ed into one u32 element (0x00|0x10, 0x20|0x30, ...).
 */
static volatile u32 good_insns_64[256 / 32] = {
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
	/*      ----------------------------------------------         */
	W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */
	W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
	W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */
	W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */
	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
	W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
	W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */
	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
	W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */
	W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
	/*      ----------------------------------------------         */
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
};
#else
#define good_insns_64	NULL
#endif

/* Using this for both 64-bit and 32-bit apps.
 * Opcodes we don't support:
 * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns
 * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group.
 *	Also encodes tons of other system insns if mod=11.
 *	Some are in fact non-system: xend, xtest, rdtscp, maybe more
 * 0f 05 - syscall
 * 0f 06 - clts (CPL0 insn)
 * 0f 07 - sysret
 * 0f 08 - invd (CPL0 insn)
 * 0f 09 - wbinvd (CPL0 insn)
 * 0f 0b - ud2
 * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?)
 * 0f 34 - sysenter
 * 0f 35 - sysexit
 * 0f 37 - getsec
 * 0f 78 - vmread (Intel VMX. CPL0 insn)
 * 0f 79 - vmwrite (Intel VMX. CPL0 insn)
 *	Note: with prefixes, these two opcodes are
 *	extrq/insertq/AVX512 convert vector ops.
 * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt],
 *	{rd,wr}{fs,gs}base,{s,l,m}fence.
 *	Why? They are all user-executable.
 */
/*
 * One bit per second opcode byte; a set bit marks the opcode as acceptable.
 * uprobe_init_insn() tests bit OPCODE2(insn) of this table when
 * insn->opcode.nbytes == 2.  Each W() row carries 16 opcodes and two
 * consecutive rows are OR'ed into one u32 element.
 */
static volatile u32 good_2byte_insns[256 / 32] = {
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
	/*      ----------------------------------------------         */
	W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */
	W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
	W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
	W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */
	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
	W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
	W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
	W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)   /* f0 */
	/*      ----------------------------------------------         */
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
};
#undef W

/*
 * opcodes we may need to refine support for:
 *
 * 0f - 2-byte instructions: For many of these instructions, the validity
 * depends on the prefix and/or the reg field. On such instructions, we
 * just consider the opcode combination valid if it corresponds to any
 * valid instruction.
 *
 * 8f - Group 1 - only reg = 0 is OK
 * c6-c7 - Group 11 - only reg = 0 is OK
 * d9-df - fpu insns with some illegal encodings
 * f2, f3 - repnz, repz prefixes. These are also the first byte for
 * certain floating-point instructions, such as addsd.
 *
 * fe - Group 4 - only reg = 0 or 1 is OK
 * ff - Group 5 - only reg = 0-6 is OK
 *
 * others -- Do we need to support these?
 *
 * 0f - (floating-point?) prefetch instructions
 * 07, 17, 1f - pop es, pop ss, pop ds
 * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
 *	but 64 and 65 (fs: and gs:) seem to be used, so we support them
 * 67 - addr16 prefix
 * ce - into
 * f0 - lock prefix
 */

/*
 * TODO:
 * - Where necessary, examine the modrm byte and allow only valid instructions
 * in the different Groups and fpu instructions.
 */

/*
 * Return true if any prefix byte of @insn is an ES/CS/DS/SS segment
 * override or a LOCK prefix; uprobe_init_insn() refuses such instructions.
 */
static bool is_prefix_bad(struct insn *insn)
{
	insn_byte_t p;

	for_each_insn_prefix(insn, p) {
		insn_attr_t attr;

		attr = inat_get_opcode_attribute(p);
		switch (attr) {
		case INAT_MAKE_PREFIX(INAT_PFX_ES):
		case INAT_MAKE_PREFIX(INAT_PFX_CS):
		case INAT_MAKE_PREFIX(INAT_PFX_DS):
		case INAT_MAKE_PREFIX(INAT_PFX_SS):
		case INAT_MAKE_PREFIX(INAT_PFX_LOCK):
			return true;
		}
	}
	return false;
}

/*
 * Decode auprobe->insn into @insn (64-bit mode if @x86_64, 32-bit mode
 * otherwise) and decide whether the instruction may be probed.
 *
 * Returns 0 if the instruction is accepted, -ENOEXEC if it cannot be
 * decoded, and -ENOTSUPP if it carries a rejected prefix, is an exception
 * masking instruction, or is not marked good in the opcode tables.
 *
 * NOTE(review): both one-byte tables set the bit for 0x0f.  If OPCODE1() is
 * 0x0f for two-byte (0f xx) opcodes, the first test already accepts them and
 * the good_2byte_insns test below is never decisive - confirm whether that
 * is intended.
 */
static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
{
	enum insn_mode m = x86_64 ? INSN_MODE_64 : INSN_MODE_32;
	u32 volatile *good_insns;
	int ret;

	ret = insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), m);
	if (ret < 0)
		return -ENOEXEC;

	if (is_prefix_bad(insn))
		return -ENOTSUPP;

	/* We should not singlestep on the exception masking instructions */
	if (insn_masking_exception(insn))
		return -ENOTSUPP;

	if (x86_64)
		good_insns = good_insns_64;
	else
		good_insns = good_insns_32;

	/* A set bit for the first opcode byte accepts the instruction. */
	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))
		return 0;

	if (insn->opcode.nbytes == 2) {
		if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
			return 0;
	}

	return -ENOTSUPP;
}

#ifdef CONFIG_X86_64

/*
 * Values the uretprobe trampoline leaves on the user stack; sys_uretprobe
 * copies them from regs->sp.  The trampoline pushes rax, rcx, r11 in that
 * order, so r11 sits at the lowest address.
 */
struct uretprobe_syscall_args {
	unsigned long r11;
	unsigned long cx;
	unsigned long ax;
};

asm (
	".pushsection .rodata\n"
	".global uretprobe_trampoline_entry\n"
	"uretprobe_trampoline_entry:\n"
	"push %rax\n"
	"push %rcx\n"
	"push %r11\n"
	"mov $" __stringify(__NR_uretprobe) ", %rax\n"
	"syscall\n"
	".global uretprobe_syscall_check\n"
	"uretprobe_syscall_check:\n"
	"pop %r11\n"
	"pop %rcx\n"
	/*
	 * The uretprobe syscall replaces stored %rax value with final
	 * return address, so we don't restore %rax in here and just
	 * call ret.
	 */
	"ret\n"
	"int3\n"
	".global uretprobe_trampoline_end\n"
	"uretprobe_trampoline_end:\n"
	".popsection\n"
);

extern u8 uretprobe_trampoline_entry[];
extern u8 uretprobe_trampoline_end[];
extern u8 uretprobe_syscall_check[];

/*
 * Return the code to use as the uretprobe trampoline and store its size in
 * *psize: the syscall trampoline above for 64-bit user mode, otherwise a
 * single breakpoint instruction.
 */
void *arch_uretprobe_trampoline(unsigned long *psize)
{
	static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
	struct pt_regs *regs = task_pt_regs(current);

	/*
	 * At the moment the uretprobe syscall trampoline is supported
	 * only for native 64-bit process, the compat process still uses
	 * standard breakpoint.
	 */
	if (user_64bit_mode(regs)) {
		*psize = uretprobe_trampoline_end - uretprobe_trampoline_entry;
		return uretprobe_trampoline_entry;
	}

	*psize = UPROBE_SWBP_INSN_SIZE;
	return &insn;
}

/*
 * Address of the uretprobe_syscall_check label inside a trampoline copy
 * that starts at @tramp.
 */
static unsigned long trampoline_check_ip(unsigned long tramp)
{
	return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry);
}

/*
 * uretprobe syscall.  It is only accepted when a trampoline exists and
 * regs->ip equals trampoline_check_ip() for it; otherwise, or when a copy
 * from/to the user stack fails, SIGILL is forced and -1 is returned.
 */
SYSCALL_DEFINE0(uretprobe)
{
	struct pt_regs *regs = task_pt_regs(current);
	struct uretprobe_syscall_args args;
	unsigned long err, ip, sp, tramp;

	/* If there's no trampoline, we are called from wrong place. */
	tramp = uprobe_get_trampoline_vaddr();
	if (unlikely(tramp == UPROBE_NO_TRAMPOLINE_VADDR))
		goto sigill;

	/* Make sure the ip matches the only allowed sys_uretprobe caller. */
	if (unlikely(regs->ip != trampoline_check_ip(tramp)))
		goto sigill;

	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
	if (err)
		goto sigill;

	/* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */
	regs->r11 = args.r11;
	regs->cx = args.cx;
	regs->ax = args.ax;
	regs->sp += sizeof(args);
	regs->orig_ax = -1;

	/* Remember ip/sp so changes made by the handler can be detected. */
	ip = regs->ip;
	sp = regs->sp;

	uprobe_handle_trampoline(regs);

	/*
	 * Some of the uprobe consumers have changed sp, we can do nothing,
	 * just return via iret.
	 * .. or shadow stack is enabled, in which case we need to skip
	 * return through the user space stack address.
	 */
	if (regs->sp != sp || shstk_is_enabled())
		return regs->ax;
	regs->sp -= sizeof(args);

	/* for the case uprobe_consumer has changed r11/cx */
	args.r11 = regs->r11;
	args.cx = regs->cx;

	/*
	 * ax register is passed through as return value, so we can use
	 * its space on stack for ip value and jump to it through the
	 * trampoline's ret instruction
	 */
	args.ax = regs->ip;
	regs->ip = ip;

	err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
	if (err)
		goto sigill;

	/* ensure sysret, see do_syscall_64() */
	regs->r11 = regs->flags;
	regs->cx = regs->ip;

	return regs->ax;

sigill:
	force_sig(SIGILL);
	return -1;
}

/*
 * If arch_uprobe->insn doesn't use rip-relative addressing, return
 * immediately.  Otherwise, rewrite the instruction so that it accesses
 * its memory operand indirectly through a scratch register.  Set
 * defparam->fixups accordingly. (The contents of the scratch register
 * will be saved before we single-step the modified instruction,
 * and restored afterward).
 *
 * We do this because a rip-relative instruction can access only a
 * relatively small area (+/- 2 GB from the instruction), and the XOL
 * area typically lies beyond that area.  At least for instructions
 * that store to memory, we can't execute the original instruction
 * and "fix things up" later, because the misdirected store could be
 * disastrous.
 *
 * Some useful facts about rip-relative instructions:
 *
 *  - There's always a modrm byte with bit layout "00 reg 101".
 *  - There's never a SIB byte.
 *  - The displacement is always 4 bytes.
 *  - REX.B=1 bit in REX prefix, which normally extends r/m field,
 *    has no effect on rip-relative mode. It doesn't make modrm byte
 *    with r/m=101 refer to register 1101 = R13.
 */
static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
{
	u8 *cursor;
	u8 reg;
	u8 reg2;

	if (!insn_rip_relative(insn))
		return;

	/*
	 * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.
	 * Clear REX.b bit (extension of MODRM.rm field):
	 * we want to encode low numbered reg, not r8+.
	 */
	if (insn->rex_prefix.nbytes) {
		cursor = auprobe->insn + insn_offset_rex_prefix(insn);
		/* REX byte has 0100wrxb layout, clearing REX.b bit */
		*cursor &= 0xfe;
	}
	/*
	 * Similar treatment for VEX3/EVEX prefix.
	 * TODO: add XOP treatment when insn decoder supports them
	 */
	if (insn->vex_prefix.nbytes >= 3) {
		/*
		 * vex2:     c5    rvvvvLpp   (has no b bit)
		 * vex3/xop: c4/8f rxbmmmmm wvvvvLpp
		 * evex:     62    rxbR00mm wvvvv1pp zllBVaaa
		 * Setting VEX3.b (setting because it has inverted meaning).
		 * Setting EVEX.x since (in non-SIB encoding) EVEX.x
		 * is the 4th bit of MODRM.rm, and needs the same treatment.
		 * For VEX3-encoded insns, VEX3.x value has no effect in
		 * non-SIB encoding, the change is superfluous but harmless.
		 */
		cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
		*cursor |= 0x60;
	}

	/*
	 * Convert from rip-relative addressing to register-relative addressing
	 * via a scratch register.
	 *
	 * This is tricky since there are insns with modrm byte
	 * which also use registers not encoded in modrm byte:
	 * [i]div/[i]mul: implicitly use dx:ax
	 * shift ops: implicitly use cx
	 * cmpxchg: implicitly uses ax
	 * cmpxchg8/16b: implicitly uses dx:ax and bx:cx
	 *   Encoding: 0f c7/1 modrm
	 *   The code below thinks that reg=1 (cx), chooses si as scratch.
	 * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m.
	 *   First appeared in Haswell (BMI2 insn). It is vex-encoded.
	 *   Example where none of bx,cx,dx can be used as scratch reg:
	 *   c4 e2 63 f6 0d disp32   mulx disp32(%rip),%ebx,%ecx
	 * [v]pcmpistri: implicitly uses cx, xmm0
	 * [v]pcmpistrm: implicitly uses xmm0
	 * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0
	 * [v]pcmpestrm: implicitly uses ax, dx, xmm0
	 *   Evil SSE4.2 string comparison ops from hell.
	 * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination.
	 *   Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm.
	 *   Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi).
	 *   AMD says it has no 3-operand form (vex.vvvv must be 1111)
	 *   and that it can have only register operands, not mem
	 *   (its modrm byte must have mode=11).
	 *   If these restrictions will ever be lifted,
	 *   we'll need code to prevent selection of di as scratch reg!
	 *
	 * Summary: I don't know any insns with modrm byte which
	 * use SI register implicitly. DI register is used only
	 * by one insn (maskmovq) and BX register is used
	 * only by one too (cmpxchg8b).
	 * BP is stack-segment based (may be a problem?).
	 * AX, DX, CX are off-limits (many implicit users).
	 * SP is unusable (it's stack pointer - think about "pop mem";
	 * also, rsp+disp32 needs sib encoding -> insn length change).
	 */

	reg = MODRM_REG(insn);	/* Fetch modrm.reg */
	reg2 = 0xff;		/* Fetch vex.vvvv */
	if (insn->vex_prefix.nbytes)
		reg2 = insn->vex_prefix.bytes[2];
	/*
	 * TODO: add XOP vvvv reading.
	 *
	 * vex.vvvv field is in bits 6-3, bits are inverted.
	 * But in 32-bit mode, high-order bit may be ignored.
	 * Therefore, let's consider only 3 low-order bits.
	 */
	reg2 = ((reg2 >> 3) & 0x7) ^ 0x7;
	/*
	 * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15.
	 *
	 * Choose scratch reg. Order is important: must not select bx
	 * if we can use si (cmpxchg8b case!)
	 */
	if (reg != 6 && reg2 != 6) {
		reg2 = 6;
		auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI;
	} else if (reg != 7 && reg2 != 7) {
		reg2 = 7;
		auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI;
		/* TODO (paranoia): force maskmovq to not use di */
	} else {
		reg2 = 3;
		auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX;
	}
	/*
	 * Point cursor at the modrm byte.  The next 4 bytes are the
	 * displacement.  Beyond the displacement, for some instructions,
	 * is the immediate operand.
	 */
	cursor = auprobe->insn + insn_offset_modrm(insn);
	/*
	 * Change modrm from "00 reg 101" to "10 reg reg2". Example:
	 * 89 05 disp32  mov %eax,disp32(%rip) becomes
	 * 89 86 disp32  mov %eax,disp32(%rsi)
	 */
	*cursor = 0x80 | (reg << 3) | reg2;
}

/*
 * Return a pointer to the pt_regs slot of the scratch register selected by
 * riprel_analyze(): si or di if the matching UPROBE_FIX_RIP_* bit is set in
 * defparam.fixups, bx otherwise.
 */
static inline unsigned long *
scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI)
		return &regs->si;
	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI)
		return &regs->di;
	return &regs->bx;
}

/*
 * If we're emulating a rip-relative instruction, save the contents
 * of the scratch register and store the target address in that register.
 */
static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
		struct uprobe_task *utask = current->utask;
		unsigned long *sr = scratch_reg(auprobe, regs);

		utask->autask.saved_scratch_register = *sr;
		*sr = utask->vaddr + auprobe->defparam.ilen;
	}
}

/*
 * Counterpart of riprel_pre_xol(): put the saved value back into the
 * scratch register if one of the UPROBE_FIX_RIP_* fixups is set.
 */
static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
		struct uprobe_task *utask = current->utask;
		unsigned long *sr = scratch_reg(auprobe, regs);

		*sr = utask->autask.saved_scratch_register;
	}
}

/* mremap hook of the trampoline mapping: always refused with -EPERM. */
static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
{
	return -EPERM;
}

/* Slot 0 is filled in by arch_uprobes_init(); slot 1 is never assigned here. */
static struct page *tramp_mapping_pages[2] __ro_after_init;

static struct vm_special_mapping tramp_mapping = {
	.name = "[uprobes-trampoline]",
	.mremap = tramp_mremap,
	.pages = tramp_mapping_pages,
};

/* One installed trampoline page; linked into mm->uprobes_state.head_tramps. */
struct uprobe_trampoline {
	struct hlist_node node;
	unsigned long vaddr;
};

/*
 * Return true if the signed distance between the end of a 5-byte
 * instruction at @vaddr and @vtramp fits into the INT_MIN..INT_MAX range.
 */
static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
{
	long delta = (long)(vaddr + 5 - vtramp);

	return delta >= INT_MIN && delta <= INT_MAX;
}

/*
 * Look for an unmapped, page-sized area within INT_MIN..INT_MAX bytes of
 * the end of the 5-byte instruction at @vaddr, searching both above and
 * below it.  Returns the candidate closest to that address, or -ENOMEM
 * (as an unsigned long error value) if neither search succeeds.
 */
static unsigned long find_nearest_trampoline(unsigned long vaddr)
{
	struct vm_unmapped_area_info info = {
		.length = PAGE_SIZE,
		.align_mask = ~PAGE_MASK,
	};
	unsigned long low_limit, high_limit;
	unsigned long low_tramp, high_tramp;
	unsigned long call_end = vaddr + 5;

	/* On overflow of call_end + INT_MIN fall back to PAGE_SIZE. */
	if (check_add_overflow(call_end, INT_MIN, &low_limit))
		low_limit = PAGE_SIZE;

	high_limit = call_end + INT_MAX;

	/* Search up from the caller address. */
	info.low_limit = call_end;
	info.high_limit = min(high_limit, TASK_SIZE);
	high_tramp = vm_unmapped_area(&info);

	/* Search down from the caller address. */
	info.low_limit = max(low_limit, PAGE_SIZE);
	info.high_limit = call_end;
	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
	low_tramp = vm_unmapped_area(&info);

	if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp))
		return -ENOMEM;
	if (IS_ERR_VALUE(high_tramp))
		return low_tramp;
	if (IS_ERR_VALUE(low_tramp))
		return high_tramp;

	/* Return address that's closest to the caller address. */
	if (call_end - low_tramp < high_tramp - call_end)
		return low_tramp;
	return high_tramp;
}

/*
 * Allocate a trampoline descriptor and install the trampoline special
 * mapping at the address chosen by find_nearest_trampoline(@vaddr).
 * Returns NULL if the task is not in 64-bit user mode, no address was
 * found, the allocation failed or the mapping could not be installed.
 */
static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
{
	struct pt_regs *regs = task_pt_regs(current);
	struct mm_struct *mm = current->mm;
	struct uprobe_trampoline *tramp;
	struct vm_area_struct *vma;

	if (!user_64bit_mode(regs))
		return NULL;

	vaddr = find_nearest_trampoline(vaddr);
	if (IS_ERR_VALUE(vaddr))
		return NULL;

	tramp = kzalloc_obj(*tramp);
	if (unlikely(!tramp))
		return NULL;

	tramp->vaddr = vaddr;
	vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
				VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
				&tramp_mapping);
	if (IS_ERR(vma)) {
		kfree(tramp);
		return NULL;
	}
	return tramp;
}

/*
 * Return a trampoline usable from @vaddr: an existing entry of the current
 * mm's head_tramps list that passes is_reachable_by_call() (*new = false),
 * or a freshly created one that is added to the list (*new = true).
 * Returns NULL, leaving *new untouched, if @vaddr is outside
 * PAGE_SIZE..TASK_SIZE or the trampoline cannot be created.
 */
static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
{
	struct uprobes_state *state = &current->mm->uprobes_state;
	struct uprobe_trampoline *tramp = NULL;

	if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
		return NULL;

	hlist_for_each_entry(tramp, &state->head_tramps, node) {
		if (is_reachable_by_call(tramp->vaddr, vaddr)) {
			*new = false;
			return tramp;
		}
	}

	tramp = create_uprobe_trampoline(vaddr);
	if (!tramp)
		return NULL;

	*new = true;
	hlist_add_head(&tramp->node, &state->head_tramps);
	return tramp;
}

/* Unlink @tramp from its list and free the descriptor. */
static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
{
	/*
	 * We do not unmap and release uprobe trampoline page itself,
	 * because there's no easy way to make sure none of the threads
	 * is still inside the trampoline.
	 */
	hlist_del(&tramp->node);
	kfree(tramp);
}

/* Start @mm with an empty trampoline list. */
void arch_uprobe_init_state(struct mm_struct *mm)
{
	INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps);
}

/* Destroy every trampoline descriptor on @mm's list. */
void arch_uprobe_clear_state(struct mm_struct *mm)
{
	struct uprobes_state *state = &mm->uprobes_state;
	struct uprobe_trampoline *tramp;
	struct hlist_node *n;

	hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
		destroy_uprobe_trampoline(tramp);
}

/*
 * Return true if @ip is covered by a vma of the current mm that is the
 * tramp_mapping special mapping.  Callers provide the locking.
 */
static bool __in_uprobe_trampoline(unsigned long ip)
{
	struct vm_area_struct *vma = vma_lookup(current->mm, ip);

	return vma && vma_is_special_mapping(vma, &tramp_mapping);
}

/*
 * Locked wrapper around __in_uprobe_trampoline().  The lookup is first
 * attempted under rcu_read_lock() between mmap_lock_speculate_try_begin()
 * and mmap_lock_speculate_retry(); if that attempt could not start or asks
 * for a retry, the lookup is repeated under mmap_read_lock().
 */
static bool in_uprobe_trampoline(unsigned long ip)
{
	struct mm_struct *mm = current->mm;
	bool found, retry = true;
	unsigned int seq;

	rcu_read_lock();
	if (mmap_lock_speculate_try_begin(mm, &seq)) {
		found = __in_uprobe_trampoline(ip);
		retry = mmap_lock_speculate_retry(mm, seq);
	}
	rcu_read_unlock();

	if (retry) {
		mmap_read_lock(mm);
		found = __in_uprobe_trampoline(ip);
		mmap_read_unlock(mm);
	}
	return found;
}

/*
 * See uprobe syscall trampoline; the call to the trampoline will push
 * the return address on the stack, the trampoline itself then pushes
 * cx, r11 and ax.
 */
struct uprobe_syscall_args {
	unsigned long ax;
	unsigned long r11;
	unsigned long cx;
	unsigned long retaddr;
};

/*
 * uprobe syscall.  Returns -ENXIO unless regs->ip lies inside a uprobe
 * trampoline mapping.  A failed copy from/to the user stack, -EFAULT from
 * shstk_pop()/shstk_push(), or a popped shadow stack value that differs
 * from the return address on the stack forces SIGILL and returns -1.
 */
SYSCALL_DEFINE0(uprobe)
{
	struct pt_regs *regs = task_pt_regs(current);
	struct uprobe_syscall_args args;
	unsigned long ip, sp, sret;
	int err;

	/* Allow execution only from uprobe trampolines. */
	if (!in_uprobe_trampoline(regs->ip))
		return -ENXIO;

	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
	if (err)
		goto sigill;

	ip = regs->ip;

	/*
	 * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
	 * - adjust ip to the probe address, call saved next instruction address
	 * - adjust sp to the probe's stack frame (check trampoline code)
	 */
	regs->ax = args.ax;
	regs->r11 = args.r11;
	regs->cx = args.cx;
	regs->ip = args.retaddr - 5;
	regs->sp += sizeof(args);
	regs->orig_ax = -1;

	sp = regs->sp;

	err = shstk_pop((u64 *)&sret);
	if (err == -EFAULT || (!err && sret != args.retaddr))
		goto sigill;

	handle_syscall_uprobe(regs, regs->ip);

	/*
	 * Some of the uprobe consumers have changed sp, we can do nothing,
	 * just return via iret.
	 */
	if (regs->sp != sp) {
		/* skip the trampoline call */
		if (args.retaddr - 5 == regs->ip)
			regs->ip += 5;
		return regs->ax;
	}

	regs->sp -= sizeof(args);

	/* for the case uprobe_consumer has changed ax/r11/cx */
	args.ax = regs->ax;
	args.r11 = regs->r11;
	args.cx = regs->cx;

	/* keep return address unless we are instructed otherwise */
	if (args.retaddr - 5 != regs->ip)
		args.retaddr = regs->ip;

	if (shstk_push(args.retaddr) == -EFAULT)
		goto sigill;

	regs->ip = ip;

	err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
	if (err)
		goto sigill;

	/* ensure sysret, see do_syscall_64() */
	regs->r11 = regs->flags;
	regs->cx = regs->ip;
	return 0;

sigill:
	force_sig(SIGILL);
	return -1;
}

/*
 * uprobe trampoline code, aligned to PAGE_SIZE at both ends: saves rcx, r11
 * and rax, issues the uprobe syscall, restores the registers and returns.
 */
asm (
	".pushsection .rodata\n"
	".balign " __stringify(PAGE_SIZE) "\n"
	"uprobe_trampoline_entry:\n"
	"push %rcx\n"
	"push %r11\n"
	"push %rax\n"
	"mov $" __stringify(__NR_uprobe) ", %rax\n"
	"syscall\n"
	"pop %rax\n"
	"pop %r11\n"
	"pop %rcx\n"
	"ret\n"
	"int3\n"
	".balign " __stringify(PAGE_SIZE) "\n"
	".popsection\n"
);

extern u8 uprobe_trampoline_entry[];

/* Record the page holding uprobe_trampoline_entry as page 0 of tramp_mapping. */
static int __init arch_uprobes_init(void)
{
	tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
	return 0;
}

late_initcall(arch_uprobes_init);

/* Which instruction verify_insn() expects to find at write_opcode_ctx.base. */
enum {
	EXPECT_SWBP,
	EXPECT_CALL,
};

/* Context handed to verify_insn() through uprobe_write(). */
struct write_opcode_ctx {
	unsigned long base;	/* address the current opcode bytes are read from */
	int expect;		/* EXPECT_SWBP or EXPECT_CALL */
};

/* Return non-zero if the first opcode byte at @insn is CALL_INSN_OPCODE. */
static int is_call_insn(uprobe_opcode_t *insn)
{
	return *insn == CALL_INSN_OPCODE;
}

/*
 * Verification callback used by int3_update uprobe_write calls to make sure
 * the underlying instruction is as expected - either int3 or call.
 *
 * Reads 5 bytes from @page at ctx->base and returns 1 if the first byte
 * matches ctx->expect, -1 otherwise.  @vaddr, @new_opcode and @nbytes are
 * not used by this callback.
 */
static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
		       int nbytes, void *data)
{
	struct write_opcode_ctx *ctx = data;
	uprobe_opcode_t old_opcode[5];

	uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);

	switch (ctx->expect) {
	case EXPECT_SWBP:
		if (is_swbp_insn(&old_opcode[0]))
			return 1;
		break;
	case EXPECT_CALL:
		if (is_call_insn(&old_opcode[0]))
			return 1;
		break;
	}

	return -1;
}

/*
 * Modify multi-byte instructions by using INT3 breakpoints on SMP.
 * We completely avoid using stop_machine() here, and achieve the
 * synchronization using INT3 breakpoints and SMP cross-calls.
 * (borrowed comment from smp_text_poke_batch_finish)
 *
 * The way it is done:
 *   - Add an INT3 trap to the address that will be patched
 *   - SMP sync all CPUs
 *   - Update all but the first byte of the patched range
 *   - SMP sync all CPUs
 *   - Replace the first byte (INT3) by the first byte of the replacing opcode
 *   - SMP sync all CPUs
 */
static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
		       unsigned long vaddr, char *insn, bool optimize)
{
	uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
	struct write_opcode_ctx ctx = {
		.base = vaddr,
	};
	int err;

	/*
	 * Write int3 trap.
	 *
	 * The swbp_optimize path comes with breakpoint already installed,
	 * so we can skip this step for optimize == true.
	 */
	if (!optimize) {
		ctx.expect = EXPECT_CALL;
		err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
				   true /* is_register */, false /* do_update_ref_ctr */,
				   &ctx);
		if (err)
			return err;
	}

	smp_text_poke_sync_each_cpu();

	/* Write all but the first byte of the patched range.
*/ ctx.expect = EXPECT_SWBP; err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn, true /* is_register */, false /* do_update_ref_ctr */, &ctx); if (err) return err; smp_text_poke_sync_each_cpu(); /* * Write first byte. * * The swbp_unoptimize needs to finish uprobe removal together * with ref_ctr update, using uprobe_write with proper flags. */ err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn, optimize /* is_register */, !optimize /* do_update_ref_ctr */, &ctx); if (err) return err; smp_text_poke_sync_each_cpu(); return 0; } static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, unsigned long tramp) { u8 call[5]; __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr, (const void *) tramp, CALL_INSN_SIZE); return int3_update(auprobe, vma, vaddr, call, true /* optimize */); } static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */); } static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len) { unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD; struct vm_area_struct *vma; struct page *page; page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); if (IS_ERR(page)) return PTR_ERR(page); uprobe_copy_from_page(page, vaddr, dst, len); put_page(page); return 0; } static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) { struct __packed __arch_relative_insn { u8 op; s32 raddr; } *call = (struct __arch_relative_insn *) insn; if (!is_call_insn(insn)) return false; return __in_uprobe_trampoline(vaddr + 5 + call->raddr); } static int is_optimized(struct mm_struct *mm, unsigned long vaddr) { uprobe_opcode_t insn[5]; int err; err = copy_from_vaddr(mm, vaddr, &insn, 5); if (err) return err; return __is_optimized((uprobe_opcode_t *)&insn, vaddr); } static bool should_optimize(struct arch_uprobe *auprobe) { return 
!test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) && test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); } int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { if (should_optimize(auprobe)) { /* * We could race with another thread that already optimized the probe, * so let's not overwrite it with int3 again in this case. */ int ret = is_optimized(vma->vm_mm, vaddr); if (ret < 0) return ret; if (ret) return 0; } return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true /* is_register */); } int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { int ret = is_optimized(vma->vm_mm, vaddr); if (ret < 0) return ret; if (ret) { ret = swbp_unoptimize(auprobe, vma, vaddr); WARN_ON_ONCE(ret); return ret; } } return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, false /* is_register */); } static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { struct uprobe_trampoline *tramp; struct vm_area_struct *vma; bool new = false; int err = 0; vma = find_vma(mm, vaddr); if (!vma) return -EINVAL; tramp = get_uprobe_trampoline(vaddr, &new); if (!tramp) return -EINVAL; err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr); if (WARN_ON_ONCE(err) && new) destroy_uprobe_trampoline(tramp); return err; } void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) { struct mm_struct *mm = current->mm; uprobe_opcode_t insn[5]; if (!should_optimize(auprobe)) return; mmap_write_lock(mm); /* * Check if some other thread already optimized the uprobe for us, * if it's the case just go away silently. */ if (copy_from_vaddr(mm, vaddr, &insn, 5)) goto unlock; if (!is_swbp_insn((uprobe_opcode_t*) &insn)) goto unlock; /* * If we fail to optimize the uprobe we set the fail bit so the * above should_optimize will fail from now on. 
*/ if (__arch_uprobe_optimize(auprobe, mm, vaddr)) set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags); unlock: mmap_write_unlock(mm); } static bool can_optimize(struct insn *insn, unsigned long vaddr) { if (!insn->x86_64 || insn->length != 5) return false; if (!insn_is_nop(insn)) return false; /* We can't do cross page atomic writes yet. */ return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; } #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit */ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { } static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } static bool can_optimize(struct insn *insn, unsigned long vaddr) { return false; } #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { bool (*emulate)(struct arch_uprobe *, struct pt_regs *); int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); int (*post_xol)(struct arch_uprobe *, struct pt_regs *); void (*abort)(struct arch_uprobe *, struct pt_regs *); }; static inline int sizeof_long(struct pt_regs *regs) { /* * Check registers for mode as in_xxx_syscall() does not apply here. */ return user_64bit_mode(regs) ? 8 : 4; } static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { riprel_pre_xol(auprobe, regs); return 0; } static int emulate_push_stack(struct pt_regs *regs, unsigned long val) { unsigned long new_sp = regs->sp - sizeof_long(regs); if (copy_to_user((void __user *)new_sp, &val, sizeof_long(regs))) return -EFAULT; regs->sp = new_sp; return 0; } /* * We have to fix things up as follows: * * Typically, the new ip is relative to the copied instruction. We need * to make it relative to the original instruction (FIX_IP). Exceptions * are return instructions and absolute or indirect jump or call instructions. 
*
 * If the single-stepped instruction was a call, the return address that
 * is atop the stack is the address following the copied instruction. We
 * need to make it the address following the original instruction (FIX_CALL).
 *
 * If the original instruction was a rip-relative instruction such as
 * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
 * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)".
 * We need to restore the contents of the scratch register
 * (FIX_RIP_reg).
 */
static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	struct uprobe_task *utask = current->utask;

	riprel_post_xol(auprobe, regs);
	if (auprobe->defparam.fixups & UPROBE_FIX_IP) {
		/* Shift ip from the out-of-line copy back to the original location. */
		long correction = utask->vaddr - utask->xol_vaddr;
		regs->ip += correction;
	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
		regs->sp += sizeof_long(regs); /* Pop incorrect return address */
		if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
			return -ERESTART;
	}
	/* popf; tell the caller to not touch TF */
	if (auprobe->defparam.fixups & UPROBE_FIX_SETF)
		utask->autask.saved_tf = true;

	return 0;
}

/* Default ->abort: only the rip-relative fixup is undone. */
static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	riprel_post_xol(auprobe, regs);
}

/* Ops for instructions that are always executed out of line (no ->emulate). */
static const struct uprobe_xol_ops default_xol_ops = {
	.pre_xol  = default_pre_xol_op,
	.post_xol = default_post_xol_op,
	.abort	  = default_abort_op,
};

/* True when the recorded opcode is 0xe8 (call relative). */
static bool branch_is_call(struct arch_uprobe *auprobe)
{
	return auprobe->branch.opc1 == 0xe8;
}

/*
 * Table of conditional jump opcodes. Each COND(op_y, op_n, expr) entry
 * expands to two case labels: 0x<op_y>, whose DO() argument is true when
 * expr is non-zero, and 0x<op_n>, whose DO() argument is true when expr is
 * zero. The user of the table defines DO() to say what each case does, and
 * XF() reads a local variable named 'flags'.
 */
#define CASE_COND					\
	COND(70, 71, XF(OF))				\
	COND(72, 73, XF(CF))				\
	COND(74, 75, XF(ZF))				\
	COND(78, 79, XF(SF))				\
	COND(7a, 7b, XF(PF))				\
	COND(76, 77, XF(CF) || XF(ZF))			\
	COND(7c, 7d, XF(SF) != XF(OF))			\
	COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF))

#define COND(op_y, op_n, expr)				\
	case 0x ## op_y: DO((expr) != 0)		\
	case 0x ## op_n: DO((expr) == 0)

#define XF(xf)	(!!(flags & X86_EFLAGS_ ## xf))

/* True when @opcode is one of the opcodes listed in CASE_COND. */
static bool is_cond_jmp_opcode(u8 opcode)
{
	switch (opcode) {
	/* The condition is irrelevant here: every listed opcode yields true. */
	#define DO(expr)	\
		return true;
	CASE_COND
	#undef	DO

	default:
		return false;
	}
}

/*
 * Evaluate the jump condition of the recorded opcode against regs->flags.
 * Opcodes not listed in CASE_COND are treated as always taken.
 */
static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	unsigned long flags = regs->flags;

	switch (auprobe->branch.opc1) {
	#define DO(expr)	\
		return expr;
	CASE_COND
	#undef	DO

	default:	/* not a conditional jmp */
		return true;
	}
}

#undef	XF
#undef	COND
#undef	CASE_COND

/*
 * ->emulate for relative jmp/call/conditional jmp: advance ip past the
 * instruction, push the return address for a call, and add the branch
 * offset unless a conditional jump is not taken. Returns false only when
 * the return address of a call could not be pushed; ip has already been
 * advanced in that case.
 */
static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	unsigned long new_ip = regs->ip += auprobe->branch.ilen;
	unsigned long offs = (long)auprobe->branch.offs;

	if (branch_is_call(auprobe)) {
		/*
		 * If it fails we execute this (mangled, see the comment in
		 * branch_clear_offset) insn out-of-line. In the likely case
		 * this should trigger the trap, and the probed application
		 * should die or restart the same insn after it handles the
		 * signal, arch_uprobe_post_xol() won't be even called.
		 *
		 * But there is corner case, see the comment in ->post_xol().
		 */
		if (emulate_push_stack(regs, new_ip))
			return false;
	} else if (!check_jmp_cond(auprobe, regs)) {
		/* Jump not taken: fall through to the next instruction. */
		offs = 0;
	}

	regs->ip = new_ip + offs;
	return true;
}

/*
 * ->emulate for register push: push the register found at
 * auprobe->push.reg_offset inside *regs and step ip over the instruction.
 * Returns false, leaving ip untouched, when the stack write fails.
 */
static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset;

	if (emulate_push_stack(regs, *src_ptr))
		return false;
	regs->ip += auprobe->push.ilen;
	return true;
}

/* ->post_xol for branches; always returns -ERESTART after restoring sp. */
static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	BUG_ON(!branch_is_call(auprobe));
	/*
	 * We can only get here if branch_emulate_op() failed to push the ret
	 * address _and_ another thread expanded our stack before the (mangled)
	 * "call" insn was executed out-of-line. Just restore ->sp and restart.
	 * We could also restore ->ip and try to call branch_emulate_op() again.
	 */
	regs->sp += sizeof_long(regs);
	return -ERESTART;
}

/* Zero the immediate (branch offset) bytes of the saved instruction copy. */
static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
{
	/*
	 * Turn this insn into "call 1f; 1:", this is what we will execute
	 * out-of-line if ->emulate() fails. We only need this to generate
	 * a trap, so that the probed task receives the correct signal with
	 * the properly filled siginfo.
	 *
	 * But see the comment in ->post_xol(), in the unlikely case it can
	 * succeed. So we need to ensure that the new ->ip can not fall into
	 * the non-canonical area and trigger #GP.
	 *
	 * We could turn it into (say) "pushf", but then we would need to
	 * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte
	 * of ->insn[] for set_orig_insn().
	 */
	memset(auprobe->insn + insn_offset_immediate(insn),
		0, insn->immediate.nbytes);
}

/* Ops for emulated branches; ->post_xol only runs if emulation failed. */
static const struct uprobe_xol_ops branch_xol_ops = {
	.emulate  = branch_emulate_op,
	.post_xol = branch_post_xol_op,
};

/* Ops for emulated register pushes. */
static const struct uprobe_xol_ops push_xol_ops = {
	.emulate  = push_emulate_op,
};

/*
 * Returns -ENOSYS if branch_xol_ops doesn't handle this insn
 *
 * Otherwise returns 0 after recording opcode, length and offset in
 * auprobe->branch, or -ENOTSUPP for a branch carrying a 0x66 prefix.
 * A nop is accepted without the opcode and prefix checks.
 */
static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
	u8 opc1 = OPCODE1(insn);
	insn_byte_t p;

	if (insn_is_nop(insn))
		goto setup;

	switch (opc1) {
	case 0xeb:	/* jmp 8 */
	case 0xe9:	/* jmp 32 */
		break;

	case 0xe8:	/* call relative */
		branch_clear_offset(auprobe, insn);
		break;

	case 0x0f:
		if (insn->opcode.nbytes != 2)
			return -ENOSYS;
		/*
		 * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches
		 * OPCODE1() of the "short" jmp which checks the same condition.
		 */
		opc1 = OPCODE2(insn) - 0x10;
		fallthrough;
	default:
		if (!is_cond_jmp_opcode(opc1))
			return -ENOSYS;
	}

	/*
	 * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported.
	 * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
	 * No one uses these insns, reject any branch insns with such prefix.
	 */
	for_each_insn_prefix(insn, p) {
		if (p == 0x66)
			return -ENOTSUPP;
	}

setup:
	auprobe->branch.opc1 = opc1;
	auprobe->branch.ilen = insn->length;
	auprobe->branch.offs = insn->immediate.value;

	auprobe->ops = &branch_xol_ops;
	return 0;
}

/*
 * Returns -ENOSYS if push_xol_ops doesn't handle this insn
 *
 * Handled are opcodes 0x50..0x57, either alone (1 byte) or, on
 * CONFIG_X86_64, preceded by the single rex prefix 0x41 (2 bytes). On
 * success the pt_regs offset of the pushed register and the instruction
 * length are recorded in auprobe->push and 0 is returned.
 */
static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
	u8 opc1 = OPCODE1(insn), reg_offset = 0;

	if (opc1 < 0x50 || opc1 > 0x57)
		return -ENOSYS;

	if (insn->length > 2)
		return -ENOSYS;
	if (insn->length == 2) {
		/* only support rex_prefix 0x41 (x64 only) */
#ifdef CONFIG_X86_64
		if (insn->rex_prefix.nbytes != 1 ||
		    insn->rex_prefix.bytes[0] != 0x41)
			return -ENOSYS;

		/* With the 0x41 prefix, 0x50..0x57 select r8..r15. */
		switch (opc1) {
		case 0x50:
			reg_offset = offsetof(struct pt_regs, r8);
			break;
		case 0x51:
			reg_offset = offsetof(struct pt_regs, r9);
			break;
		case 0x52:
			reg_offset = offsetof(struct pt_regs, r10);
			break;
		case 0x53:
			reg_offset = offsetof(struct pt_regs, r11);
			break;
		case 0x54:
			reg_offset = offsetof(struct pt_regs, r12);
			break;
		case 0x55:
			reg_offset = offsetof(struct pt_regs, r13);
			break;
		case 0x56:
			reg_offset = offsetof(struct pt_regs, r14);
			break;
		case 0x57:
			reg_offset = offsetof(struct pt_regs, r15);
			break;
		}
#else
		return -ENOSYS;
#endif
	} else {
		/* Without a prefix, 0x50..0x57 select ax, cx, dx, bx, sp, bp, si, di. */
		switch (opc1) {
		case 0x50:
			reg_offset = offsetof(struct pt_regs, ax);
			break;
		case 0x51:
			reg_offset = offsetof(struct pt_regs, cx);
			break;
		case 0x52:
			reg_offset = offsetof(struct pt_regs, dx);
			break;
		case 0x53:
			reg_offset = offsetof(struct pt_regs, bx);
			break;
		case 0x54:
			reg_offset = offsetof(struct pt_regs, sp);
			break;
		case 0x55:
			reg_offset = offsetof(struct pt_regs, bp);
			break;
		case 0x56:
			reg_offset = offsetof(struct pt_regs, si);
			break;
		case 0x57:
			reg_offset = offsetof(struct pt_regs, di);
			break;
		}
	}

	auprobe->push.reg_offset = reg_offset;
	auprobe->push.ilen = insn->length;
	auprobe->ops = &push_xol_ops;
	return 0;
}

/**
 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
 * @auprobe: the probepoint information.
* @mm: the probed address space.
 * @addr: virtual address at which to install the probepoint
 * Return 0 on success or a -ve number on error.
 */
int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
{
	struct insn insn;
	u8 fixup = UPROBE_FIX_IP;
	int err;

	err = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
	if (err)
		return err;

	if (can_optimize(&insn, addr))
		set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);

	/*
	 * Give the emulating handlers the first shot: branches, then pushes.
	 * Anything other than -ENOSYS (success or a hard error) is final.
	 */
	err = branch_setup_xol_ops(auprobe, &insn);
	if (err == -ENOSYS)
		err = push_setup_xol_ops(auprobe, &insn);
	if (err != -ENOSYS)
		return err;

	/*
	 * Neither handler took the insn, so it will run out of line with the
	 * default ops. Work out what default_post_xol_op() has to fix up
	 * afterwards; the ip correction is assumed unless an opcode below
	 * says otherwise.
	 */
	switch (OPCODE1(&insn)) {
	case 0x9d:
		/* popf */
		auprobe->defparam.fixups |= UPROBE_FIX_SETF;
		break;

	case 0xc2:
	case 0xc3:
	case 0xca:
	case 0xcb:
		/* ret or lret -- ip is correct */
	case 0xea:
		/* jmp absolute -- ip is correct */
		fixup = 0;
		break;

	case 0x9a:
		/* call absolute - Fix return addr, not ip */
		fixup = UPROBE_FIX_CALL;
		break;

	case 0xff:
		switch (MODRM_REG(&insn)) {
		case 2:
		case 3:
			/* call or lcall, indirect */
			fixup = UPROBE_FIX_CALL;
			break;
		case 4:
		case 5:
			/* jmp or ljmp, indirect */
			fixup = 0;
			break;
		}
		/* 0xff insns still get the rip-relative analysis below. */
		fallthrough;
	default:
		riprel_analyze(auprobe, &insn);
	}

	auprobe->ops = &default_xol_ops;
	auprobe->defparam.fixups |= fixup;
	auprobe->defparam.ilen = insn.length;

	return 0;
}

/*
 * arch_uprobe_pre_xol - prepare to execute out of line.
 * @auprobe: the probepoint information.
 * @regs: reflects the saved user state of current task.
*/ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (auprobe->ops->pre_xol) { int err = auprobe->ops->pre_xol(auprobe, regs); if (err) return err; } regs->ip = utask->xol_vaddr; utask->autask.saved_trap_nr = current->thread.trap_nr; current->thread.trap_nr = UPROBE_TRAP_NR; utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF); regs->flags |= X86_EFLAGS_TF; if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) set_task_blockstep(current, false); return 0; } /* * If xol insn itself traps and generates a signal(Say, * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped * instruction jumps back to its own address. It is assumed that anything * like do_page_fault/do_trap/etc sets thread.trap_nr != -1. * * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr, * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol(). */ bool arch_uprobe_xol_was_trapped(struct task_struct *t) { if (t->thread.trap_nr != UPROBE_TRAP_NR) return true; return false; } /* * Called after single-stepping. To avoid the SMP problems that can * occur when we temporarily put back the original opcode to * single-step, we single-stepped a copy of the instruction. * * This function prepares to resume execution after the single-step. */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; bool send_sigtrap = utask->autask.saved_tf; int err = 0; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); current->thread.trap_nr = utask->autask.saved_trap_nr; if (auprobe->ops->post_xol) { err = auprobe->ops->post_xol(auprobe, regs); if (err) { /* * Restore ->ip for restart or post mortem analysis. * ->post_xol() must not return -ERESTART unless this * is really possible. 
*/ regs->ip = utask->vaddr; if (err == -ERESTART) err = 0; send_sigtrap = false; } } /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP * so we can get an extra SIGTRAP if we do not clear TF. We need * to examine the opcode to make it right. */ if (send_sigtrap) send_sig(SIGTRAP, current, 0); if (!utask->autask.saved_tf) regs->flags &= ~X86_EFLAGS_TF; return err; } /* callback routine for handling exceptions. */ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data) { struct die_args *args = data; struct pt_regs *regs = args->regs; int ret = NOTIFY_DONE; /* We are only interested in userspace traps */ if (regs && !user_mode(regs)) return NOTIFY_DONE; switch (val) { case DIE_INT3: if (uprobe_pre_sstep_notifier(regs)) ret = NOTIFY_STOP; break; case DIE_DEBUG: if (uprobe_post_sstep_notifier(regs)) ret = NOTIFY_STOP; break; default: break; } return ret; } /* * This function gets called when XOL instruction either gets trapped or * the thread has a fatal signal. Reset the instruction pointer to its * probed address for the potential restart or for post mortem analysis. 
*/ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (auprobe->ops->abort) auprobe->ops->abort(auprobe, regs); current->thread.trap_nr = utask->autask.saved_trap_nr; regs->ip = utask->vaddr; /* clear TF if it was set by us in arch_uprobe_pre_xol() */ if (!utask->autask.saved_tf) regs->flags &= ~X86_EFLAGS_TF; } static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (auprobe->ops->emulate) return auprobe->ops->emulate(auprobe, regs); return false; } bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { bool ret = __skip_sstep(auprobe, regs); if (ret && (regs->flags & X86_EFLAGS_TF)) send_sig(SIGTRAP, current, 0); return ret; } unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { int rasize = sizeof_long(regs), nleft; unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) return -1; /* check whether address has been already hijacked */ if (orig_ret_vaddr == trampoline_vaddr) return orig_ret_vaddr; nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); if (likely(!nleft)) { if (shstk_update_last_frame(trampoline_vaddr)) { force_sig(SIGSEGV); return -1; } return orig_ret_vaddr; } if (nleft != rasize) { pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n", current->pid, regs->sp, regs->ip); force_sig(SIGSEGV); } return -1; } bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs) { if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */ return regs->sp < ret->stack; else return regs->sp <= ret->stack; } /* * Heuristic-based check if uprobe is installed at the function entry. * * Under assumption of user code being compiled with frame pointers, * `push %rbp/%ebp` is a good indicator that we indeed are. 
*
 * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
 * If we get this wrong, captured stack trace might have one extra bogus
 * entry, but the rest of stack trace will still be meaningful.
 */
bool is_uprobe_at_func_entry(struct pt_regs *regs)
{
	struct uprobe_task *utask = current->utask;
	struct arch_uprobe *probe = utask ? utask->auprobe : NULL;

	/* No uprobe task state, or no probe currently attached to it. */
	if (!probe)
		return false;

	/*
	 * Either of two first instructions counts as a function entry:
	 *  - push %rbp/%ebp
	 *  - endbr64 (64-bit only)
	 */
	return probe->insn[0] == 0x55 ||
	       (user_64bit_mode(regs) && is_endbr((u32 *)probe->insn));
}

#ifdef CONFIG_IA32_EMULATION
/* Pick an unmapped page-sized area for the XOL slots and return its address. */
unsigned long arch_uprobe_get_xol_area(void)
{
	struct thread_info *info = current_thread_info();
	unsigned long area;

	/*
	 * HACK: we are not in a syscall, but x86 get_unmapped_area() paths
	 * ignore TIF_ADDR32 and rely on in_32bit_syscall() to calculate
	 * vm_unmapped_area_info.high_limit.
	 *
	 * The #ifdef above doesn't cover the CONFIG_X86_X32_ABI=y case,
	 * but in this case in_32bit_syscall() -> in_x32_syscall() always
	 * (falsely) returns true because ->orig_ax == -1.
	 */
	if (test_thread_flag(TIF_ADDR32))
		info->status |= TS_COMPAT;

	area = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);

	/* Always drop the temporary marker again, whether or not it was set above. */
	info->status &= ~TS_COMPAT;

	return area;
}
#endif
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 /* SPDX-License-Identifier: GPL-2.0 */ /* * Device core Trace Support * Copyright (C) 2021, Intel Corporation * * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM dev #if !defined(__DEV_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define __DEV_TRACE_H #include <linux/device.h> #include <linux/tracepoint.h> #include <linux/types.h> DECLARE_EVENT_CLASS(devres, TP_PROTO(struct device *dev, const char *op, void *node, const char *name, size_t size), TP_ARGS(dev, op, node, name, size), TP_STRUCT__entry( __string(devname, dev_name(dev)) __field(struct device *, dev) __field(const char *, op) __field(void *, node) __string(name, name) __field(size_t, size) ), TP_fast_assign( __assign_str(devname); __entry->op = op; __entry->node = node; __assign_str(name); __entry->size = size; ), TP_printk("%s %3s %p %s (%zu bytes)", __get_str(devname), __entry->op, __entry->node, __get_str(name), __entry->size) ); DEFINE_EVENT(devres, devres_log, TP_PROTO(struct device *dev, const char *op, void *node, const char *name, size_t size), TP_ARGS(dev, op, node, name, size) ); #endif /* __DEV_TRACE_H */ /* this part has to be here */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
787 787 787 787 786 8 8 4 786 786 786 786 781 767 1 1 1 706 706 29 688 784 784 783 784 784 760 674 6 670 6 667 674 682 675 675 671 673 681 681 1 1 13 11 2 1 2 1 1 24 28 28 33 4 30 29 30 1 28 29 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 
478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 
978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 
1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 
1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 
2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 
2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 
2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 
3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 
3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/base.c * * Copyright (C) 1991, 1992 Linus Torvalds * * proc base directory handling functions * * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. * Instead of using magical inumbers to determine the kind of object * we allocate and fill in-core inodes upon lookup. They don't even * go into icache. We cache the reference to task_struct upon lookup too. * Eventually it should become a filesystem in its own. We don't use the * rest of procfs anymore. 
 *
 *
 *  Changelog:
 *  17-Jan-2005
 *  Allan Bezerra
 *  Bruna Moreira <bruna.moreira@indt.org.br>
 *  Edjard Mota <edjard.mota@indt.org.br>
 *  Ilias Biris <ilias.biris@indt.org.br>
 *  Mauricio Lin <mauricio.lin@indt.org.br>
 *
 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
 *
 *  A new process specific entry (smaps) included in /proc. It shows the
 *  size of rss for each memory area. The maps entry lacks information
 *  about physical memory size (rss) for each mapped file, i.e.,
 *  rss information for executables and library files.
 *  This additional information is useful for any tools that need to know
 *  about physical memory consumption for a process specific library.
 *
 *  Changelog:
 *  21-Feb-2005
 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
 *  Pud inclusion in the page table walking.
 *
 *  ChangeLog:
 *  10-Mar-2005
 *  10LE Instituto Nokia de Tecnologia - INdT:
 *  A better way to walk through the page table as suggested by Hugh Dickins.
 *
 *  Simo Piiroinen <simo.piiroinen@nokia.com>:
 *  Smaps information related to shared, private, clean and dirty pages.
 *
 *  Paul Mundt <paul.mundt@nokia.com>:
 *  Overall revision about smaps.
*/ #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/task_io_accounting_ops.h> #include <linux/init.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/generic-radix-tree.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/namei.h> #include <linux/mnt_namespace.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/rcupdate.h> #include <linux/kallsyms.h> #include <linux/stacktrace.h> #include <linux/resource.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/printk.h> #include <linux/cache.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> #include <linux/poll.h> #include <linux/nsproxy.h> #include <linux/oom.h> #include <linux/elf.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/fs_parser.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/debug.h> #include <linux/sched/stat.h> #include <linux/posix-timers.h> #include <linux/time_namespace.h> #include <linux/resctrl.h> #include <linux/cn_proc.h> #include <linux/ksm.h> #include <uapi/linux/lsm.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" #include "../../lib/kstrtox.h" /* NOTE: * Implementing inode permission operations in /proc is almost * certainly an error. Permission checks need to happen during * each system call not at open time. The reason is that most of * what we wish to check for permissions in /proc varies at runtime. * * The classic example of a problem is opening file descriptors * in /proc for a task before it execs a suid executable. 
*/ static u8 nlink_tid __ro_after_init; static u8 nlink_tgid __ro_after_init; enum proc_mem_force { PROC_MEM_FORCE_ALWAYS, PROC_MEM_FORCE_PTRACE, PROC_MEM_FORCE_NEVER }; static enum proc_mem_force proc_mem_force_override __ro_after_init = IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER : IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE : PROC_MEM_FORCE_ALWAYS; static const struct constant_table proc_mem_force_table[] __initconst = { { "always", PROC_MEM_FORCE_ALWAYS }, { "ptrace", PROC_MEM_FORCE_PTRACE }, { "never", PROC_MEM_FORCE_NEVER }, { } }; static int __init early_proc_mem_force_override(char *buf) { if (!buf) return -EINVAL; /* * lookup_constant() defaults to proc_mem_force_override to preseve * the initial Kconfig choice in case an invalid param gets passed. */ proc_mem_force_override = lookup_constant(proc_mem_force_table, buf, proc_mem_force_override); return 0; } early_param("proc_mem.force_override", early_proc_mem_force_override); struct pid_entry { const char *name; unsigned int len; umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; union proc_op op; }; #define NOD(NAME, MODE, IOP, FOP, OP) { \ .name = (NAME), \ .len = sizeof(NAME) - 1, \ .mode = MODE, \ .iop = IOP, \ .fop = FOP, \ .op = OP, \ } #define DIR(NAME, MODE, iops, fops) \ NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) #define LNK(NAME, get_link) \ NOD(NAME, (S_IFLNK|S_IRWXUGO), \ &proc_pid_link_inode_operations, NULL, \ { .proc_get_link = get_link } ) #define REG(NAME, MODE, fops) \ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) #define ONE(NAME, MODE, show) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ { .proc_show = show } ) #define ATTR(LSMID, NAME, MODE) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_pid_attr_operations, \ { .lsmid = LSMID }) /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. 
*/ static unsigned int __init pid_entry_nlink(const struct pid_entry *entries, unsigned int n) { unsigned int i; unsigned int count; count = 2; for (i = 0; i < n; ++i) { if (S_ISDIR(entries[i].mode)) ++count; } return count; } static int get_task_root(struct task_struct *task, struct path *root) { int result = -ENOENT; task_lock(task); if (task->fs) { get_fs_root(task->fs, root); result = 0; } task_unlock(task); return result; } static int proc_cwd_link(struct dentry *dentry, struct path *path) { struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { task_lock(task); if (task->fs) { get_fs_pwd(task->fs, path); result = 0; } task_unlock(task); put_task_struct(task); } return result; } static int proc_root_link(struct dentry *dentry, struct path *path) { struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { result = get_task_root(task, path); put_task_struct(task); } return result; } /* * If the user used setproctitle(), we just get the string from * user space at arg_start, and limit it to a maximum of one page. */ static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf, size_t count, unsigned long pos, unsigned long arg_start) { char *page; int ret, got; if (pos >= PAGE_SIZE) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; ret = 0; got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON); if (got > 0) { int len = strnlen(page, got); /* Include the NUL character if it was found */ if (len < got) len++; if (len > pos) { len -= pos; if (len > count) len = count; len -= copy_to_user(buf, page+pos, len); if (!len) len = -EFAULT; ret = len; } } free_page((unsigned long)page); return ret; } static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, size_t count, loff_t *ppos) { unsigned long arg_start, arg_end, env_start, env_end; unsigned long pos, len; char *page, c; /* Check if process spawned far enough to have cmdline. 
*/ if (!mm->env_end) return 0; spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); if (arg_start >= arg_end) return 0; /* * We allow setproctitle() to overwrite the argument * strings, and overflow past the original end. But * only when it overflows into the environment area. */ if (env_start != arg_end || env_end < env_start) env_start = env_end = arg_end; len = env_end - arg_start; /* We're not going to care if "*ppos" has high bits set */ pos = *ppos; if (pos >= len) return 0; if (count > len - pos) count = len - pos; if (!count) return 0; /* * Magical special case: if the argv[] end byte is not * zero, the user has overwritten it with setproctitle(3). * * Possible future enhancement: do this only once when * pos is 0, and set a flag in the 'struct file'. */ if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c) return get_mm_proctitle(mm, buf, count, pos, arg_start); /* * For the non-setproctitle() case we limit things strictly * to the [arg_start, arg_end[ range. 
*/ pos += arg_start; if (pos < arg_start || pos >= arg_end) return 0; if (count > arg_end - pos) count = arg_end - pos; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; len = 0; while (count) { int got; size_t size = min_t(size_t, PAGE_SIZE, count); got = access_remote_vm(mm, pos, page, size, FOLL_ANON); if (got <= 0) break; got -= copy_to_user(buf, page, got); if (unlikely(!got)) { if (!len) len = -EFAULT; break; } pos += got; buf += got; len += got; count -= got; } free_page((unsigned long)page); return len; } static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf, size_t count, loff_t *pos) { struct mm_struct *mm; ssize_t ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = get_mm_cmdline(mm, buf, count, pos); mmput(mm); return ret; } static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct task_struct *tsk; ssize_t ret; BUG_ON(*pos < 0); tsk = get_proc_task(file_inode(file)); if (!tsk) return -ESRCH; ret = get_task_cmdline(tsk, buf, count, pos); put_task_struct(tsk); if (ret > 0) *pos += ret; return ret; } static const struct file_operations proc_pid_cmdline_ops = { .read = proc_pid_cmdline_read, .llseek = generic_file_llseek, }; #ifdef CONFIG_KALLSYMS /* * Provides a wchan file via kallsyms in a proper one-value-per-file format. * Returns the resolved symbol to user space. 
*/ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; char symname[KSYM_NAME_LEN]; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto print0; wchan = get_wchan(task); if (wchan && !lookup_symbol_name(wchan, symname)) { seq_puts(m, symname); return 0; } print0: seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ static int lock_trace(struct task_struct *task) { int err = down_read_killable(&task->signal->exec_update_lock); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { up_read(&task->signal->exec_update_lock); return -EPERM; } return 0; } static void unlock_trace(struct task_struct *task) { up_read(&task->signal->exec_update_lock); } #ifdef CONFIG_STACKTRACE #define MAX_STACK_TRACE_DEPTH 64 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long *entries; int err; /* * The ability to racily run the kernel stack unwinder on a running task * and then observe the unwinder output is scary; while it is useful for * debugging kernel issues, it can also allow an attacker to leak kernel * stack contents. * Doing this in a manner that is at least safe from races would require * some work to ensure that the remote task can not be scheduled; and * even then, this would still expose the unwinder as local attack * surface. * Therefore, this interface is restricted to root. 
*/ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) return -EACCES; entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_KERNEL); if (!entries) return -ENOMEM; err = lock_trace(task); if (!err) { unsigned int i, nr_entries; nr_entries = stack_trace_save_tsk(task, entries, MAX_STACK_TRACE_DEPTH, 0); for (i = 0; i < nr_entries; i++) { seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); } unlock_trace(task); } kfree(entries); return err; } #endif #ifdef CONFIG_SCHED_INFO /* * Provides /proc/PID/schedstat */ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { if (unlikely(!sched_info_on())) seq_puts(m, "0 0 0\n"); else seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); return 0; } #endif #ifdef CONFIG_LATENCYTOP static int lstats_show_proc(struct seq_file *m, void *v) { int i; struct inode *inode = m->private; struct task_struct *task = get_proc_task(inode); if (!task) return -ESRCH; seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < LT_SAVECOUNT; i++) { struct latency_record *lr = &task->latency_record[i]; if (lr->backtrace[0]) { int q; seq_printf(m, "%i %li %li", lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; if (!bt) break; seq_printf(m, " %ps", (void *)bt); } seq_putc(m, '\n'); } } put_task_struct(task); return 0; } static int lstats_open(struct inode *inode, struct file *file) { return single_open(file, lstats_show_proc, inode); } static ssize_t lstats_write(struct file *file, const char __user *buf, size_t count, loff_t *offs) { struct task_struct *task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; clear_tsk_latency_tracing(task); put_task_struct(task); return count; } static const struct file_operations proc_lstats_operations = { .open = lstats_open, .read = seq_read, .write = 
lstats_write, .llseek = seq_lseek, .release = single_release, }; #endif static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; long badness; badness = oom_badness(task, totalpages); /* * Special case OOM_SCORE_ADJ_MIN for all others scale the * badness value into [0, 2000] range which we have been * exporting for a long time so userspace might depend on it. */ if (badness != LONG_MIN) points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3; seq_printf(m, "%lu\n", points); return 0; } struct limit_names { const char *name; const char *unit; }; static const struct limit_names lnames[RLIM_NLIMITS] = { [RLIMIT_CPU] = {"Max cpu time", "seconds"}, [RLIMIT_FSIZE] = {"Max file size", "bytes"}, [RLIMIT_DATA] = {"Max data size", "bytes"}, [RLIMIT_STACK] = {"Max stack size", "bytes"}, [RLIMIT_CORE] = {"Max core file size", "bytes"}, [RLIMIT_RSS] = {"Max resident set", "bytes"}, [RLIMIT_NPROC] = {"Max processes", "processes"}, [RLIMIT_NOFILE] = {"Max open files", "files"}, [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"}, [RLIMIT_AS] = {"Max address space", "bytes"}, [RLIMIT_LOCKS] = {"Max file locks", "locks"}, [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"}, [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"}, [RLIMIT_NICE] = {"Max nice priority", NULL}, [RLIMIT_RTPRIO] = {"Max realtime priority", NULL}, [RLIMIT_RTTIME] = {"Max realtime timeout", "us"}, }; /* Display limits for a process */ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned int i; unsigned long flags; struct rlimit rlim[RLIM_NLIMITS]; if (!lock_task_sighand(task, &flags)) return 0; memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); unlock_task_sighand(task, &flags); /* * print the file header */ seq_puts(m, "Limit " "Soft Limit " "Hard Limit " "Units 
\n"); for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) seq_printf(m, "%-25s %-20s ", lnames[i].name, "unlimited"); else seq_printf(m, "%-25s %-20lu ", lnames[i].name, rlim[i].rlim_cur); if (rlim[i].rlim_max == RLIM_INFINITY) seq_printf(m, "%-20s ", "unlimited"); else seq_printf(m, "%-20lu ", rlim[i].rlim_max); if (lnames[i].unit) seq_printf(m, "%-10s\n", lnames[i].unit); else seq_putc(m, '\n'); } return 0; } #ifdef CONFIG_HAVE_ARCH_TRACEHOOK static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct syscall_info info; u64 *args = &info.data.args[0]; int res; res = lock_trace(task); if (res) return res; if (task_current_syscall(task, &info)) seq_puts(m, "running\n"); else if (info.data.nr < 0) seq_printf(m, "%d 0x%llx 0x%llx\n", info.data.nr, info.sp, info.data.instruction_pointer); else seq_printf(m, "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n", info.data.nr, args[0], args[1], args[2], args[3], args[4], args[5], info.sp, info.data.instruction_pointer); unlock_trace(task); return 0; } #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ /************************************************************************/ /* Here the fs part begins */ /************************************************************************/ /* permission checks */ static bool proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; bool allowed = false; /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information. 
*/
	task = get_proc_task(inode);
	if (task) {
		/* The check actually performed is PTRACE_MODE_READ_FSCREDS. */
		allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
		put_task_struct(task);
	}
	return allowed;
}

/*
 * setattr handler that refuses mode changes.
 *
 * Returns -EPERM if @attr asks for ATTR_MODE, the setattr_prepare() error
 * if validation fails, and 0 after copying the attributes into the inode.
 * The @idmap argument is not used; nop_mnt_idmap is passed down instead.
 */
int proc_nochmod_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
		 struct iattr *attr)
{
	int error;
	struct inode *inode = d_inode(dentry);

	if (attr->ia_valid & ATTR_MODE)
		return -EPERM;

	error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
	if (error)
		return error;

	setattr_copy(&nop_mnt_idmap, inode, attr);
	return 0;
}

/*
 * May current process learn task's sched/cmdline info (for hide_pid_min=1)
 * or euid/egid (for hide_pid_min=2)?
 *
 * Checks are made in this order: a HIDEPID_NOT_PTRACEABLE mount always
 * requires the ptrace check; otherwise access is granted when the mount's
 * hide_pid level is below @hide_pid_min or the caller is in the mount's
 * pid_gid group; failing both, the ptrace check decides.
 */
static bool has_pid_permissions(struct proc_fs_info *fs_info,
				 struct task_struct *task,
				 enum proc_hidepid hide_pid_min)
{
	/*
	 * If 'hidepid' mount option is set force a ptrace check,
	 * we indicate that we are using a filesystem syscall
	 * by passing PTRACE_MODE_READ_FSCREDS
	 */
	if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE)
		return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);

	if (fs_info->hide_pid < hide_pid_min)
		return true;
	if (in_group_p(fs_info->pid_gid))
		return true;
	return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
}

/*
 * ->permission() handler for a per-task inode.
 *
 * Returns -ESRCH if the task is gone. When has_pid_permissions() denies
 * access at the HIDEPID_NO_ACCESS level the result is -ENOENT on a
 * HIDEPID_INVISIBLE mount and -EPERM otherwise. If access is allowed the
 * decision is left to generic_permission().
 */
static int proc_pid_permission(struct mnt_idmap *idmap,
			       struct inode *inode, int mask)
{
	struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
	struct task_struct *task;
	bool has_perms;

	task = get_proc_task(inode);
	if (!task)
		return -ESRCH;
	has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
	put_task_struct(task);

	if (!has_perms) {
		if (fs_info->hide_pid == HIDEPID_INVISIBLE) {
			/*
			 * Let's make getdents(), stat(), and open()
			 * consistent with each other.  If a process
			 * may not stat() a file, it shouldn't be seen
			 * in procfs at all.
*/ return -ENOENT; } return -EPERM; } return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_def_inode_operations = { .setattr = proc_nochmod_setattr, }; static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct pid *pid = proc_pid(inode); struct task_struct *task; int ret; task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; ret = PROC_I(inode)->op.proc_show(m, ns, pid, task); put_task_struct(task); return ret; } static int proc_single_open(struct inode *inode, struct file *filp) { return single_open(filp, proc_single_show, inode); } static const struct file_operations proc_single_file_operations = { .open = proc_single_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; /* * proc_mem_open() can return errno, NULL or mm_struct*. * * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING) * - Returns mm_struct* on success * - Returns error code on failure */ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); struct mm_struct *mm; if (!task) return ERR_PTR(-ESRCH); mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (IS_ERR(mm)) return mm == ERR_PTR(-ESRCH) ? NULL : mm; /* ensure this mm_struct can't be freed */ mmgrab(mm); /* but do not pin its memory */ mmput(mm); return mm; } static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { struct mm_struct *mm = proc_mem_open(inode, mode); if (IS_ERR_OR_NULL(mm)) return mm ? 
PTR_ERR(mm) : -ESRCH;

	file->private_data = mm;
	return 0;
}

/*
 * Open handler for the mem file.  Refuses with -EINVAL (and a one-time
 * warning) if the file operations lack FOP_UNSIGNED_OFFSET, otherwise opens
 * the mm with PTRACE_MODE_ATTACH access.
 */
static int mem_open(struct inode *inode, struct file *file)
{
	if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
		return -EINVAL;
	return __mem_open(inode, file, PTRACE_MODE_ATTACH);
}

/*
 * Decide whether accesses through this file may use FOLL_FORCE, based on
 * proc_mem_force_override:
 *  - PROC_MEM_FORCE_NEVER:  never.
 *  - PROC_MEM_FORCE_PTRACE: only if the file's task has a non-zero ptrace
 *    field, still uses @mm, and its parent is the current task.
 *  - anything else:         always.
 */
static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm)
{
	struct task_struct *task;
	bool ptrace_active = false;

	switch (proc_mem_force_override) {
	case PROC_MEM_FORCE_NEVER:
		return false;
	case PROC_MEM_FORCE_PTRACE:
		task = get_proc_task(file_inode(file));
		if (task) {
			ptrace_active =	READ_ONCE(task->ptrace) &&
					READ_ONCE(task->mm) == mm &&
					READ_ONCE(task->parent) == current;
			put_task_struct(task);
		}
		return ptrace_active;
	default:
		return true;
	}
}

/*
 * Shared read/write implementation for the mem file.
 * @write selects the direction: non-zero copies from @buf into the target
 * mm, zero copies from the target mm into @buf.
 *
 * Data moves through a single bounce page, at most PAGE_SIZE per iteration,
 * starting at address *ppos.  Returns 0 if the file has no mm, -ENOMEM if
 * the bounce page cannot be allocated, and otherwise the value accumulated
 * in 'copied' by the transfer loop.
 */
static ssize_t mem_rw(struct file *file, char __user *buf,
			size_t count, loff_t *ppos, int write)
{
	struct mm_struct *mm = file->private_data;
	unsigned long addr = *ppos;
	ssize_t copied;
	char *page;
	unsigned int flags;

	if (!mm)
		return 0;

	page = (char *)__get_free_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	copied = 0;
	/* If the mm's user count already hit zero, report 0 bytes. */
	if (!mmget_not_zero(mm))
		goto free;

	flags = write ?
FOLL_WRITE : 0;
	if (proc_mem_foll_force(file, mm))
		flags |= FOLL_FORCE;

	while (count > 0) {
		size_t this_len = min_t(size_t, count, PAGE_SIZE);

		if (write && copy_from_user(page, buf, this_len)) {
			copied = -EFAULT;
			break;
		}

		this_len = access_remote_vm(mm, addr, page, this_len, flags);
		if (!this_len) {
			/* Nothing transferred: -EIO only if no bytes were copied before. */
			if (!copied)
				copied = -EIO;
			break;
		}

		if (!write && copy_to_user(buf, page, this_len)) {
			copied = -EFAULT;
			break;
		}

		buf += this_len;
		addr += this_len;
		copied += this_len;
		count -= this_len;
	}
	/* The position is written back on every exit from the loop. */
	*ppos = addr;

	mmput(mm);
free:
	free_page((unsigned long) page);
	return copied;
}

/* read() handler: mem_rw() in read direction. */
static ssize_t mem_read(struct file *file, char __user *buf,
			size_t count, loff_t *ppos)
{
	return mem_rw(file, buf, count, ppos, 0);
}

/* write() handler: mem_rw() in write direction (the const is cast away). */
static ssize_t mem_write(struct file *file, const char __user *buf,
			 size_t count, loff_t *ppos)
{
	return mem_rw(file, (char __user*)buf, count, ppos, 1);
}

/*
 * llseek handler: @orig 0 sets the position to @offset, @orig 1 adds @offset
 * to the current position, anything else is -EINVAL.  No range check is done
 * on the resulting position; force_successful_syscall_return() is called
 * before the new position is returned.
 */
loff_t mem_lseek(struct file *file, loff_t offset, int orig)
{
	switch (orig) {
	case 0:
		file->f_pos = offset;
		break;
	case 1:
		file->f_pos += offset;
		break;
	default:
		return -EINVAL;
	}
	force_successful_syscall_return();
	return file->f_pos;
}

/* release handler: drops the mm reference stored in private_data, if any. */
static int mem_release(struct inode *inode, struct file *file)
{
	struct mm_struct *mm = file->private_data;
	if (mm)
		mmdrop(mm);
	return 0;
}

/* File operations for the mem file. */
static const struct file_operations proc_mem_operations = {
	.llseek		= mem_lseek,
	.read		= mem_read,
	.write		= mem_write,
	.open		= mem_open,
	.release	= mem_release,
	.fop_flags	= FOP_UNSIGNED_OFFSET,
};

/* Open handler for the environ file: read-level access to the mm. */
static int environ_open(struct inode *inode, struct file *file)
{
	return __mem_open(inode, file, PTRACE_MODE_READ);
}

/*
 * read() handler for the environ file: copies the bytes between the mm's
 * env_start and env_end to user space, starting at offset *ppos, through a
 * single bounce page.
 */
static ssize_t environ_read(struct file *file, char __user *buf,
			size_t count, loff_t *ppos)
{
	char *page;
	unsigned long src = *ppos;
	int ret = 0;
	struct mm_struct *mm = file->private_data;
	unsigned long env_start, env_end;

	/* Ensure the process spawned far enough to have an environment.
*/
	if (!mm || !mm->env_end)
		return 0;

	page = (char *)__get_free_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	ret = 0;
	if (!mmget_not_zero(mm))
		goto free;

	/* Snapshot both bounds under arg_lock so they are consistent. */
	spin_lock(&mm->arg_lock);
	env_start = mm->env_start;
	env_end = mm->env_end;
	spin_unlock(&mm->arg_lock);

	while (count > 0) {
		size_t this_len, max_len;
		int retval;

		/* Stop once the offset reaches the end of the environment area. */
		if (src >= (env_end - env_start))
			break;

		this_len = env_end - (env_start + src);

		max_len = min_t(size_t, PAGE_SIZE, count);
		this_len = min(max_len, this_len);

		retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);

		/*
		 * NOTE(review): on a non-positive result the running total in
		 * 'ret' is replaced by that result, even if earlier iterations
		 * already copied data.
		 */
		if (retval <= 0) {
			ret = retval;
			break;
		}

		if (copy_to_user(buf, page, retval)) {
			ret = -EFAULT;
			break;
		}

		ret += retval;
		src += retval;
		buf += retval;
		count -= retval;
	}
	*ppos = src;
	mmput(mm);

free:
	free_page((unsigned long) page);
	return ret;
}

/* File operations for the environ file. */
static const struct file_operations proc_environ_operations = {
	.open		= environ_open,
	.read		= environ_read,
	.llseek		= generic_file_llseek,
	.release	= mem_release,
};

/* Open handler for the auxv file: read-level access to the mm. */
static int auxv_open(struct inode *inode, struct file *file)
{
	return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
}

/*
 * read() handler for the auxv file.
 *
 * saved_auxv is walked two words at a time until a pair whose first word is
 * 0 is found; that terminating pair is included in the exported length.
 * Returns 0 if the file has no mm.
 */
static ssize_t auxv_read(struct file *file, char __user *buf,
			size_t count, loff_t *ppos)
{
	struct mm_struct *mm = file->private_data;
	unsigned int nwords = 0;

	if (!mm)
		return 0;
	do {
		nwords += 2;
	} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
	return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
				       nwords * sizeof(mm->saved_auxv[0]));
}

/* File operations for the auxv file. */
static const struct file_operations proc_auxv_operations = {
	.open		= auxv_open,
	.read		= auxv_read,
	.llseek		= generic_file_llseek,
	.release	= mem_release,
};

/*
 * read() handler for the legacy oom_adj file.
 *
 * The task's oom_score_adj is converted to the oom_adj scale:
 * OOM_SCORE_ADJ_MAX maps to OOM_ADJUST_MAX, every other value is scaled by
 * -OOM_DISABLE / OOM_SCORE_ADJ_MAX and capped at OOM_ADJUST_MAX.  The result
 * is printed as a decimal number followed by a newline.
 * Returns -ESRCH if the task is gone.
 */
static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
			    loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	char buffer[PROC_NUMBUF];
	int oom_adj = OOM_ADJUST_MIN;
	size_t len;

	if (!task)
		return -ESRCH;
	if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
		oom_adj = OOM_ADJUST_MAX;
	else
		oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
OOM_SCORE_ADJ_MAX;
	put_task_struct(task);
	/* Cap the scaled value at the top of the legacy range. */
	if (oom_adj > OOM_ADJUST_MAX)
		oom_adj = OOM_ADJUST_MAX;
	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
	return simple_read_from_buffer(buf, count, ppos, buffer, len);
}

/*
 * Apply a new oom_score_adj value to the task behind @file.
 * @oom_adj: new value, already expressed on the oom_score_adj scale
 * @legacy:  true when the request came through the oom_adj file
 *
 * Everything after the task lookup runs under oom_adj_mutex.
 * Without CAP_SYS_RESOURCE the value may not go below the task's current
 * oom_score_adj (legacy) or below its oom_score_adj_min (non-legacy); such
 * requests fail with -EACCES.  Legacy writes also log a one-time deprecation
 * warning.  Returns -ESRCH if the task is gone, 0 on success.
 */
static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	int err = 0;

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;

	mutex_lock(&oom_adj_mutex);
	if (legacy) {
		if (oom_adj < task->signal->oom_score_adj &&
				!capable(CAP_SYS_RESOURCE)) {
			err = -EACCES;
			goto err_unlock;
		}
		/*
		 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
		 * /proc/pid/oom_score_adj instead.
		 */
		pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
			  current->comm, task_pid_nr(current), task_pid_nr(task),
			  task_pid_nr(task));
	} else {
		if ((short)oom_adj < task->signal->oom_score_adj_min &&
				!capable(CAP_SYS_RESOURCE)) {
			err = -EACCES;
			goto err_unlock;
		}
	}

	/*
	 * Make sure we will check other processes sharing the mm if this is
	 * not vfork which wants its own oom_score_adj.
* pin the mm so it doesn't go away and get reused after task_unlock
	 */
	if (!task->vfork_done) {
		struct task_struct *p = find_lock_task_mm(task);

		if (p) {
			/* Only an mm flagged MMF_MULTIPROCESS needs the scan below. */
			if (mm_flags_test(MMF_MULTIPROCESS, p->mm)) {
				mm = p->mm;
				mmgrab(mm);
			}
			task_unlock(p);
		}
	}

	task->signal->oom_score_adj = oom_adj;
	/* The minimum is only updated for non-legacy writes by capable callers. */
	if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
		task->signal->oom_score_adj_min = (short)oom_adj;
	trace_oom_score_adj_update(task);

	if (mm) {
		struct task_struct *p;

		/* Propagate the value to other processes sharing the pinned mm. */
		rcu_read_lock();
		for_each_process(p) {
			if (same_thread_group(task, p))
				continue;

			/* do not touch kernel threads or the global init */
			if (p->flags & PF_KTHREAD || is_global_init(p))
				continue;

			task_lock(p);
			if (!p->vfork_done && process_shares_mm(p, mm)) {
				p->signal->oom_score_adj = oom_adj;
				if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
					p->signal->oom_score_adj_min = (short)oom_adj;
			}
			task_unlock(p);
		}
		rcu_read_unlock();
		mmdrop(mm);
	}
err_unlock:
	mutex_unlock(&oom_adj_mutex);
	put_task_struct(task);
	return err;
}

/*
 * /proc/pid/oom_adj exists solely for backwards compatibility with previous
 * kernels.  The effective policy is defined by oom_score_adj, which has a
 * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
 * Values written to oom_adj are simply mapped linearly to oom_score_adj.
 * Processes that become oom disabled via oom_adj will still be oom disabled
 * with this implementation.
 *
 * oom_adj cannot be removed since existing userspace binaries use it.
*/
/*
 * write() handler for oom_adj.  At most sizeof(buffer) - 1 bytes of input are
 * consumed.  The parsed integer must lie in [OOM_ADJUST_MIN, OOM_ADJUST_MAX]
 * or equal OOM_DISABLE, else -EINVAL.  Returns the (possibly truncated) count
 * on success or a negative errno.
 */
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	char buffer[PROC_NUMBUF] = {};
	int oom_adj;
	int err;

	/* Keep the last byte of the zeroed buffer as the NUL terminator. */
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count)) {
		err = -EFAULT;
		goto out;
	}

	err = kstrtoint(strstrip(buffer), 0, &oom_adj);
	if (err)
		goto out;
	if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
	     oom_adj != OOM_DISABLE) {
		err = -EINVAL;
		goto out;
	}

	/*
	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
	 * value is always attainable.
	 */
	if (oom_adj == OOM_ADJUST_MAX)
		oom_adj = OOM_SCORE_ADJ_MAX;
	else
		oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;

	err = __set_oom_adj(file, oom_adj, true);
out:
	return err < 0 ? err : count;
}

/* File operations for the oom_adj file. */
static const struct file_operations proc_oom_adj_operations = {
	.read		= oom_adj_read,
	.write		= oom_adj_write,
	.llseek		= generic_file_llseek,
};

/*
 * read() handler for oom_score_adj: prints the task's oom_score_adj as a
 * decimal number followed by a newline.  Returns -ESRCH if the task is gone.
 */
static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
					size_t count, loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	char buffer[PROC_NUMBUF];
	short oom_score_adj = OOM_SCORE_ADJ_MIN;
	size_t len;

	if (!task)
		return -ESRCH;
	oom_score_adj = task->signal->oom_score_adj;
	put_task_struct(task);
	len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
	return simple_read_from_buffer(buf, count, ppos, buffer, len);
}

/*
 * write() handler for oom_score_adj.  At most sizeof(buffer) - 1 bytes of
 * input are consumed.  The parsed integer must lie in
 * [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX], else -EINVAL.  Returns the
 * (possibly truncated) count on success or a negative errno.
 */
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
					size_t count, loff_t *ppos)
{
	char buffer[PROC_NUMBUF] = {};
	int oom_score_adj;
	int err;

	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count)) {
		err = -EFAULT;
		goto out;
	}

	err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
	if (err)
		goto out;
	if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
			oom_score_adj > OOM_SCORE_ADJ_MAX) {
		err = -EINVAL;
		goto out;
	}

	err = __set_oom_adj(file, oom_score_adj, false);
out:
	return err < 0 ?
err : count;
}

/* File operations for the oom_score_adj file. */
static const struct file_operations proc_oom_score_adj_operations = {
	.read		= oom_score_adj_read,
	.write		= oom_score_adj_write,
	.llseek		= default_llseek,
};

#ifdef CONFIG_AUDIT
/* Presumably sized for a 10-digit decimal 32-bit value plus NUL - confirm. */
#define TMPBUFLEN 11
/*
 * read() handler for loginuid: prints the task's audit login uid, translated
 * into the user namespace of the opener's credentials, as an unsigned decimal
 * without a trailing newline.  Returns -ESRCH if the task is gone.
 */
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
				  size_t count, loff_t *ppos)
{
	struct inode * inode = file_inode(file);
	struct task_struct *task = get_proc_task(inode);
	ssize_t length;
	char tmpbuf[TMPBUFLEN];

	if (!task)
		return -ESRCH;
	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
			   from_kuid(file->f_cred->user_ns,
				     audit_get_loginuid(task)));
	put_task_struct(task);
	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
}

/*
 * write() handler for loginuid.
 *
 * Only the task the file belongs to may write it, and not if it is a kernel
 * thread (-EPERM otherwise).  Writes at a non-zero offset are rejected with
 * -EINVAL.  The input is parsed as a base-10 u32.
 */
static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
				   size_t count, loff_t *ppos)
{
	struct inode * inode = file_inode(file);
	uid_t loginuid;
	kuid_t kloginuid;
	int rv;

	/* Don't let kthreads write their own loginuid */
	if (current->flags & PF_KTHREAD)
		return -EPERM;

	/* pid_task() is only used for a pointer comparison under RCU. */
	rcu_read_lock();
	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
		rcu_read_unlock();
		return -EPERM;
	}
	rcu_read_unlock();

	if (*ppos != 0) {
		/* No partial writes. */
		return -EINVAL;
	}

	rv = kstrtou32_from_user(buf, count, 10, &loginuid);
	if (rv < 0)
		return rv;

	/* is userspace trying to explicitly UNSET the loginuid?
*/
	if (loginuid == AUDIT_UID_UNSET) {
		kloginuid = INVALID_UID;
	} else {
		/* Map the value from the opener's user namespace; reject unmapped ids. */
		kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
		if (!uid_valid(kloginuid))
			return -EINVAL;
	}

	rv = audit_set_loginuid(kloginuid);
	if (rv < 0)
		return rv;
	return count;
}

/* File operations for the loginuid file. */
static const struct file_operations proc_loginuid_operations = {
	.read		= proc_loginuid_read,
	.write		= proc_loginuid_write,
	.llseek		= generic_file_llseek,
};

/*
 * read() handler for sessionid: prints the task's audit session id as an
 * unsigned decimal without a trailing newline.  Returns -ESRCH if the task
 * is gone.
 */
static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
				  size_t count, loff_t *ppos)
{
	struct inode * inode = file_inode(file);
	struct task_struct *task = get_proc_task(inode);
	ssize_t length;
	char tmpbuf[TMPBUFLEN];

	if (!task)
		return -ESRCH;
	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
				audit_get_sessionid(task));
	put_task_struct(task);
	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
}

/* File operations for the (read-only) sessionid file. */
static const struct file_operations proc_sessionid_operations = {
	.read		= proc_sessionid_read,
	.llseek		= generic_file_llseek,
};
#endif /* CONFIG_AUDIT */

#ifdef CONFIG_FAULT_INJECTION
/*
 * read() handler for the make-it-fail file: prints task->make_it_fail
 * followed by a newline.  Returns -ESRCH if the task is gone.
 */
static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
				      size_t count, loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	char buffer[PROC_NUMBUF];
	size_t len;
	int make_it_fail;

	if (!task)
		return -ESRCH;
	make_it_fail = task->make_it_fail;
	put_task_struct(task);

	len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);

	return simple_read_from_buffer(buf, count, ppos, buffer, len);
}

/*
 * write() handler for the make-it-fail file.  Requires CAP_SYS_RESOURCE
 * (-EPERM otherwise); the only accepted values are 0 and 1 (-EINVAL
 * otherwise).  At most sizeof(buffer) - 1 bytes of input are consumed.
 */
static ssize_t proc_fault_inject_write(struct file * file,
			const char __user * buf, size_t count, loff_t *ppos)
{
	struct task_struct *task;
	char buffer[PROC_NUMBUF] = {};
	int make_it_fail;
	int rv;

	if (!capable(CAP_SYS_RESOURCE))
		return -EPERM;

	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
		return -EFAULT;
	rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
	if (rv < 0)
		return rv;
	if (make_it_fail < 0 || make_it_fail > 1)
		return -EINVAL;

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;
task->make_it_fail = make_it_fail;
	put_task_struct(task);

	return count;
}

/* File operations for the make-it-fail file. */
static const struct file_operations proc_fault_inject_operations = {
	.read		= proc_fault_inject_read,
	.write		= proc_fault_inject_write,
	.llseek		= generic_file_llseek,
};

/*
 * write() handler for fail-nth: parses an unsigned integer (base
 * auto-detected) and stores it in task->fail_nth.  Returns the parse error,
 * -ESRCH if the task is gone, or @count on success.
 */
static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
				   size_t count, loff_t *ppos)
{
	struct task_struct *task;
	int err;
	unsigned int n;

	err = kstrtouint_from_user(buf, count, 0, &n);
	if (err)
		return err;

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;
	task->fail_nth = n;
	put_task_struct(task);

	return count;
}

/*
 * read() handler for fail-nth: prints task->fail_nth as an unsigned decimal
 * followed by a newline.  Returns -ESRCH if the task is gone.
 */
static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
				  size_t count, loff_t *ppos)
{
	struct task_struct *task;
	char numbuf[PROC_NUMBUF];
	ssize_t len;

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;
	len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
	put_task_struct(task);
	return simple_read_from_buffer(buf, count, ppos, numbuf, len);
}

/* File operations for the fail-nth file (no llseek handler is set). */
static const struct file_operations proc_fail_nth_operations = {
	.read		= proc_fail_nth_read,
	.write		= proc_fail_nth_write,
};
#endif /* CONFIG_FAULT_INJECTION */

/*
 * Print out various scheduling related per-task fields:
 *
 * m->private holds the inode.  Returns -ESRCH if the task is gone, else 0.
 */
static int sched_show(struct seq_file *m, void *v)
{
	struct inode *inode = m->private;
	struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
	struct task_struct *p;

	p = get_proc_task(inode);
	if (!p)
		return -ESRCH;
	proc_sched_show_task(p, ns, m);

	put_task_struct(p);

	return 0;
}

/*
 * write() handler for the sched file: the written data is never inspected;
 * any write calls proc_sched_set_task() on the task and reports @count bytes
 * consumed.  Returns -ESRCH if the task is gone.
 */
static ssize_t
sched_write(struct file *file, const char __user *buf,
	    size_t count, loff_t *offset)
{
	struct inode *inode = file_inode(file);
	struct task_struct *p;

	p = get_proc_task(inode);
	if (!p)
		return -ESRCH;
	proc_sched_set_task(p);

	put_task_struct(p);

	return count;
}

/* Open handler: binds sched_show() to the file, passing the inode. */
static int sched_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_show, inode);
}

/* File operations for the sched file. */
static const struct file_operations proc_pid_sched_operations = {
	.open		= sched_open,
	.read		= seq_read,
	.write		= sched_write,
	.llseek		= seq_lseek,
	.release	=
single_release,
};

#ifdef CONFIG_SCHED_AUTOGROUP
/*
 * Print out autogroup related information:
 *
 * m->private holds the inode.  Returns -ESRCH if the task is gone, else 0.
 */
static int sched_autogroup_show(struct seq_file *m, void *v)
{
	struct inode *inode = m->private;
	struct task_struct *p;

	p = get_proc_task(inode);
	if (!p)
		return -ESRCH;
	proc_sched_autogroup_show_task(p, m);

	put_task_struct(p);

	return 0;
}

/*
 * write() handler for the autogroup file: parses an integer nice value (at
 * most sizeof(buffer) - 1 bytes of input are consumed) and hands it to
 * proc_sched_autogroup_set_nice().  Returns the (possibly truncated) count
 * on success, otherwise a negative errno.
 */
static ssize_t
sched_autogroup_write(struct file *file, const char __user *buf,
	    size_t count, loff_t *offset)
{
	struct inode *inode = file_inode(file);
	struct task_struct *p;
	char buffer[PROC_NUMBUF] = {};
	int nice;
	int err;

	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
		return -EFAULT;

	err = kstrtoint(strstrip(buffer), 0, &nice);
	if (err < 0)
		return err;

	p = get_proc_task(inode);
	if (!p)
		return -ESRCH;

	err = proc_sched_autogroup_set_nice(p, nice);
	/* On failure the error code replaces the byte count as return value. */
	if (err)
		count = err;

	put_task_struct(p);

	return count;
}

/*
 * Open handler: single_open() is called with NULL private data, and the
 * inode is stored in the seq_file's private pointer afterwards, on success.
 */
static int sched_autogroup_open(struct inode *inode, struct file *filp)
{
	int ret;

	ret = single_open(filp, sched_autogroup_show, NULL);
	if (!ret) {
		struct seq_file *m = filp->private_data;

		m->private = inode;
	}
	return ret;
}

/* File operations for the autogroup file. */
static const struct file_operations proc_pid_sched_autogroup_operations = {
	.open		= sched_autogroup_open,
	.read		= seq_read,
	.write		= sched_autogroup_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};

#endif /* CONFIG_SCHED_AUTOGROUP */

#ifdef CONFIG_TIME_NS
/*
 * seq_file show callback for timens_offsets.  The task is looked up from the
 * seq_file's file rather than from m->private.  Returns -ESRCH if the task
 * is gone, else 0.
 */
static int timens_offsets_show(struct seq_file *m, void *v)
{
	struct task_struct *p;

	p = get_proc_task(file_inode(m->file));
	if (!p)
		return -ESRCH;
	proc_timens_show_offsets(p, m);

	put_task_struct(p);

	return 0;
}

/*
 * write() handler for timens_offsets.
 *
 * Accepts up to ARRAY_SIZE(offsets) lines of the form
 * "<clock> <seconds> <nanoseconds>", where <clock> is "monotonic",
 * "boottime", or the stringified CLOCK_MONOTONIC / CLOCK_BOOTTIME constant.
 * The write must start at offset 0 and be shorter than PAGE_SIZE.
 */
static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
				    size_t count, loff_t *ppos)
{
	struct inode *inode = file_inode(file);
	struct proc_timens_offset offsets[2];
	char *kbuf = NULL, *pos, *next_line;
	struct task_struct *p;
	int ret, noffsets;

	/* Only allow < page size writes at the beginning of the file */
	if ((*ppos != 0) || (count >= PAGE_SIZE))
		return -EINVAL;

	/*
Slurp in the user data */
	kbuf = memdup_user_nul(buf, count);
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	/* Parse the user data */
	ret = -EINVAL;
	noffsets = 0;
	for (pos = kbuf; pos; pos = next_line) {
		struct proc_timens_offset *off = &offsets[noffsets];
		char clock[10];
		int err;

		/* Find the end of line and ensure we don't look past it */
		next_line = strchr(pos, '\n');
		if (next_line) {
			*next_line = '\0';
			next_line++;
			if (*next_line == '\0')
				next_line = NULL;
		}

		/* A line needs all three fields and nanoseconds below one second. */
		err = sscanf(pos, "%9s %lld %lu", clock,
				&off->val.tv_sec, &off->val.tv_nsec);
		if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
			goto out;

		clock[sizeof(clock) - 1] = 0;
		if (strcmp(clock, "monotonic") == 0 ||
		    strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
			off->clockid = CLOCK_MONOTONIC;
		else if (strcmp(clock, "boottime") == 0 ||
			 strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
			off->clockid = CLOCK_BOOTTIME;
		else
			goto out;

		noffsets++;
		/*
		 * Once the array is full, stop; if input remains, report only
		 * the bytes up to the start of the first unparsed line.
		 */
		if (noffsets == ARRAY_SIZE(offsets)) {
			if (next_line)
				count = next_line - kbuf;
			break;
		}
	}

	ret = -ESRCH;
	p = get_proc_task(inode);
	if (!p)
		goto out;
	ret = proc_timens_set_offset(file, p, offsets, noffsets);
	put_task_struct(p);
	if (ret)
		goto out;

	ret = count;
out:
	kfree(kbuf);
	return ret;
}

/* Open handler: binds timens_offsets_show() to the file. */
static int timens_offsets_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, timens_offsets_show, inode);
}

/* File operations for the timens_offsets file. */
static const struct file_operations proc_timens_offsets_operations = {
	.open		= timens_offsets_open,
	.read		= seq_read,
	.write		= timens_offsets_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif /* CONFIG_TIME_NS */

/*
 * write() handler for the comm file: sets the task's comm to the written
 * string.  At most TASK_COMM_LEN - 1 bytes are copied from user space into a
 * zero-initialised buffer, so the name is always NUL-terminated.
 */
static ssize_t comm_write(struct file *file, const char __user *buf,
				size_t count, loff_t *offset)
{
	struct inode *inode = file_inode(file);
	struct task_struct *p;
	char buffer[TASK_COMM_LEN] = {};
	const size_t maxlen = sizeof(buffer) - 1;

	if (copy_from_user(buffer, buf, count > maxlen ?
maxlen : count))
		return -EFAULT;

	p = get_proc_task(inode);
	if (!p)
		return -ESRCH;

	/* Only a task in the same thread group may rename; others get -EINVAL. */
	if (same_thread_group(current, p)) {
		set_task_comm(p, buffer);
		proc_comm_connector(p);
	}
	else
		count = -EINVAL;

	put_task_struct(p);

	/* The original count is returned even when the name was truncated. */
	return count;
}

/*
 * seq_file show callback for the comm file: emits the task name via
 * proc_task_name() followed by a newline.  Returns -ESRCH if the task is
 * gone, else 0.
 */
static int comm_show(struct seq_file *m, void *v)
{
	struct inode *inode = m->private;
	struct task_struct *p;

	p = get_proc_task(inode);
	if (!p)
		return -ESRCH;

	proc_task_name(m, p, false);
	seq_putc(m, '\n');

	put_task_struct(p);

	return 0;
}

/* Open handler: binds comm_show() to the file, passing the inode. */
static int comm_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, comm_show, inode);
}

/* File operations for the comm file. */
static const struct file_operations proc_pid_set_comm_operations = {
	.open		= comm_open,
	.read		= seq_read,
	.write		= comm_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/*
 * Resolve the executable of the task behind @dentry.
 *
 * On success *@exe_path is set to the exe file's path with its own reference
 * taken via path_get() (the temporary file reference is dropped again) and 0
 * is returned.  Returns -ENOENT if the task is gone or has no exe file.
 */
static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
{
	struct task_struct *task;
	struct file *exe_file;

	task = get_proc_task(d_inode(dentry));
	if (!task)
		return -ENOENT;
	exe_file = get_task_exe_file(task);
	put_task_struct(task);
	if (exe_file) {
		*exe_path = exe_file->f_path;
		path_get(&exe_file->f_path);
		fput(exe_file);
		return 0;
	} else
		return -ENOENT;
}

/*
 * get_link handler for per-pid symlinks.
 *
 * A NULL @dentry is answered with -ECHILD.  Otherwise, after the
 * proc_fd_access_allowed() check, the inode's proc_get_link operation yields
 * the target path, which is passed to nd_jump_link().  The return value is
 * always ERR_PTR(error), which is ERR_PTR(0) on success.
 */
static const char *proc_pid_get_link(struct dentry *dentry,
				     struct inode *inode,
				     struct delayed_call *done)
{
	struct path path;
	int error = -EACCES;

	if (!dentry)
		return ERR_PTR(-ECHILD);

	/* Are we allowed to snoop on the tasks file descriptors?
*/
	if (!proc_fd_access_allowed(inode))
		goto out;

	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
	if (error)
		goto out;

	error = nd_jump_link(&path);
out:
	return ERR_PTR(error);
}

/*
 * Render @path as a string into the user buffer.
 *
 * Returns the number of bytes copied (at most @buflen; no NUL terminator is
 * copied), -ENOMEM if the scratch buffer cannot be allocated, the error from
 * d_path(), or -EFAULT if the copy to user space fails.
 */
static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen)
{
	char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
	char *pathname;
	int len;

	if (!tmp)
		return -ENOMEM;

	pathname = d_path(path, tmp, PATH_MAX);
	/* Pre-load the error code; it is only used if pathname is an error. */
	len = PTR_ERR(pathname);
	if (IS_ERR(pathname))
		goto out;
	/*
	 * Presumably d_path() builds the string at the tail of tmp, making
	 * this the path length without the trailing NUL - confirm against
	 * d_path().
	 */
	len = tmp + PATH_MAX - 1 - pathname;

	if (len > buflen)
		len = buflen;
	if (copy_to_user(buffer, pathname, len))
		len = -EFAULT;
 out:
	kfree(tmp);
	return len;
}

/*
 * readlink handler for per-pid symlinks: same access check and target lookup
 * as proc_pid_get_link(), but the path is copied out as text and the path
 * reference is dropped afterwards.
 */
static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
{
	int error = -EACCES;
	struct inode *inode = d_inode(dentry);
	struct path path;

	/* Are we allowed to snoop on the tasks file descriptors? */
	if (!proc_fd_access_allowed(inode))
		goto out;

	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
	if (error)
		goto out;

	error = do_proc_readlink(&path, buffer, buflen);
	path_put(&path);
out:
	return error;
}

/* Inode operations for per-pid symlinks. */
const struct inode_operations proc_pid_link_inode_operations = {
	.readlink	= proc_pid_readlink,
	.get_link	= proc_pid_get_link,
	.setattr	= proc_nochmod_setattr,
};


/* building an inode */

/*
 * Compute the uid/gid that should own a proc file of @task.
 * @mode: mode of the proc inode in question
 * @ruid, @rgid: outputs
 *
 * Kernel threads are always owned by global root.  Other tasks default to
 * their effective uid/gid, which may be overridden further down depending on
 * @mode and the state of the task's mm.
 */
void task_dump_owner(struct task_struct *task, umode_t mode,
		     kuid_t *ruid, kgid_t *rgid)
{
	/* Depending on the state of dumpable compute who should own a
	 * proc file for a task.
	 */
	const struct cred *cred;
	kuid_t uid;
	kgid_t gid;

	if (unlikely(task->flags & PF_KTHREAD)) {
		*ruid = GLOBAL_ROOT_UID;
		*rgid = GLOBAL_ROOT_GID;
		return;
	}

	/* Default to the tasks effective ownership */
	rcu_read_lock();
	cred = __task_cred(task);
	uid = cred->euid;
	gid = cred->egid;
	rcu_read_unlock();

	/*
	 * Before the /proc/pid/status file was created the only way to read
	 * the effective uid of a process was to stat /proc/pid.  Reading
	 * /proc/pid/status is slow enough that procps and other packages
	 * kept stating /proc/pid.
To keep the rules in /proc simple I have
	 * made this apply to all per process world readable and executable
	 * directories.
	 */
	if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
		struct mm_struct *mm;
		task_lock(task);
		mm = task->mm;
		/* Make non-dumpable tasks owned by some root */
		if (mm) {
			if (get_dumpable(mm) != SUID_DUMP_USER) {
				struct user_namespace *user_ns = mm->user_ns;

				/*
				 * Use id 0 of the mm's user namespace, falling
				 * back to global root if that id is not valid.
				 */
				uid = make_kuid(user_ns, 0);
				if (!uid_valid(uid))
					uid = GLOBAL_ROOT_UID;

				gid = make_kgid(user_ns, 0);
				if (!gid_valid(gid))
					gid = GLOBAL_ROOT_GID;
			}
		} else {
			/* A task without an mm is owned by global root. */
			uid = GLOBAL_ROOT_UID;
			gid = GLOBAL_ROOT_GID;
		}
		task_unlock(task);
	}
	*ruid = uid;
	*rgid = gid;
}

/*
 * Eviction hook for per-pid inodes: directory inodes are unlinked from the
 * pid's sibling list under pid->lock; other inodes need no work here.
 */
void proc_pid_evict_inode(struct proc_inode *ei)
{
	struct pid *pid = ei->pid;

	if (S_ISDIR(ei->vfs_inode.i_mode)) {
		spin_lock(&pid->lock);
		hlist_del_init_rcu(&ei->sibling_inodes);
		spin_unlock(&pid->lock);
	}
}

/*
 * Allocate and initialise a proc inode for @task with the given @mode.
 *
 * The inode gets a fresh inode number, initialised timestamps, the default
 * proc inode operations, a reference to the task's struct pid, ownership
 * from task_dump_owner() (called with mode 0) and its security label.
 * Returns NULL if the inode cannot be allocated or the task has no pid.
 */
struct inode *proc_pid_make_inode(struct super_block *sb,
				  struct task_struct *task, umode_t mode)
{
	struct inode * inode;
	struct proc_inode *ei;
	struct pid *pid;

	/* We need a new inode */

	inode = new_inode(sb);
	if (!inode)
		goto out;

	/* Common stuff */
	ei = PROC_I(inode);
	inode->i_mode = mode;
	inode->i_ino = get_next_ino();
	simple_inode_init_ts(inode);
	inode->i_op = &proc_def_inode_operations;

	/*
	 * grab the reference to task.
	 */
	pid = get_task_pid(task, PIDTYPE_PID);
	if (!pid)
		goto out_unlock;

	/* Let the pid remember us for quick removal */
	ei->pid = pid;

	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
	security_task_to_inode(task, inode);

out:
	return inode;

	/* Despite the label name no lock is held here; the new inode is dropped. */
out_unlock:
	iput(inode);
	return NULL;
}

/*
 * Generating an inode and adding it into @pid->inodes, so that task will
 * invalidate inode's dentry before being released.
 *
 * This helper is used for creating dir-type entries under '/proc' and
 * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>'
 * can be released by invalidating '/proc/<tgid>' dentry.
* In theory, dentries under '/proc/<tgid>/task' can also be released by
 * invalidating '/proc/<tgid>' dentry, we reserve it to handle single
 * thread exiting situation: Any one of threads should invalidate its
 * '/proc/<tgid>/task/<pid>' dentry before released.
 *
 * Returns the new inode, or NULL if proc_pid_make_inode() fails.
 */
static struct inode *proc_pid_make_base_inode(struct super_block *sb,
				struct task_struct *task, umode_t mode)
{
	struct inode *inode;
	struct proc_inode *ei;
	struct pid *pid;

	inode = proc_pid_make_inode(sb, task, mode);
	if (!inode)
		return NULL;

	/* Let proc_flush_pid find this directory inode */
	ei = PROC_I(inode);
	pid = ei->pid;
	spin_lock(&pid->lock);
	hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
	spin_unlock(&pid->lock);

	return inode;
}

/*
 * getattr handler for per-pid inodes.
 *
 * Fills @stat from the inode, with ownership defaulting to global root.  If
 * the pid still has a task, the caller must pass has_pid_permissions() at
 * the HIDEPID_INVISIBLE level (else -ENOENT) and the owner is recomputed
 * with task_dump_owner().  If the task is gone the root defaults are kept
 * and 0 is returned.  The task is only accessed under rcu_read_lock().
 */
int pid_getattr(struct mnt_idmap *idmap, const struct path *path,
		struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = d_inode(path->dentry);
	struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
	struct task_struct *task;

	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);

	stat->uid = GLOBAL_ROOT_UID;
	stat->gid = GLOBAL_ROOT_GID;
	rcu_read_lock();
	task = pid_task(proc_pid(inode), PIDTYPE_PID);
	if (task) {
		if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) {
			rcu_read_unlock();
			/*
			 * This doesn't prevent learning whether PID exists,
			 * it only makes getattr() consistent with readdir().
			 */
			return -ENOENT;
		}
		task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
	}
	rcu_read_unlock();
	return 0;
}

/* dentry stuff */

/*
 * Set <pid>/... inode ownership (can change due to setuid(), etc.)
 *
 * Also clears the setuid and setgid bits from the inode mode and refreshes
 * the inode's security label.
 */
void pid_update_inode(struct task_struct *task, struct inode *inode)
{
	task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);

	inode->i_mode &= ~(S_ISUID | S_ISGID);
	security_task_to_inode(task, inode);
}

/*
 * Rewrite the inode's ownerships here because the owning task may have
 * performed a setuid(), etc.
*
 * Returns 1, after refreshing the inode through pid_update_inode(), while
 * the pid still has a task, and 0 when the dentry has no inode or the task
 * is gone.  The whole check runs under rcu_read_lock().
 */
static int pid_revalidate(struct inode *dir, const struct qstr *name,
			  struct dentry *dentry, unsigned int flags)
{
	struct inode *inode;
	struct task_struct *task;
	int ret = 0;

	rcu_read_lock();
	inode = d_inode_rcu(dentry);
	if (!inode)
		goto out;
	task = pid_task(proc_pid(inode), PIDTYPE_PID);

	if (task) {
		pid_update_inode(task, inode);
		ret = 1;
	}
out:
	rcu_read_unlock();
	return ret;
}

/* True when the inode's pid has no task attached for PIDTYPE_PID any more. */
static inline bool proc_inode_is_dead(struct inode *inode)
{
	return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
}

/* d_delete handler: returns non-zero when the inode's task is gone. */
int pid_delete_dentry(const struct dentry *dentry)
{
	/* Is the task we represent dead?
	 * If so, then don't put the dentry on the lru list,
	 * kill it immediately.
	 */
	return proc_inode_is_dead(d_inode(dentry));
}

/* Dentry operations for per-pid entries. */
const struct dentry_operations pid_dentry_operations =
{
	.d_revalidate	= pid_revalidate,
	.d_delete	= pid_delete_dentry,
};

/* Lookups */

/*
 * Fill a directory entry.
 *
 * If possible create the dcache entry and derive our inode number and
 * file type from dcache entry.
 *
 * Since all of the proc inode numbers are dynamically generated, the inode
 * numbers do not exist until the inode is cached.  This means creating
 * the dcache entry in readdir is necessary to keep the inode numbers
 * reported by readdir in sync with the inode numbers reported
 * by stat.
*/
/*
 * The entry is emitted through dir_emit() in every case.  If the dentry
 * cannot be looked up, allocated or instantiated, the fallback values
 * ino = 1 and type = DT_UNKNOWN are used.  Returns dir_emit()'s result.
 */
bool proc_fill_cache(struct file *file, struct dir_context *ctx,
	const char *name, unsigned int len,
	instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
	struct dentry *child, *dir = file->f_path.dentry;
	struct qstr qname = QSTR_INIT(name, len);
	struct inode *inode;
	unsigned type = DT_UNKNOWN;
	ino_t ino = 1;

	child = try_lookup_noperm(&qname, dir);
	if (IS_ERR(child))
		goto end_instantiate;

	if (!child) {
		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
		child = d_alloc_parallel(dir, &qname, &wq);
		if (IS_ERR(child))
			goto end_instantiate;
		/* Only instantiate if the dentry is still in lookup state. */
		if (d_in_lookup(child)) {
			struct dentry *res;
			res = instantiate(child, task, ptr);
			d_lookup_done(child);
			/* A non-NULL result replaces our dentry; it may be an error. */
			if (unlikely(res)) {
				dput(child);
				child = res;
				if (IS_ERR(child))
					goto end_instantiate;
			}
		}
	}
	inode = d_inode(child);
	ino = inode->i_ino;
	/* The d_type passed on is i_mode shifted right by 12 bits. */
	type = inode->i_mode >> 12;
	dput(child);
end_instantiate:
	return dir_emit(ctx, name, len, ino, type);
}

/*
 * dname_to_vma_addr - maps a dentry name into two unsigned longs
 * which represent vma start and end addresses.
*/
/*
 * The name must have the form "<start>-<end>" with both numbers in
 * hexadecimal and nothing after <end>.  A leading '0' is only accepted when
 * the number is exactly "0".  Values that overflow the parser or do not fit
 * in an unsigned long are rejected.  Returns 0 on success, -EINVAL otherwise.
 */
static int dname_to_vma_addr(struct dentry *dentry,
			     unsigned long *start, unsigned long *end)
{
	const char *str = dentry->d_name.name;
	unsigned long long sval, eval;
	unsigned int len;

	if (str[0] == '0' && str[1] != '-')
		return -EINVAL;
	len = _parse_integer(str, 16, &sval);
	if (len & KSTRTOX_OVERFLOW)
		return -EINVAL;
	/* Reject values that would be truncated by unsigned long. */
	if (sval != (unsigned long)sval)
		return -EINVAL;
	str += len;

	if (*str != '-')
		return -EINVAL;
	str++;

	if (str[0] == '0' && str[1])
		return -EINVAL;
	len = _parse_integer(str, 16, &eval);
	if (len & KSTRTOX_OVERFLOW)
		return -EINVAL;
	if (eval != (unsigned long)eval)
		return -EINVAL;
	str += len;

	if (*str != '\0')
		return -EINVAL;

	*start = sval;
	*end = eval;

	return 0;
}

/*
 * d_revalidate handler for map_files entries.
 *
 * Returns -ECHILD in RCU-walk mode.  Otherwise returns 1, after refreshing
 * the inode's owner and security label, if the task's mm still has a vma
 * with exactly the range encoded in the dentry name.  Returns 0 if the task
 * is gone, mm_access() fails, the name does not parse or no such vma exists,
 * and the error from mmap_read_lock_killable() if taking the lock fails.
 */
static int map_files_d_revalidate(struct inode *dir, const struct qstr *name,
				  struct dentry *dentry, unsigned int flags)
{
	unsigned long vm_start, vm_end;
	bool exact_vma_exists = false;
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	struct inode *inode;
	int status = 0;

	if (flags & LOOKUP_RCU)
		return -ECHILD;

	inode = d_inode(dentry);
	task = get_proc_task(inode);
	if (!task)
		goto out_notask;

	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR(mm))
		goto out;

	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
		status = mmap_read_lock_killable(mm);
		if (!status) {
			exact_vma_exists = !!find_exact_vma(mm, vm_start,
							    vm_end);
			mmap_read_unlock(mm);
		}
	}

	mmput(mm);

	if (exact_vma_exists) {
		task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);

		security_task_to_inode(task, inode);
		status = 1;
	}

out:
	put_task_struct(task);

out_notask:
	return status;
}

/* Dentry operations for map_files entries. */
static const struct dentry_operations tid_map_files_dentry_operations = {
	.d_revalidate	= map_files_d_revalidate,
	.d_delete	= pid_delete_dentry,
};

/*
 * proc_get_link operation for map_files entries: resolves the dentry name to
 * the file backing the matching vma.  The result starts out as -ENOENT.
 */
static int map_files_get_link(struct dentry *dentry, struct path *path)
{
	unsigned long vm_start, vm_end;
	struct vm_area_struct *vma;
	struct task_struct *task;
	struct mm_struct *mm;
	int rc;

	rc = -ENOENT;
	task = get_proc_task(d_inode(dentry));
	if (!task)
		goto out;

	mm = get_task_mm(task);
put_task_struct(task);
	if (!mm)
		goto out;

	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
	if (rc)
		goto out_mmput;

	rc = mmap_read_lock_killable(mm);
	if (rc)
		goto out_mmput;

	rc = -ENOENT;
	vma = find_exact_vma(mm, vm_start, vm_end);
	/* Only file-backed vmas have a link target. */
	if (vma && vma->vm_file) {
		*path = *file_user_path(vma->vm_file);
		path_get(path);
		rc = 0;
	}
	mmap_read_unlock(mm);

out_mmput:
	mmput(mm);
out:
	return rc;
}

/*
 * One collected map_files entry: the vma's address range and the f_mode of
 * its backing file.
 */
struct map_files_info {
	unsigned long	start;
	unsigned long	end;
	fmode_t		mode;
};

/*
 * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due
 * to concerns about how the symlinks may be used to bypass permissions on
 * ancestor directories in the path to the file in question.
 *
 * Returns ERR_PTR(-EPERM) when the capability check against init_user_ns
 * fails; otherwise defers to proc_pid_get_link().
 */
static const char *
proc_map_files_get_link(struct dentry *dentry,
			struct inode *inode,
		        struct delayed_call *done)
{
	if (!checkpoint_restore_ns_capable(&init_user_ns))
		return ERR_PTR(-EPERM);

	return proc_pid_get_link(dentry, inode, done);
}

/*
 * Identical to proc_pid_link_inode_operations except for get_link()
 */
static const struct inode_operations proc_map_files_link_inode_operations = {
	.readlink	= proc_pid_readlink,
	.get_link	= proc_map_files_get_link,
	.setattr	= proc_nochmod_setattr,
};

/*
 * Instantiate callback for map_files entries.
 * @ptr carries the backing file's fmode_t, cast to a pointer by the caller.
 *
 * The symlink inode is owner-readable if the mode has FMODE_READ and
 * owner-writable if it has FMODE_WRITE.
 */
static struct dentry *
proc_map_files_instantiate(struct dentry *dentry,
			   struct task_struct *task, const void *ptr)
{
	fmode_t mode = (fmode_t)(unsigned long)ptr;
	struct proc_inode *ei;
	struct inode *inode;

	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
				    ((mode & FMODE_READ ) ? S_IRUSR : 0) |
				    ((mode & FMODE_WRITE) ?
S_IWUSR : 0));
	if (!inode)
		return ERR_PTR(-ENOENT);

	ei = PROC_I(inode);
	ei->op.proc_get_link = map_files_get_link;

	inode->i_op = &proc_map_files_link_inode_operations;
	inode->i_size = 64;

	return proc_splice_unmountable(inode, dentry,
				       &tid_map_files_dentry_operations);
}

/*
 * lookup handler for the map_files directory.
 *
 * 'result' is set to the error for the next step before each check, so every
 * early exit returns the right code:
 *   -ENOENT  task gone, name not parseable, no mm, no vma with that exact
 *            range, or the vma has no backing file
 *   -EACCES  ptrace read access denied
 *   -EINTR   taking mmap_lock was interrupted
 * On success the result of proc_map_files_instantiate() is returned.
 */
static struct dentry *proc_map_files_lookup(struct inode *dir,
		struct dentry *dentry, unsigned int flags)
{
	unsigned long vm_start, vm_end;
	struct vm_area_struct *vma;
	struct task_struct *task;
	struct dentry *result;
	struct mm_struct *mm;

	result = ERR_PTR(-ENOENT);
	task = get_proc_task(dir);
	if (!task)
		goto out;

	result = ERR_PTR(-EACCES);
	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
		goto out_put_task;

	result = ERR_PTR(-ENOENT);
	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
		goto out_put_task;

	mm = get_task_mm(task);
	if (!mm)
		goto out_put_task;

	result = ERR_PTR(-EINTR);
	if (mmap_read_lock_killable(mm))
		goto out_put_mm;

	result = ERR_PTR(-ENOENT);
	vma = find_exact_vma(mm, vm_start, vm_end);
	if (!vma)
		goto out_no_vma;

	if (vma->vm_file)
		result = proc_map_files_instantiate(dentry, task,
				(void *)(unsigned long)vma->vm_file->f_mode);

out_no_vma:
	mmap_read_unlock(mm);
out_put_mm:
	mmput(mm);
out_put_task:
	put_task_struct(task);
out:
	return result;
}

/* Inode operations for the map_files directory. */
static const struct inode_operations proc_map_files_inode_operations = {
	.lookup		= proc_map_files_lookup,
	.permission	= proc_fd_permission,
	.setattr	= proc_nochmod_setattr,
};

/*
 * readdir handler for the map_files directory.
 *
 * Returns -ENOENT if the task is gone and -EACCES if ptrace read access is
 * denied.  After emitting the dot entries, returns 0 early if the task has
 * no mm.
 */
static int
proc_map_files_readdir(struct file *file, struct dir_context *ctx)
{
	struct vm_area_struct *vma;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned long nr_files, pos, i;
	GENRADIX(struct map_files_info) fa;
	struct map_files_info *p;
	int ret;
	struct vma_iterator vmi;

	genradix_init(&fa);

	ret = -ENOENT;
	task = get_proc_task(file_inode(file));
	if (!task)
		goto out;

	ret = -EACCES;
	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
		goto out_put_task;

	ret = 0;
	if (!dir_emit_dots(file, ctx))
		goto out_put_task;

	mm = get_task_mm(task);
	if (!mm)
		goto out_put_task;
ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); goto out_put_task; } nr_files = 0; /* * We need two passes here: * * 1) Collect vmas of mapped files with mmap_lock taken * 2) Release mmap_lock and instantiate entries * * otherwise we get lockdep complained, since filldir() * routine might require mmap_lock taken in might_fault(). */ pos = 2; vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (++pos <= ctx->pos) continue; p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL); if (!p) { ret = -ENOMEM; mmap_read_unlock(mm); mmput(mm); goto out_put_task; } p->start = vma->vm_start; p->end = vma->vm_end; p->mode = vma->vm_file->f_mode; } mmap_read_unlock(mm); mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ unsigned int len; p = genradix_ptr(&fa, i); len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, buf, len, proc_map_files_instantiate, task, (void *)(unsigned long)p->mode)) break; ctx->pos++; } out_put_task: put_task_struct(task); out: genradix_free(&fa); return ret; } static const struct file_operations proc_map_files_operations = { .read = generic_read_dir, .iterate_shared = proc_map_files_readdir, .llseek = generic_file_llseek, }; #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { struct pid *pid; struct task_struct *task; struct pid_namespace *ns; }; static void *timers_start(struct seq_file *m, loff_t *pos) { struct timers_private *tp = m->private; tp->task = get_pid_task(tp->pid, PIDTYPE_PID); if (!tp->task) return ERR_PTR(-ESRCH); rcu_read_lock(); return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos); } static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos); } static void timers_stop(struct seq_file *m, void *v) { struct timers_private *tp = m->private; if 
(tp->task) { put_task_struct(tp->task); tp->task = NULL; rcu_read_unlock(); } } static int show_timer(struct seq_file *m, void *v) { static const char * const nstr[] = { [SIGEV_SIGNAL] = "signal", [SIGEV_NONE] = "none", [SIGEV_THREAD] = "thread", }; struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); struct timers_private *tp = m->private; int notify = timer->it_sigev_notify; guard(spinlock_irq)(&timer->it_lock); if (!posixtimer_valid(timer)) return 0; seq_printf(m, "ID: %d\n", timer->it_id); seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo, timer->sigq.info.si_value.sival_ptr); seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? "tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); return 0; } static const struct seq_operations proc_timers_seq_ops = { .start = timers_start, .next = timers_next, .stop = timers_stop, .show = show_timer, }; static int proc_timers_open(struct inode *inode, struct file *file) { struct timers_private *tp; tp = __seq_open_private(file, &proc_timers_seq_ops, sizeof(struct timers_private)); if (!tp) return -ENOMEM; tp->pid = proc_pid(inode); tp->ns = proc_pid_ns(inode->i_sb); return 0; } static const struct file_operations proc_timers_operations = { .open = proc_timers_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; #endif static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; u64 slack_ns; int err; err = kstrtoull_from_user(buf, count, 10, &slack_ns); if (err < 0) return err; p = get_proc_task(inode); if (!p) return -ESRCH; if (p != current) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); count = -EPERM; goto out; } rcu_read_unlock(); err = security_task_setscheduler(p); if (err) { count = err; goto out; } } 
task_lock(p); if (rt_or_dl_task_policy(p)) slack_ns = 0; else if (slack_ns == 0) slack_ns = p->default_timer_slack_ns; p->timer_slack_ns = slack_ns; task_unlock(p); out: put_task_struct(p); return count; } static int timerslack_ns_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; int err = 0; p = get_proc_task(inode); if (!p) return -ESRCH; if (p != current) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); err = -EPERM; goto out; } rcu_read_unlock(); err = security_task_getscheduler(p); if (err) goto out; } task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); task_unlock(p); out: put_task_struct(p); return err; } static int timerslack_ns_open(struct inode *inode, struct file *filp) { return single_open(filp, timerslack_ns_show, inode); } static const struct file_operations proc_pid_set_timerslack_ns_operations = { .open = timerslack_ns_open, .read = seq_read, .write = timerslack_ns_write, .llseek = seq_lseek, .release = single_release, }; static struct dentry *proc_pident_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; inode = proc_pid_make_inode(dentry->d_sb, task, p->mode); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); if (S_ISDIR(inode->i_mode)) set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) inode->i_op = p->iop; if (p->fop) inode->i_fop = p->fop; ei->op = p->op; pid_update_inode(task, inode); return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, const struct pid_entry *p, const struct pid_entry *end) { struct task_struct *task = get_proc_task(dir); struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; /* * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. 
*/ for (; p < end; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) { res = proc_pident_instantiate(dentry, task, p); break; } } put_task_struct(task); out_no_task: return res; } static int proc_pident_readdir(struct file *file, struct dir_context *ctx, const struct pid_entry *ents, unsigned int nents) { struct task_struct *task = get_proc_task(file_inode(file)); const struct pid_entry *p; if (!task) return -ENOENT; if (!dir_emit_dots(file, ctx)) goto out; if (ctx->pos >= nents + 2) goto out; for (p = ents + (ctx->pos - 2); p < ents + nents; p++) { if (!proc_fill_cache(file, ctx, p->name, p->len, proc_pident_instantiate, task, p)) break; ctx->pos++; } out: put_task_struct(task); return 0; } #ifdef CONFIG_SECURITY static int proc_pid_attr_open(struct inode *inode, struct file *file) { file->private_data = NULL; __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); return 0; } static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); char *p = NULL; ssize_t length; struct task_struct *task = get_proc_task(inode); if (!task) return -ESRCH; length = security_getprocattr(task, PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, &p); put_task_struct(task); if (length > 0) length = simple_read_from_buffer(buf, count, ppos, p, length); kfree(p); return length; } static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task; void *page; int rv; /* A task may only write when it was the opener. */ if (file->private_data != current->mm) return -EPERM; rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (!task) { rcu_read_unlock(); return -ESRCH; } /* A task may only write its own attributes. */ if (current != task) { rcu_read_unlock(); return -EACCES; } /* Prevent changes to overridden credentials. 
*/ if (current_cred() != current_real_cred()) { rcu_read_unlock(); return -EBUSY; } rcu_read_unlock(); if (count > PAGE_SIZE) count = PAGE_SIZE; /* No partial writes. */ if (*ppos != 0) return -EINVAL; page = memdup_user(buf, count); if (IS_ERR(page)) { rv = PTR_ERR(page); goto out; } /* Guard against adverse ptrace interaction */ rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex); if (rv < 0) goto out_free; rv = security_setprocattr(PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, page, count); mutex_unlock(&current->signal->cred_guard_mutex); out_free: kfree(page); out: return rv; } static const struct file_operations proc_pid_attr_operations = { .open = proc_pid_attr_open, .read = proc_pid_attr_read, .write = proc_pid_attr_write, .llseek = generic_file_llseek, .release = mem_release, }; #define LSM_DIR_OPS(LSM) \ static int proc_##LSM##_attr_dir_iterate(struct file *filp, \ struct dir_context *ctx) \ { \ return proc_pident_readdir(filp, ctx, \ LSM##_attr_dir_stuff, \ ARRAY_SIZE(LSM##_attr_dir_stuff)); \ } \ \ static const struct file_operations proc_##LSM##_attr_dir_ops = { \ .read = generic_read_dir, \ .iterate_shared = proc_##LSM##_attr_dir_iterate, \ .llseek = default_llseek, \ }; \ \ static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \ struct dentry *dentry, unsigned int flags) \ { \ return proc_pident_lookup(dir, dentry, \ LSM##_attr_dir_stuff, \ LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ } \ \ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ .lookup = proc_##LSM##_attr_dir_lookup, \ .getattr = pid_getattr, \ .setattr = proc_nochmod_setattr, \ } #ifdef CONFIG_SECURITY_SMACK static const struct pid_entry smack_attr_dir_stuff[] = { ATTR(LSM_ID_SMACK, "current", 0666), }; LSM_DIR_OPS(smack); #endif #ifdef CONFIG_SECURITY_APPARMOR static const struct pid_entry apparmor_attr_dir_stuff[] = { ATTR(LSM_ID_APPARMOR, "current", 0666), ATTR(LSM_ID_APPARMOR, "prev", 0444), 
ATTR(LSM_ID_APPARMOR, "exec", 0666), }; LSM_DIR_OPS(apparmor); #endif static const struct pid_entry attr_dir_stuff[] = { ATTR(LSM_ID_UNDEF, "current", 0666), ATTR(LSM_ID_UNDEF, "prev", 0444), ATTR(LSM_ID_UNDEF, "exec", 0666), ATTR(LSM_ID_UNDEF, "fscreate", 0666), ATTR(LSM_ID_UNDEF, "keycreate", 0666), ATTR(LSM_ID_UNDEF, "sockcreate", 0666), #ifdef CONFIG_SECURITY_SMACK DIR("smack", 0555, proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), #endif #ifdef CONFIG_SECURITY_APPARMOR DIR("apparmor", 0555, proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops), #endif }; static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); } static const struct file_operations proc_attr_dir_operations = { .read = generic_read_dir, .iterate_shared = proc_attr_dir_readdir, .llseek = generic_file_llseek, }; static struct dentry *proc_attr_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, attr_dir_stuff, attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff)); } static const struct inode_operations proc_attr_dir_inode_operations = { .lookup = proc_attr_dir_lookup, .getattr = pid_getattr, .setattr = proc_nochmod_setattr, }; #endif #ifdef CONFIG_ELF_CORE static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; char buffer[PROC_NUMBUF]; size_t len; int ret; if (!task) return -ESRCH; ret = 0; mm = get_task_mm(task); if (mm) { unsigned long flags = __mm_flags_get_dumpable(mm); len = snprintf(buffer, sizeof(buffer), "%08lx\n", ((flags & MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT)); mmput(mm); ret = simple_read_from_buffer(buf, count, ppos, buffer, len); } put_task_struct(task); return ret; } static ssize_t proc_coredump_filter_write(struct file *file, const char __user *buf, size_t count, 
loff_t *ppos) { struct task_struct *task; struct mm_struct *mm; unsigned int val; int ret; int i; unsigned long mask; ret = kstrtouint_from_user(buf, count, 0, &val); if (ret < 0) return ret; ret = -ESRCH; task = get_proc_task(file_inode(file)); if (!task) goto out_no_task; mm = get_task_mm(task); if (!mm) goto out_no_mm; ret = 0; for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { if (val & mask) mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm); else mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm); } mmput(mm); out_no_mm: put_task_struct(task); out_no_task: if (ret < 0) return ret; return count; } static const struct file_operations proc_coredump_filter_operations = { .read = proc_coredump_filter_read, .write = proc_coredump_filter_write, .llseek = generic_file_llseek, }; #endif #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { struct task_io_accounting acct; int result; result = down_read_killable(&task->signal->exec_update_lock); if (result) return result; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { result = -EACCES; goto out_unlock; } if (whole) { struct signal_struct *sig = task->signal; struct task_struct *t; guard(rcu)(); scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { acct = sig->ioac; __for_each_thread(sig, t) task_io_accounting_add(&acct, &t->ioac); } } else { acct = task->ioac; } seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" "syscw: %llu\n" "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", (unsigned long long)acct.rchar, (unsigned long long)acct.wchar, (unsigned long long)acct.syscr, (unsigned long long)acct.syscw, (unsigned long long)acct.read_bytes, (unsigned long long)acct.write_bytes, (unsigned long long)acct.cancelled_write_bytes); result = 0; out_unlock: up_read(&task->signal->exec_update_lock); return result; } static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, 
struct task_struct *task) { return do_io_accounting(task, m, 0); } static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_io_accounting(task, m, 1); } #endif /* CONFIG_TASK_IO_ACCOUNTING */ #ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; struct seq_file *seq; int ret = -EINVAL; task = get_proc_task(inode); if (task) { rcu_read_lock(); ns = get_user_ns(task_cred_xxx(task, user_ns)); rcu_read_unlock(); put_task_struct(task); } if (!ns) goto err; ret = seq_open(file, seq_ops); if (ret) goto err_put_ns; seq = file->private_data; seq->private = ns; return 0; err_put_ns: put_user_ns(ns); err: return ret; } static int proc_id_map_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; put_user_ns(ns); return seq_release(inode, file); } static int proc_uid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_uid_seq_operations); } static int proc_gid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_gid_seq_operations); } static int proc_projid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_projid_seq_operations); } static const struct file_operations proc_uid_map_operations = { .open = proc_uid_map_open, .write = proc_uid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static const struct file_operations proc_gid_map_operations = { .open = proc_gid_map_open, .write = proc_gid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static const struct file_operations proc_projid_map_operations = { .open = proc_projid_map_open, .write = proc_projid_map_write, .read = seq_read, 
.llseek = seq_lseek, .release = proc_id_map_release, }; static int proc_setgroups_open(struct inode *inode, struct file *file) { struct user_namespace *ns = NULL; struct task_struct *task; int ret; ret = -ESRCH; task = get_proc_task(inode); if (task) { rcu_read_lock(); ns = get_user_ns(task_cred_xxx(task, user_ns)); rcu_read_unlock(); put_task_struct(task); } if (!ns) goto err; if (file->f_mode & FMODE_WRITE) { ret = -EACCES; if (!ns_capable(ns, CAP_SYS_ADMIN)) goto err_put_ns; } ret = single_open(file, &proc_setgroups_show, ns); if (ret) goto err_put_ns; return 0; err_put_ns: put_user_ns(ns); err: return ret; } static int proc_setgroups_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; int ret = single_release(inode, file); put_user_ns(ns); return ret; } static const struct file_operations proc_setgroups_operations = { .open = proc_setgroups_open, .write = proc_setgroups_write, .read = seq_read, .llseek = seq_lseek, .release = proc_setgroups_release, }; #endif /* CONFIG_USER_NS */ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { int err = lock_trace(task); if (!err) { seq_printf(m, "%08x\n", task->personality); unlock_trace(task); } return err; } #ifdef CONFIG_LIVEPATCH static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { seq_printf(m, "%d\n", task->patch_state); return 0; } #endif /* CONFIG_LIVEPATCH */ #ifdef CONFIG_KSM static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; mm = get_task_mm(task); if (mm) { seq_printf(m, "%lu\n", mm->ksm_merging_pages); mmput(mm); } return 0; } static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; int ret = 0; mm = 
get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); seq_printf(m, "ksm_merge_any: %s\n", mm_flags_test(MMF_VM_MERGE_ANY, mm) ? "yes" : "no"); ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); return ret; } seq_printf(m, "ksm_mergeable: %s\n", ksm_process_mergeable(mm) ? "yes" : "no"); mmap_read_unlock(mm); mmput(mm); } return 0; } #endif /* CONFIG_KSM */ #ifdef CONFIG_KSTACK_ERASE_METRICS static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long prev_depth = THREAD_SIZE - (task->prev_lowest_stack & (THREAD_SIZE - 1)); unsigned long depth = THREAD_SIZE - (task->lowest_stack & (THREAD_SIZE - 1)); seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n", prev_depth, depth); return 0; } #endif /* CONFIG_KSTACK_ERASE_METRICS */ /* * Thread groups */ static const struct file_operations proc_task_operations; static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), 
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif #ifdef CONFIG_TIME_NS REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), LNK("root", proc_root_link), LNK("exe", proc_exe_link), REG("mounts", S_IRUGO, proc_mounts_operations), REG("mountinfo", S_IRUGO, proc_mountinfo_operations), REG("mountstats", S_IRUSR, proc_mountstats_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif #ifdef CONFIG_PROC_CPU_RESCTRL ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_ELF_CORE REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tgid_io_accounting), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) REG("timers", S_IRUGO, proc_timers_operations), #endif REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif #ifdef CONFIG_KSTACK_ERASE_METRICS ONE("stack_depth", S_IRUGO, proc_stack_depth), #endif #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, .iterate_shared = proc_tgid_base_readdir, .llseek = generic_file_llseek, }; struct pid *tgid_pidfd_to_pid(const struct file *file) { if (file->f_op != &proc_tgid_base_operations) return ERR_PTR(-EBADF); return 
proc_pid(file_inode(file)); } static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, tgid_base_stuff, tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff)); } static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_nochmod_setattr, .permission = proc_pid_permission, }; /** * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache. * @pid: pid that should be flushed. * * This function walks a list of inodes (that belong to any proc * filesystem) that are attached to the pid and flushes them from * the dentry cache. * * It is safe and reasonable to cache /proc entries for a task until * that task exits. After that they just clog up the dcache with * useless entries, possibly causing useful dcache entries to be * flushed instead. This routine is provided to flush those useless * dcache entries when a process is reaped. * * NOTE: This routine is just an optimization so it does not guarantee * that no dcache entries will exist after a process is reaped * it just makes it very unlikely that any will persist. 
*/ void proc_flush_pid(struct pid *pid) { proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock); } static struct dentry *proc_pid_instantiate(struct dentry * dentry, struct task_struct *task, const void *ptr) { struct inode *inode; inode = proc_pid_make_base_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; set_nlink(inode, nlink_tgid); pid_update_inode(task, inode); return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { struct task_struct *task; unsigned tgid; struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) goto out; fs_info = proc_sb_info(dentry->d_sb); ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tgid, ns); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) goto out; /* Limit procfs to only ptraceable tasks */ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) { if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS)) goto out_put_task; } result = proc_pid_instantiate(dentry, task, NULL); out_put_task: put_task_struct(task); out: return result; } /* * Find the first task with tgid >= tgid * */ struct tgid_iter { unsigned int tgid; struct task_struct *task; }; static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter) { struct pid *pid; if (iter.task) put_task_struct(iter.task); rcu_read_lock(); retry: iter.task = NULL; pid = find_ge_pid(iter.tgid, ns); if (pid) { iter.tgid = pid_nr_ns(pid, ns); iter.task = pid_task(pid, PIDTYPE_TGID); if (!iter.task) { iter.tgid += 1; goto retry; } get_task_struct(iter.task); } rcu_read_unlock(); return iter; } #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2) /* for the /proc/ directory itself, after non-process stuff has 
been done */ int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb); struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb); loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) return 0; if (pos == TGID_OFFSET - 2) { if (!dir_emit(ctx, "self", 4, self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { if (!dir_emit(ctx, "thread-self", 11, thread_self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } iter.tgid = pos - TGID_OFFSET; iter.task = NULL; for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[10 + 1]; unsigned int len; cond_resched(); if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE)) continue; len = snprintf(name, sizeof(name), "%u", iter.tgid); ctx->pos = iter.tgid + TGID_OFFSET; if (!proc_fill_cache(file, ctx, name, len, proc_pid_instantiate, iter.task, NULL)) { put_task_struct(iter.task); return 0; } } ctx->pos = PID_MAX_LIMIT + TGID_OFFSET; return 0; } /* * proc_tid_comm_permission is a special permission function exclusively * used for the node /proc/<pid>/task/<tid>/comm. * It bypasses generic permission checks in the case where a task of the same * task group attempts to access the node. * The rationale behind this is that glibc and bionic access this node for * cross thread naming (pthread_set/getname_np(!self)). However, if * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0, * which locks out the cross thread naming implementation. * This function makes sure that the node is always accessible for members of * same thread group. 
 */
static int proc_tid_comm_permission(struct mnt_idmap *idmap,
				    struct inode *inode, int mask)
{
	bool is_same_tgroup;
	struct task_struct *task;

	task = get_proc_task(inode);
	if (!task)
		return -ESRCH;
	/* Only the comparison result is needed; drop the task reference right away. */
	is_same_tgroup = same_thread_group(current, task);
	put_task_struct(task);

	if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
		/* This file (/proc/<pid>/task/<tid>/comm) can always be
		 * read or written by the members of the corresponding
		 * thread group.
		 */
		return 0;
	}

	/* Everyone else (and any MAY_EXEC request) gets the generic check. */
	return generic_permission(&nop_mnt_idmap, inode, mask);
}

static const struct inode_operations proc_tid_comm_inode_operations = {
	.setattr	= proc_nochmod_setattr,
	.permission	= proc_tid_comm_permission,
};

/*
 * Tasks
 *
 * Table of the entries listed and looked up in a per-thread directory; it is
 * consumed by proc_tid_base_readdir(), proc_tid_base_lookup() and
 * set_proc_pid_nlink() below.
 */
static const struct pid_entry tid_base_stuff[] = {
	DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
	DIR("fdinfo",    S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
	DIR("ns",	 S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
	DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
	REG("environ",   S_IRUSR, proc_environ_operations),
	REG("auxv",      S_IRUSR, proc_auxv_operations),
	ONE("status",    S_IRUGO, proc_pid_status),
	ONE("personality", S_IRUSR, proc_pid_personality),
	ONE("limits",	 S_IRUGO, proc_pid_limits),
	REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
	NOD("comm",      S_IFREG|S_IRUGO|S_IWUSR,
			 &proc_tid_comm_inode_operations,
			 &proc_pid_set_comm_operations, {}),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
	ONE("syscall",   S_IRUSR, proc_pid_syscall),
#endif
	REG("cmdline",   S_IRUGO, proc_pid_cmdline_ops),
	ONE("stat",      S_IRUGO, proc_tid_stat),
	ONE("statm",     S_IRUGO, proc_pid_statm),
	REG("maps",      S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_PROC_CHILDREN
	REG("children",  S_IRUGO, proc_tid_children_operations),
#endif
#ifdef CONFIG_NUMA
	REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
	REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
	LNK("cwd",       proc_cwd_link),
	LNK("root",      proc_root_link),
	LNK("exe",       proc_exe_link),
	REG("mounts",    S_IRUGO, proc_mounts_operations),
	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
	REG("smaps",     S_IRUGO, proc_pid_smaps_operations),
	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
	DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
	ONE("wchan",     S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
	ONE("stack",      S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
	ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
	REG("latency",  S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
	ONE("cpuset",    S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
	ONE("cgroup",  S_IRUGO, proc_cgroup_show),
#endif
#ifdef CONFIG_PROC_CPU_RESCTRL
	ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
#endif
	ONE("oom_score", S_IRUGO, proc_oom_score),
	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDIT
	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
	REG("fail-nth", 0644, proc_fail_nth_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
	ONE("io",	S_IRUSR, proc_tid_io_accounting),
#endif
#ifdef CONFIG_USER_NS
	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
	REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#ifdef CONFIG_LIVEPATCH
	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
	ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
#ifdef CONFIG_KSM
	ONE("ksm_merging_pages",  S_IRUSR, proc_pid_ksm_merging_pages),
	ONE("ksm_stat",  S_IRUSR, proc_pid_ksm_stat),
#endif
};

/* readdir for a per-thread directory: iterate over tid_base_stuff. */
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
{
	return proc_pident_readdir(file, ctx,
				   tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
}

/* lookup for a per-thread directory: search tid_base_stuff for the name. */
static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
	return proc_pident_lookup(dir, dentry,
				  tid_base_stuff,
				  tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
}

static const struct file_operations proc_tid_base_operations = {
	.read		= generic_read_dir,
	.iterate_shared	= proc_tid_base_readdir,
	.llseek		= generic_file_llseek,
};

static const struct inode_operations proc_tid_base_inode_operations = {
	.lookup		= proc_tid_base_lookup,
	.getattr	= pid_getattr,
	.setattr	= proc_nochmod_setattr,
};

/*
 * Create the directory inode for one thread and splice it onto @dentry.
 * Returns ERR_PTR(-ENOENT) when no inode could be made. @ptr is unused.
 */
static struct dentry *proc_task_instantiate(struct dentry *dentry,
	struct task_struct *task, const void *ptr)
{
	struct inode *inode;
	inode = proc_pid_make_base_inode(dentry->d_sb, task,
					 S_IFDIR | S_IRUGO | S_IXUGO);
	if (!inode)
		return ERR_PTR(-ENOENT);

	inode->i_op = &proc_tid_base_inode_operations;
	inode->i_fop = &proc_tid_base_operations;
	inode->i_flags |= S_IMMUTABLE;

	set_nlink(inode, nlink_tid);
	pid_update_inode(task, inode);

	return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}

/*
 * Look up a numeric name inside a task directory.
 *
 * The name must parse as a tid, resolve to a task in the superblock's pid
 * namespace, and that task must share a thread group with the task behind
 * @dir; otherwise ERR_PTR(-ENOENT) is returned.
 */
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
	struct task_struct *task;
	struct task_struct *leader = get_proc_task(dir);
	unsigned tid;
	struct proc_fs_info *fs_info;
	struct pid_namespace *ns;
	struct dentry *result = ERR_PTR(-ENOENT);

	if (!leader)
		goto out_no_task;

	tid = name_to_int(&dentry->d_name);
	if (tid == ~0U)
		goto out;

	fs_info = proc_sb_info(dentry->d_sb);
	ns = fs_info->pid_ns;
	/* Take the task reference while still inside the RCU read section. */
	rcu_read_lock();
	task = find_task_by_pid_ns(tid, ns);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();
	if (!task)
		goto out;
	if (!same_thread_group(leader, task))
		goto out_drop_task;

	result = proc_task_instantiate(dentry, task, NULL);
out_drop_task:
	put_task_struct(task);
out:
	put_task_struct(leader);
out_no_task:
	return result;
}

/*
 * Find the first tid of a thread group to return to user space.
 *
 * Usually this is just the thread group leader, but if the user's
 * buffer was too small or there was a seek into the middle of the
 * directory we have more work to do.
 *
 * In the case of a short read we start with find_task_by_pid.
 *
 * In the case of a seek we start with the leader and walk nr
 * threads past it.
 *
 * On success the returned task carries a reference taken here.
 */
static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
					struct pid_namespace *ns)
{
	struct task_struct *pos, *task;
	unsigned long nr = f_pos;

	if (nr != f_pos)	/* 32bit overflow? */
		return NULL;

	rcu_read_lock();
	task = pid_task(pid, PIDTYPE_PID);
	if (!task)
		goto fail;

	/* Attempt to start with the tid of a thread */
	if (tid && nr) {
		pos = find_task_by_pid_ns(tid, ns);
		if (pos && same_thread_group(pos, task))
			goto found;
	}

	/* If nr exceeds the number of threads there is nothing to do */
	if (nr >= get_nr_threads(task))
		goto fail;

	/* If we haven't found our starting place yet start
	 * with the leader and walk nr threads forward.
	 */
	for_each_thread(task, pos) {
		if (!nr--)
			goto found;
	}
fail:
	pos = NULL;
	goto out;
found:
	get_task_struct(pos);
out:
	rcu_read_unlock();
	return pos;
}

/*
 * Find the next thread in the thread list.
 * Return NULL if there is an error or no next thread.
 *
 * The reference to the input task_struct is released.
 */
static struct task_struct *next_tid(struct task_struct *start)
{
	struct task_struct *pos = NULL;
	rcu_read_lock();
	if (pid_alive(start)) {
		pos = __next_thread(start);
		if (pos)
			get_task_struct(pos);
	}
	rcu_read_unlock();
	put_task_struct(start);
	return pos;
}

/* for the /proc/TGID/task/ directories */
static int proc_task_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	struct task_struct *task;
	struct pid_namespace *ns;
	int tid;

	if (proc_inode_is_dead(inode))
		return -ENOENT;

	if (!dir_emit_dots(file, ctx))
		return 0;

	/* We cache the tgid value that the last readdir call couldn't
	 * return and lseek resets it to 0.
	 */
	ns = proc_pid_ns(inode->i_sb);
	tid = (int)(intptr_t)file->private_data;
	file->private_data = NULL;
	/* ctx->pos - 2: the first two positions are the dot entries emitted above. */
	for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
	     task;
	     task = next_tid(task), ctx->pos++) {
		char name[10 + 1];
		unsigned int len;

		tid = task_pid_nr_ns(task, ns);
		if (!tid)
			continue;	/* The task has just exited. */
		len = snprintf(name, sizeof(name), "%d", tid);
		if (!proc_fill_cache(file, ctx, name, len,
				proc_task_instantiate, task, NULL)) {
			/* returning this tgid failed, save it as the first
			 * pid for the next readdir call */
			file->private_data = (void *)(intptr_t)tid;
			/* next_tid() is skipped by the break, so drop the reference here. */
			put_task_struct(task);
			break;
		}
	}

	return 0;
}

/*
 * getattr for a task directory: the generic attributes, with the link count
 * increased by the current number of threads when the task still exists.
 */
static int proc_task_getattr(struct mnt_idmap *idmap,
			     const struct path *path, struct kstat *stat,
			     u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = d_inode(path->dentry);
	struct task_struct *p = get_proc_task(inode);
	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);

	if (p) {
		stat->nlink += get_nr_threads(p);
		put_task_struct(p);
	}

	return 0;
}

/*
 * proc_task_readdir() set @file->private_data to a positive integer
 * value, so casting that to u64 is safe. generic_llseek_cookie() will
 * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is
 * here to catch any unexpected change in behavior either in
 * proc_task_readdir() or generic_llseek_cookie().
 */
static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence)
{
	u64 cookie = (u64)(intptr_t)file->private_data;
	loff_t off;

	off = generic_llseek_cookie(file, offset, whence, &cookie);
	WARN_ON_ONCE(cookie > INT_MAX);
	file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */
	return off;
}

static const struct inode_operations proc_task_inode_operations = {
	.lookup		= proc_task_lookup,
	.getattr	= proc_task_getattr,
	.setattr	= proc_nochmod_setattr,
	.permission	= proc_pid_permission,
};

static const struct file_operations proc_task_operations = {
	.read		= generic_read_dir,
	.iterate_shared	= proc_task_readdir,
	.llseek		= proc_dir_llseek,
};

/* Compute the cached link counts (nlink_tid, nlink_tgid) from the entry tables. */
void __init set_proc_pid_nlink(void)
{
	nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
	nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
}
10 153 60 513 313 420 296 462 421 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Cryptographic scatter and gather helpers. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2002 Adam J. Richter <adam@yggdrasil.com> * Copyright (c) 2004 Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_SCATTERWALK_H #define _CRYPTO_SCATTERWALK_H #include <crypto/algapi.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/scatterlist.h> static inline void scatterwalk_crypto_chain(struct scatterlist *head, struct scatterlist *sg, int num) { if (sg) sg_chain(head, num, sg); else sg_mark_end(head); } static inline void scatterwalk_start(struct scatter_walk *walk, struct scatterlist *sg) { walk->sg = sg; walk->offset = sg->offset; } /* * This is equivalent to scatterwalk_start(walk, sg) followed by * scatterwalk_skip(walk, pos). 
*/ static inline void scatterwalk_start_at_pos(struct scatter_walk *walk, struct scatterlist *sg, unsigned int pos) { while (pos > sg->length) { pos -= sg->length; sg = sg_next(sg); } walk->sg = sg; walk->offset = sg->offset + pos; } static inline unsigned int scatterwalk_clamp(struct scatter_walk *walk, unsigned int nbytes) { unsigned int len_this_sg; unsigned int limit; if (walk->offset >= walk->sg->offset + walk->sg->length) scatterwalk_start(walk, sg_next(walk->sg)); len_this_sg = walk->sg->offset + walk->sg->length - walk->offset; /* * HIGHMEM case: the page may have to be mapped into memory. To avoid * the complexity of having to map multiple pages at once per sg entry, * clamp the returned length to not cross a page boundary. * * !HIGHMEM case: no mapping is needed; all pages of the sg entry are * already mapped contiguously in the kernel's direct map. For improved * performance, allow the walker to return data segments that cross a * page boundary. Do still cap the length to PAGE_SIZE, since some * users rely on that to avoid disabling preemption for too long when * using SIMD. It's also needed for when skcipher_walk uses a bounce * page due to the data not being aligned to the algorithm's alignmask. */ if (IS_ENABLED(CONFIG_HIGHMEM)) limit = PAGE_SIZE - offset_in_page(walk->offset); else limit = PAGE_SIZE; return min3(nbytes, len_this_sg, limit); } /* * Create a scatterlist that represents the remaining data in a walk. Uses * chaining to reference the original scatterlist, so this uses at most two * entries in @sg_out regardless of the number of entries in the original list. * Assumes that sg_init_table() was already done. 
*/ static inline void scatterwalk_get_sglist(struct scatter_walk *walk, struct scatterlist sg_out[2]) { if (walk->offset >= walk->sg->offset + walk->sg->length) scatterwalk_start(walk, sg_next(walk->sg)); sg_set_page(sg_out, sg_page(walk->sg), walk->sg->offset + walk->sg->length - walk->offset, walk->offset); scatterwalk_crypto_chain(sg_out, sg_next(walk->sg), 2); } static inline void scatterwalk_map(struct scatter_walk *walk) { struct page *base_page = sg_page(walk->sg); unsigned int offset = walk->offset; void *addr; if (IS_ENABLED(CONFIG_HIGHMEM)) { struct page *page; page = base_page + (offset >> PAGE_SHIFT); offset = offset_in_page(offset); addr = kmap_local_page(page) + offset; } else { /* * When !HIGHMEM we allow the walker to return segments that * span a page boundary; see scatterwalk_clamp(). To make it * clear that in this case we're working in the linear buffer of * the whole sg entry in the kernel's direct map rather than * within the mapped buffer of a single page, compute the * address as an offset from the page_address() of the first * page of the sg entry. Either way the result is the address * in the direct map, but this makes it clearer what is really * going on. */ addr = page_address(base_page) + offset; } walk->__addr = addr; } /** * scatterwalk_next() - Get the next data buffer in a scatterlist walk * @walk: the scatter_walk * @total: the total number of bytes remaining, > 0 * * A virtual address for the next segment of data from the scatterlist will * be placed into @walk->addr. The caller must call scatterwalk_done_src() * or scatterwalk_done_dst() when it is done using this virtual address. 
* * Returns: the next number of bytes available, <= @total */ static inline unsigned int scatterwalk_next(struct scatter_walk *walk, unsigned int total) { unsigned int nbytes = scatterwalk_clamp(walk, total); scatterwalk_map(walk); return nbytes; } static inline void scatterwalk_unmap(struct scatter_walk *walk) { if (IS_ENABLED(CONFIG_HIGHMEM)) kunmap_local(walk->__addr); } static inline void scatterwalk_advance(struct scatter_walk *walk, unsigned int nbytes) { walk->offset += nbytes; } /** * scatterwalk_done_src() - Finish one step of a walk of source scatterlist * @walk: the scatter_walk * @nbytes: the number of bytes processed this step, less than or equal to the * number of bytes that scatterwalk_next() returned. * * Use this if the mapped address was not written to, i.e. it is source data. */ static inline void scatterwalk_done_src(struct scatter_walk *walk, unsigned int nbytes) { scatterwalk_unmap(walk); scatterwalk_advance(walk, nbytes); } /* * Flush the dcache of any pages that overlap the region * [offset, offset + nbytes) relative to base_page. * * This should be called only when ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE, to ensure * that all relevant code (including the call to sg_page() in the caller, if * applicable) gets fully optimized out when !ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE. */ static inline void __scatterwalk_flush_dcache_pages(struct page *base_page, unsigned int offset, unsigned int nbytes) { unsigned int num_pages; base_page += offset / PAGE_SIZE; offset %= PAGE_SIZE; /* * This is an overflow-safe version of * num_pages = DIV_ROUND_UP(offset + nbytes, PAGE_SIZE). 
*/ num_pages = nbytes / PAGE_SIZE; num_pages += DIV_ROUND_UP(offset + (nbytes % PAGE_SIZE), PAGE_SIZE); for (unsigned int i = 0; i < num_pages; i++) flush_dcache_page(base_page + i); } /** * scatterwalk_done_dst() - Finish one step of a walk of destination scatterlist * @walk: the scatter_walk * @nbytes: the number of bytes processed this step, less than or equal to the * number of bytes that scatterwalk_next() returned. * * Use this if the mapped address may have been written to, i.e. it is * destination data. */ static inline void scatterwalk_done_dst(struct scatter_walk *walk, unsigned int nbytes) { scatterwalk_unmap(walk); if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE) __scatterwalk_flush_dcache_pages(sg_page(walk->sg), walk->offset, nbytes); scatterwalk_advance(walk, nbytes); } void scatterwalk_skip(struct scatter_walk *walk, unsigned int nbytes); void memcpy_from_scatterwalk(void *buf, struct scatter_walk *walk, unsigned int nbytes); void memcpy_to_scatterwalk(struct scatter_walk *walk, const void *buf, unsigned int nbytes); void memcpy_from_sglist(void *buf, struct scatterlist *sg, unsigned int start, unsigned int nbytes); void memcpy_to_sglist(struct scatterlist *sg, unsigned int start, const void *buf, unsigned int nbytes); void memcpy_sglist(struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes); /* In new code, please use memcpy_{from,to}_sglist() directly instead. */ static inline void scatterwalk_map_and_copy(void *buf, struct scatterlist *sg, unsigned int start, unsigned int nbytes, int out) { if (out) memcpy_to_sglist(sg, start, buf, nbytes); else memcpy_from_sglist(buf, sg, start, nbytes); } struct scatterlist *scatterwalk_ffwd(struct scatterlist dst[2], struct scatterlist *src, unsigned int len); #endif /* _CRYPTO_SCATTERWALK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ /* * PHY device list allow maintaining a list of PHY devices that are * part of a netdevice's link topology. PHYs can for example be chained, * as is the case when using a PHY that exposes an SFP module, on which an * SFP transceiver that embeds a PHY is connected. * * This list can then be used by userspace to leverage individual PHY * capabilities. */ #ifndef __PHY_LINK_TOPOLOGY_H #define __PHY_LINK_TOPOLOGY_H #include <linux/ethtool.h> #include <linux/netdevice.h> struct xarray; struct phy_device; struct sfp_bus; struct phy_link_topology { struct xarray phys; u32 next_phy_index; }; struct phy_device_node { enum phy_upstream upstream_type; union { struct net_device *netdev; struct phy_device *phydev; } upstream; struct sfp_bus *parent_sfp_bus; struct phy_device *phy; }; #if IS_ENABLED(CONFIG_PHYLIB) int phy_link_topo_add_phy(struct net_device *dev, struct phy_device *phy, enum phy_upstream upt, void *upstream); void phy_link_topo_del_phy(struct net_device *dev, struct phy_device *phy); static inline struct phy_device * phy_link_topo_get_phy(struct net_device *dev, u32 phyindex) { struct phy_link_topology *topo = dev->link_topo; struct phy_device_node *pdn; if (!topo) return NULL; pdn = xa_load(&topo->phys, phyindex); if (pdn) return pdn->phy; return NULL; } #else static inline int phy_link_topo_add_phy(struct net_device *dev, struct phy_device *phy, enum phy_upstream upt, void *upstream) { return 0; } static inline void phy_link_topo_del_phy(struct net_device *dev, struct phy_device *phy) { } static inline struct phy_device * phy_link_topo_get_phy(struct net_device *dev, u32 phyindex) { return NULL; } #endif #endif /* __PHY_LINK_TOPOLOGY_H */
// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>
#include <linux/numa_memblks.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/amd/nb.h>

#include "mm_internal.h"

int numa_off;

/*
 * Parse the "numa=" early parameter. The recognised prefixes are "off",
 * "fake=", "noacpi" and "nohmat"; matching is by prefix (strncmp), and only
 * "fake=" returns early with the result of numa_emu_cmdline().
 */
static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

nodemask_t numa_phys_nodes_parsed __initdata;

/* Node of @cpu as recorded for its APIC ID, or NUMA_NO_NODE if the ID is bad. */
int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

/* Number of bits set in numa_phys_nodes_parsed. */
int __init num_phys_nodes(void)
{
	return bitmap_weight(numa_phys_nodes_parsed.bits, MAX_NUMNODES);
}

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

/* Record that @cpu belongs to @node, in whichever map is currently live. */
void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

/* Reset @cpu's node assignment to NUMA_NO_NODE. */
void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

/*
 * Allocate node data for, and mark online, every possible node that spans a
 * non-empty pfn range. Returns -EINVAL when the memblock NUMA coverage check
 * fails, 0 otherwise.
 */
static int __init numa_register_nodes(void)
{
	int nid;

	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Note, get_pfn_range_for_nid() depends on
		 * memblock_set_node() having already happened
		 */
		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (start_pfn >= end_pfn)
			continue;

		alloc_node_data(nid);
		node_set_online(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round robin the existing
 * nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		/* CPUs that already have a node keep it. */
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}

/*
 * Run one NUMA initialization method (@init_func) and finish the setup that
 * is common to all methods. Returns a negative value as soon as a step fails.
 */
static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
	if (ret < 0)
		return ret;

	ret = numa_register_nodes();
	if (ret < 0)
		return ret;

	/* Drop assignments to nodes that did not come online. */
	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory.  This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	node_set(0, numa_nodes_parsed);
	node_set(0, numa_phys_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds.  The
 * last fallback is dummy single node config encompassing whole memory and
 * never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}


/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing either memory and/or CPUs
 * will already be online and there is no need to do anything extra, even if
 * they also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 * bringup_nonboot_cpus
	 *  cpu_up
	 *   __try_online_node
	 *    register_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
 * and apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and faking node case (when running a kernel compiled
 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round robin manner at numa_init_array,
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 * bringup_nonboot_cpus
		 *  cpu_up
		 *   __try_online_node
		 *    register_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
/* Set @cpu in the cpumask of the node it is currently mapped to. */
void numa_add_cpu(unsigned int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

/* Clear @cpu from the cpumask of the node it is currently mapped to. */
void numa_remove_cpu(unsigned int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif	/* !CONFIG_NUMA_EMU */

#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */

/* Debug variant of cpu_to_node(): warns when used while the early map is still live. */
int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
			"cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

/*
 * Set (@enable true) or clear (@enable false) @cpu in @node's cpumask and log
 * the resulting mask. Does nothing for NUMA_NO_NODE; reports an error when the
 * node's mask is not available.
 */
void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
		enable ? "numa_add_cpu" : "numa_remove_cpu",
		cpu, node, cpumask_pr_args(mask));
	return;
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif	/* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	/* The unsigned cast makes negative node numbers fail the range check too. */
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
			"cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
			node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
			node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_EMU
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
					unsigned int nr_emu_nids)
{
	int i, j;

	/*
	 * Transform __apicid_to_node table to use emulated nids by
	 * reverse-mapping phys_nid.  The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < nr_emu_nids; j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		__apicid_to_node[i] = j < nr_emu_nids ? j : 0;
	}
}

/* Physical address corresponding to MAX_DMA32_PFN. */
u64 __init numa_emu_dma_end(void)
{
	return PFN_PHYS(MAX_DMA32_PFN);
}
#endif /* CONFIG_NUMA_EMU */
8 3 6 6 6 5 6 8659 8676 3980 8224 8682 8697 6687 8205 1914 1913 31 1226 1200 7 7 2 4 1 6 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 
503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 // SPDX-License-Identifier: GPL-2.0-only /* * lib/bitmap.c * Helper functions for bitmap.h. */ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/ctype.h> #include <linux/device.h> #include <linux/export.h> #include <linux/slab.h> /** * DOC: bitmap introduction * * bitmaps provide an array of bits, implemented using an * array of unsigned longs. 
The number of valid bits in a * given bitmap does _not_ need to be an exact multiple of * BITS_PER_LONG. * * The possible unused bits in the last, partially used word * of a bitmap are 'don't care'. The implementation makes * no particular effort to keep them zero. It ensures that * their value will not affect the results of any operation. * The bitmap operations that return Boolean (bitmap_empty, * for example) or scalar (bitmap_weight, for example) results * carefully filter out these unused bits from impacting their * results. * * The byte ordering of bitmaps is more natural on little * endian architectures. See the big-endian headers * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h * for the best explanations of this ordering. */ bool __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] != bitmap2[k]) return false; if (bits % BITS_PER_LONG) if ((bitmap1[k] ^ bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return false; return true; } EXPORT_SYMBOL(__bitmap_equal); bool __bitmap_or_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, const unsigned long *bitmap3, unsigned int bits) { unsigned int k, lim = bits / BITS_PER_LONG; unsigned long tmp; for (k = 0; k < lim; ++k) { if ((bitmap1[k] | bitmap2[k]) != bitmap3[k]) return false; } if (!(bits % BITS_PER_LONG)) return true; tmp = (bitmap1[k] | bitmap2[k]) ^ bitmap3[k]; return (tmp & BITMAP_LAST_WORD_MASK(bits)) == 0; } EXPORT_SYMBOL(__bitmap_or_equal); void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { unsigned int k, lim = BITS_TO_LONGS(bits); for (k = 0; k < lim; ++k) dst[k] = ~src[k]; } EXPORT_SYMBOL(__bitmap_complement); /** * __bitmap_shift_right - logical right shift of the bits in a bitmap * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits * @nbits : bitmap size, in bits * * Shifting right 
(dividing) means moving bits in the MS -> LS bit
 * direction.  Zeros are fed into the vacated MS positions and the
 * LS bits shifted off the bottom are lost.
 */
void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
			unsigned shift, unsigned nbits)
{
	unsigned k, lim = BITS_TO_LONGS(nbits);
	/* off: whole words to shift by; rem: remaining shift within a word */
	unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
	/* Applied to the last source word so its unused tail bits never shift in */
	unsigned long mask = BITMAP_LAST_WORD_MASK(nbits);
	for (k = 0; off + k < lim; ++k) {
		unsigned long upper, lower;

		/*
		 * If shift is not word aligned, take lower rem bits of
		 * word above and make them the top rem bits of result.
		 */
		if (!rem || off + k + 1 >= lim)
			upper = 0;
		else {
			upper = src[off + k + 1];
			if (off + k + 1 == lim - 1)
				upper &= mask;
			upper <<= (BITS_PER_LONG - rem);
		}
		lower = src[off + k];
		if (off + k == lim - 1)
			lower &= mask;
		lower >>= rem;
		dst[k] = lower | upper;
	}
	/* The top @off words of the result have no source: zero them */
	if (off)
		memset(&dst[lim - off], 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_right);


/**
 * __bitmap_shift_left - logical left shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * Shifting left (multiplying) means moving bits in the LS -> MS
 * direction.  Zeros are fed into the vacated LS bit positions
 * and those MS bits shifted off the top are lost.
 */

void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
			unsigned int shift, unsigned int nbits)
{
	/* Signed on purpose: the loop below counts down and stops once k < 0 */
	int k;
	unsigned int lim = BITS_TO_LONGS(nbits);
	/* off: whole words to shift by; rem: remaining shift within a word */
	unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
	/* Walk from the highest source word that still lands inside dst down to word 0 */
	for (k = lim - off - 1; k >= 0; --k) {
		unsigned long upper, lower;

		/*
		 * If shift is not word aligned, take upper rem bits of
		 * word below and make them the bottom rem bits of result.
		 */
		if (rem && k > 0)
			lower = src[k - 1] >> (BITS_PER_LONG - rem);
		else
			lower = 0;
		upper = src[k] << rem;
		dst[k + off] = lower | upper;
	}
	/* The bottom @off words of the result have no source: zero them */
	if (off)
		memset(dst, 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_left);

/**
 * bitmap_cut() - remove bit region from bitmap and right shift remaining bits
 * @dst: destination bitmap, might overlap with src
 * @src: source bitmap
 * @first: start bit of region to be removed
 * @cut: number of bits to remove
 * @nbits: bitmap size, in bits
 *
 * Set the n-th bit of @dst iff the n-th bit of @src is set and
 * n is less than @first, or the m-th bit of @src is set for any
 * m such that @first <= n < nbits, and m = n + @cut.
 *
 * In pictures, example for a big-endian 32-bit architecture:
 *
 * The @src bitmap is::
 *
 *   31                                   63
 *   |                                    |
 *   10000000 11000001 11110010 00010101  10000000 11000001 01110010 00010101
 *                 |  |              |                                    |
 *                16  14             0                                   32
 *
 * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is::
 *
 *   31                                   63
 *   |                                    |
 *   10110000 00011000 00110010 00010101  00010000 00011000 00101110 01000010
 *                    |              |                                    |
 *                    14 (bit 17     0                                   32
 *                        from @src)
 *
 * Note that @dst and @src might overlap partially or entirely.
 *
 * This is implemented in the obvious way, with a shift and carry
 * step for each moved bit. Optimisation is left as an exercise
 * for the compiler.
*/
void bitmap_cut(unsigned long *dst, const unsigned long *src,
		unsigned int first, unsigned int cut, unsigned int nbits)
{
	unsigned int len = BITS_TO_LONGS(nbits);
	unsigned long keep = 0, carry;
	int i;

	/*
	 * Save the bits below @first within its word: the per-word shifting
	 * below moves them too, so they are put back at the end.
	 */
	if (first % BITS_PER_LONG) {
		keep = src[first / BITS_PER_LONG] &
		       (~0UL >> (BITS_PER_LONG - first % BITS_PER_LONG));
	}

	/* memmove rather than memcpy because @dst and @src may overlap */
	memmove(dst, src, len * sizeof(*dst));

	/* One single-bit right shift, with carry between words, per cut bit */
	while (cut--) {
		for (i = first / BITS_PER_LONG; i < len; i++) {
			/* Bit 0 of the next word becomes the top bit of this one */
			if (i < len - 1)
				carry = dst[i + 1] & 1UL;
			else
				carry = 0;

			dst[i] = (dst[i] >> 1) | (carry << (BITS_PER_LONG - 1));
		}
	}

	/* Clear the low bits of the word holding @first and restore the saved ones */
	dst[first / BITS_PER_LONG] &= ~0UL << (first % BITS_PER_LONG);
	dst[first / BITS_PER_LONG] |= keep;
}
EXPORT_SYMBOL(bitmap_cut);

/*
 * dst = bitmap1 & bitmap2 over the first @bits bits.  The trailing partial
 * word, if any, is stored masked with BITMAP_LAST_WORD_MASK(@bits).
 * Returns true if any stored word is non-zero.
 */
bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
				const unsigned long *bitmap2, unsigned int bits)
{
	unsigned int k;
	unsigned int lim = bits/BITS_PER_LONG;
	unsigned long result = 0;

	for (k = 0; k < lim; k++)
		result |= (dst[k] = bitmap1[k] & bitmap2[k]);
	if (bits % BITS_PER_LONG)
		result |= (dst[k] = bitmap1[k] & bitmap2[k] &
			   BITMAP_LAST_WORD_MASK(bits));
	return result != 0;
}
EXPORT_SYMBOL(__bitmap_and);

/*
 * dst = bitmap1 | bitmap2 for all BITS_TO_LONGS(@bits) words; the tail of
 * the last word is not masked.
 */
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
				const unsigned long *bitmap2, unsigned int bits)
{
	unsigned int k;
	unsigned int nr = BITS_TO_LONGS(bits);

	for (k = 0; k < nr; k++)
		dst[k] = bitmap1[k] | bitmap2[k];
}
EXPORT_SYMBOL(__bitmap_or);

/*
 * dst = bitmap1 ^ bitmap2 for all BITS_TO_LONGS(@bits) words; the tail of
 * the last word is not masked.
 */
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
				const unsigned long *bitmap2, unsigned int bits)
{
	unsigned int k;
	unsigned int nr = BITS_TO_LONGS(bits);

	for (k = 0; k < nr; k++)
		dst[k] = bitmap1[k] ^ bitmap2[k];
}
EXPORT_SYMBOL(__bitmap_xor);

/*
 * dst = bitmap1 & ~bitmap2 over the first @bits bits.  The trailing partial
 * word, if any, is stored masked with BITMAP_LAST_WORD_MASK(@bits).
 * Returns true if any stored word is non-zero.
 */
bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
				const unsigned long *bitmap2, unsigned int bits)
{
	unsigned int k;
	unsigned int lim = bits/BITS_PER_LONG;
	unsigned long result = 0;

	for (k = 0; k < lim; k++)
		result |= (dst[k] = bitmap1[k] & ~bitmap2[k]);
	if (bits % BITS_PER_LONG)
		result |= (dst[k] = bitmap1[k] & ~bitmap2[k] &
			   BITMAP_LAST_WORD_MASK(bits));
	return result != 0;
}
EXPORT_SYMBOL(__bitmap_andnot);

/*
 * For every bit position, take the bit from @new where @mask is set and
 * from @old where it is clear.  Operates on all BITS_TO_LONGS(@nbits) words.
 */
void __bitmap_replace(unsigned long *dst,
		      const unsigned long *old, const unsigned long *new,
		      const unsigned long *mask, unsigned int nbits)
{
	unsigned int k;
	unsigned int nr = BITS_TO_LONGS(nbits);

	for (k = 0; k < nr; k++)
		dst[k] = (old[k] & ~mask[k]) | (new[k] & mask[k]);
}
EXPORT_SYMBOL(__bitmap_replace);

/*
 * Return true if @bitmap1 and @bitmap2 have at least one common set bit
 * within the first @bits bits (tail bits of the last word are masked out).
 */
bool __bitmap_intersects(const unsigned long *bitmap1,
			 const unsigned long *bitmap2, unsigned int bits)
{
	unsigned int k, lim = bits/BITS_PER_LONG;
	for (k = 0; k < lim; ++k)
		if (bitmap1[k] & bitmap2[k])
			return true;

	if (bits % BITS_PER_LONG)
		if ((bitmap1[k] & bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits))
			return true;
	return false;
}
EXPORT_SYMBOL(__bitmap_intersects);

/*
 * Return true if every bit set in @bitmap1 is also set in @bitmap2 within
 * the first @bits bits (tail bits of the last word are masked out).
 */
bool __bitmap_subset(const unsigned long *bitmap1,
		     const unsigned long *bitmap2, unsigned int bits)
{
	unsigned int k, lim = bits/BITS_PER_LONG;
	for (k = 0; k < lim; ++k)
		if (bitmap1[k] & ~bitmap2[k])
			return false;

	if (bits % BITS_PER_LONG)
		if ((bitmap1[k] & ~bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits))
			return false;
	return true;
}
EXPORT_SYMBOL(__bitmap_subset);

/*
 * Sum hweight_long() over the words produced by FETCH for the first @bits
 * bits.  FETCH is an expression written in terms of the macro-local index
 * variable 'idx'; it is expanded once inside the full-word loop and once
 * more, masked with BITMAP_LAST_WORD_MASK(), for a trailing partial word.
 * Any side effect in FETCH (e.g. a store to dst[idx]) therefore happens
 * for every word processed.
 */
#define BITMAP_WEIGHT(FETCH, bits)	\
({										\
	unsigned int __bits = (bits), idx, w = 0;				\
										\
	for (idx = 0; idx < __bits / BITS_PER_LONG; idx++)			\
		w += hweight_long(FETCH);					\
										\
	if (__bits % BITS_PER_LONG)						\
		w += hweight_long((FETCH) & BITMAP_LAST_WORD_MASK(__bits));	\
										\
	w;									\
})

/* Number of set bits among the first @bits bits of @bitmap. */
unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
{
	return BITMAP_WEIGHT(bitmap[idx], bits);
}
EXPORT_SYMBOL(__bitmap_weight);

/* Number of set bits in (bitmap1 & bitmap2) among the first @bits bits. */
unsigned int __bitmap_weight_and(const unsigned long *bitmap1,
				const unsigned long *bitmap2, unsigned int bits)
{
	return BITMAP_WEIGHT(bitmap1[idx] & bitmap2[idx], bits);
}
EXPORT_SYMBOL(__bitmap_weight_and);

/* Number of set bits in (bitmap1 & ~bitmap2) among the first @bits bits. */
unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1,
				const unsigned long *bitmap2, unsigned int bits)
{
	return BITMAP_WEIGHT(bitmap1[idx] & ~bitmap2[idx], bits);
}
EXPORT_SYMBOL(__bitmap_weight_andnot);

/*
 * dst = bitmap1 | bitmap2, returning the number of set bits of the result
 * within the first @bits bits.  The stored last word is not masked; only
 * the value being counted is.
 */
unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1,
				  const unsigned long *bitmap2, unsigned int bits)
{
	return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] | bitmap2[idx]; dst[idx]; }), bits);
}
EXPORT_SYMBOL(__bitmap_weighted_or);

/*
 * dst = bitmap1 ^ bitmap2, returning the number of set bits of the result
 * within the first @bits bits.  The stored last word is not masked; only
 * the value being counted is.
 */
unsigned int __bitmap_weighted_xor(unsigned long *dst, const unsigned long *bitmap1,
				  const unsigned long *bitmap2, unsigned int bits)
{
	return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] ^ bitmap2[idx]; dst[idx]; }), bits);
}
EXPORT_SYMBOL(__bitmap_weighted_xor);

/*
 * Set @len consecutive bits of @map starting at bit @start.
 *
 * The first iteration uses a mask covering the bits from @start to the end
 * of its word; following iterations set whole words.  What remains (fewer
 * bits than the current chunk) is handled by trimming the mask with
 * BITMAP_LAST_WORD_MASK(@start + @len).
 */
void __bitmap_set(unsigned long *map, unsigned int start, int len)
{
	unsigned long *p = map + BIT_WORD(start);
	const unsigned int size = start + len;
	int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
	unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);

	while (len - bits_to_set >= 0) {
		*p |= mask_to_set;
		len -= bits_to_set;
		bits_to_set = BITS_PER_LONG;
		mask_to_set = ~0UL;
		p++;
	}
	if (len) {
		mask_to_set &= BITMAP_LAST_WORD_MASK(size);
		*p |= mask_to_set;
	}
}
EXPORT_SYMBOL(__bitmap_set);

/*
 * Clear @len consecutive bits of @map starting at bit @start.  Mirrors
 * __bitmap_set(): a first partial word, then whole words, then a trailing
 * partial word trimmed with BITMAP_LAST_WORD_MASK(@start + @len).
 */
void __bitmap_clear(unsigned long *map, unsigned int start, int len)
{
	unsigned long *p = map + BIT_WORD(start);
	const unsigned int size = start + len;
	int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
	unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);

	while (len - bits_to_clear >= 0) {
		*p &= ~mask_to_clear;
		len -= bits_to_clear;
		bits_to_clear = BITS_PER_LONG;
		mask_to_clear = ~0UL;
		p++;
	}
	if (len) {
		mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
		*p &= ~mask_to_clear;
	}
}
EXPORT_SYMBOL(__bitmap_clear);

/**
 * bitmap_find_next_zero_area_off - find a contiguous aligned zero area
 * @map: The address to base the search on
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 * @nr: The number of zeroed bits we're looking for
 * @align_mask: Alignment mask for zero area
 * @align_offset: Alignment offset for zero area.
* * The @align_mask should be one less than a power of 2; the effect is that * the bit offset of all zero areas this function finds plus @align_offset * is multiple of that power of 2. */ unsigned long bitmap_find_next_zero_area_off(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask, unsigned long align_offset) { unsigned long index, end, i; again: index = find_next_zero_bit(map, size, start); /* Align allocation */ index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; end = index + nr; if (end > size) return end; i = find_next_bit(map, end, index); if (i < end) { start = i + 1; goto again; } return index; } EXPORT_SYMBOL(bitmap_find_next_zero_area_off); /** * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap * @buf: pointer to a bitmap * @pos: a bit position in @buf (0 <= @pos < @nbits) * @nbits: number of valid bit positions in @buf * * Map the bit at position @pos in @buf (of length @nbits) to the * ordinal of which set bit it is. If it is not set or if @pos * is not a valid bit position, map to -1. * * If for example, just bits 4 through 7 are set in @buf, then @pos * values 4 through 7 will get mapped to 0 through 3, respectively, * and other @pos values will get mapped to -1. When @pos value 7 * gets mapped to (returns) @ord value 3 in this example, that means * that bit 7 is the 3rd (starting with 0th) set bit in @buf. * * The bit positions 0 through @bits are valid positions in @buf. 
*/
static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits)
{
	/* A position outside the bitmap has no ordinal */
	if (pos >= nbits)
		return -1;

	/* Neither has a position whose bit is clear */
	if (!test_bit(pos, buf))
		return -1;

	/* The ordinal is the count of set bits strictly below @pos */
	return bitmap_weight(buf, pos);
}

/**
 * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap
 *	@dst: remapped result
 *	@src: subset to be remapped
 *	@old: defines domain of map
 *	@new: defines range of map
 *	@nbits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * If either of the @old and @new bitmaps are empty, or if @src and
 * @dst point to the same location, then this routine copies @src
 * to @dst.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to @src, placing the result in
 * @dst, clearing any bits previously set in @dst.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @src comes into this routine
 * with bits 1, 5 and 7 set, then @dst should leave with bits 1,
 * 13 and 15 set.
*/
void bitmap_remap(unsigned long *dst, const unsigned long *src,
		const unsigned long *old, const unsigned long *new,
		unsigned int nbits)
{
	unsigned int srcbit, new_weight;

	/* In-place remaps are not handled; @dst is left as it is */
	if (dst == src)
		return;

	bitmap_zero(dst, nbits);
	new_weight = bitmap_weight(new, nbits);

	for_each_set_bit(srcbit, src, nbits) {
		int ord = bitmap_pos_to_ord(old, srcbit, nbits);
		unsigned long target = srcbit;	/* identity map by default */

		/* Only bits present in @old, with a non-empty @new, get moved */
		if (ord >= 0 && new_weight != 0)
			target = find_nth_bit(new, nbits, ord % new_weight);

		set_bit(target, dst);
	}
}
EXPORT_SYMBOL(bitmap_remap);

/**
 * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit
 *	@oldbit: bit position to be mapped
 *	@old: defines domain of map
 *	@new: defines range of map
 *	@bits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to bit position @oldbit, returning
 * the new bit position.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @oldbit is 5, then this routine
 * returns 13.
*/ int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits) { int w = bitmap_weight(new, bits); int n = bitmap_pos_to_ord(old, oldbit, bits); if (n < 0 || w == 0) return oldbit; else return find_nth_bit(new, bits, n % w); } EXPORT_SYMBOL(bitmap_bitremap); #ifdef CONFIG_NUMA /** * bitmap_onto - translate one bitmap relative to another * @dst: resulting translated bitmap * @orig: original untranslated bitmap * @relmap: bitmap relative to which translated * @bits: number of bits in each of these bitmaps * * Set the n-th bit of @dst iff there exists some m such that the * n-th bit of @relmap is set, the m-th bit of @orig is set, and * the n-th bit of @relmap is also the m-th _set_ bit of @relmap. * (If you understood the previous sentence the first time your * read it, you're overqualified for your current job.) * * In other words, @orig is mapped onto (surjectively) @dst, * using the map { <n, m> | the n-th bit of @relmap is the * m-th set bit of @relmap }. * * Any set bits in @orig above bit number W, where W is the * weight of (number of set bits in) @relmap are mapped nowhere. * In particular, if for all bits m set in @orig, m >= W, then * @dst will end up empty. In situations where the possibility * of such an empty result is not desired, one way to avoid it is * to use the bitmap_fold() operator, below, to first fold the * @orig bitmap over itself so that all its set bits x are in the * range 0 <= x < W. The bitmap_fold() operator does this by * setting the bit (m % W) in @dst, for each bit (m) set in @orig. * * Example [1] for bitmap_onto(): * Let's say @relmap has bits 30-39 set, and @orig has bits * 1, 3, 5, 7, 9 and 11 set. Then on return from this routine, * @dst will have bits 31, 33, 35, 37 and 39 set. * * When bit 0 is set in @orig, it means turn on the bit in * @dst corresponding to whatever is the first bit (if any) * that is turned on in @relmap. 
Since bit 0 was off in the
 *  above example, we leave off that bit (bit 30) in @dst.
 *
 *  When bit 1 is set in @orig (as in the above example), it
 *  means turn on the bit in @dst corresponding to whatever
 *  is the second bit that is turned on in @relmap.  The second
 *  bit in @relmap that was turned on in the above example was
 *  bit 31, so we turned on bit 31 in @dst.
 *
 *  Similarly, we turned on bits 33, 35, 37 and 39 in @dst,
 *  because they were the 4th, 6th, 8th and 10th set bits
 *  set in @relmap, and the 4th, 6th, 8th and 10th bits of
 *  @orig (i.e. bits 3, 5, 7 and 9) were also set.
 *
 *  When bit 11 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the twelfth bit that is
 *  turned on in @relmap.  In the above example, there were
 *  only ten bits turned on in @relmap (30..39), so the fact that
 *  bit 11 was set in @orig had no effect on @dst.
 *
 * Example [2] for bitmap_fold() + bitmap_onto():
 *  Let's say @relmap has these ten bits set::
 *
 *		40 41 42 43 45 48 53 61 74 95
 *
 *  (for the curious, that's 40 plus the first ten terms of the
 *  Fibonacci sequence.)
 *
 *  Further let's say we use the following code, invoking
 *  bitmap_fold() then bitmap_onto, as suggested above to
 *  avoid the possibility of an empty @dst result::
 *
 *	unsigned long *tmp;	// a temporary bitmap's bits
 *
 *	bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits);
 *	bitmap_onto(dst, tmp, relmap, bits);
 *
 *  Then this table shows what various values of @dst would be, for
 *  various @orig's.  I list the zero-based positions of each set bit.
* The tmp column shows the intermediate result, as computed by * using bitmap_fold() to fold the @orig bitmap modulo ten * (the weight of @relmap): * * =============== ============== ================= * @orig tmp @dst * 0 0 40 * 1 1 41 * 9 9 95 * 10 0 40 [#f1]_ * 1 3 5 7 1 3 5 7 41 43 48 61 * 0 1 2 3 4 0 1 2 3 4 40 41 42 43 45 * 0 9 18 27 0 9 8 7 40 61 74 95 * 0 10 20 30 0 40 * 0 11 22 33 0 1 2 3 40 41 42 43 * 0 12 24 36 0 2 4 6 40 42 45 53 * 78 102 211 1 2 8 41 42 74 [#f1]_ * =============== ============== ================= * * .. [#f1] * * For these marked lines, if we hadn't first done bitmap_fold() * into tmp, then the @dst result would have been empty. * * If either of @orig or @relmap is empty (no set bits), then @dst * will be returned empty. * * If (as explained above) the only set bits in @orig are in positions * m where m >= W, (where W is the weight of @relmap) then @dst will * once again be returned empty. * * All bits in @dst not set by the above rule are cleared. */ void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits) { unsigned int n, m; /* same meaning as in above comment */ if (dst == orig) /* following doesn't handle inplace mappings */ return; bitmap_zero(dst, bits); /* * The following code is a more efficient, but less * obvious, equivalent to the loop: * for (m = 0; m < bitmap_weight(relmap, bits); m++) { * n = find_nth_bit(orig, bits, m); * if (test_bit(m, orig)) * set_bit(n, dst); * } */ m = 0; for_each_set_bit(n, relmap, bits) { /* m == bitmap_pos_to_ord(relmap, n, bits) */ if (test_bit(m, orig)) set_bit(n, dst); m++; } } /** * bitmap_fold - fold larger bitmap into smaller, modulo specified size * @dst: resulting smaller bitmap * @orig: original larger bitmap * @sz: specified size * @nbits: number of bits in each of these bitmaps * * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst. * Clear all other bits in @dst. 
See further the comment and * Example [2] for bitmap_onto() for why and how to use this. */ void bitmap_fold(unsigned long *dst, const unsigned long *orig, unsigned int sz, unsigned int nbits) { unsigned int oldbit; if (dst == orig) /* following doesn't handle inplace mappings */ return; bitmap_zero(dst, nbits); for_each_set_bit(oldbit, orig, nbits) set_bit(oldbit % sz, dst); } #endif /* CONFIG_NUMA */ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags) { return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long), flags); } EXPORT_SYMBOL(bitmap_alloc); unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags) { return bitmap_alloc(nbits, flags | __GFP_ZERO); } EXPORT_SYMBOL(bitmap_zalloc); unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node) { return kmalloc_array_node(BITS_TO_LONGS(nbits), sizeof(unsigned long), flags, node); } EXPORT_SYMBOL(bitmap_alloc_node); unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node) { return bitmap_alloc_node(nbits, flags | __GFP_ZERO, node); } EXPORT_SYMBOL(bitmap_zalloc_node); void bitmap_free(const unsigned long *bitmap) { kfree(bitmap); } EXPORT_SYMBOL(bitmap_free); static void devm_bitmap_free(void *data) { unsigned long *bitmap = data; bitmap_free(bitmap); } unsigned long *devm_bitmap_alloc(struct device *dev, unsigned int nbits, gfp_t flags) { unsigned long *bitmap; int ret; bitmap = bitmap_alloc(nbits, flags); if (!bitmap) return NULL; ret = devm_add_action_or_reset(dev, devm_bitmap_free, bitmap); if (ret) return NULL; return bitmap; } EXPORT_SYMBOL_GPL(devm_bitmap_alloc); unsigned long *devm_bitmap_zalloc(struct device *dev, unsigned int nbits, gfp_t flags) { return devm_bitmap_alloc(dev, nbits, flags | __GFP_ZERO); } EXPORT_SYMBOL_GPL(devm_bitmap_zalloc); #if BITS_PER_LONG == 64 /** * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap * @bitmap: array of unsigned longs, the destination bitmap * @buf: array of u32 (in host byte order), the 
source bitmap
 *	@nbits: number of bits in @bitmap
 */
void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits)
{
	unsigned int i, halfwords;

	/* Number of 32-bit elements needed to hold @nbits bits */
	halfwords = DIV_ROUND_UP(nbits, 32);
	/*
	 * Each iteration fills one 64-bit word: the even element becomes the
	 * low half, and the following odd element (if there is one) is OR-ed
	 * into the high half.  Note the extra ++i inside the body.
	 */
	for (i = 0; i < halfwords; i++) {
		bitmap[i/2] = (unsigned long) buf[i];
		if (++i < halfwords)
			bitmap[i/2] |= ((unsigned long) buf[i]) << 32;
	}

	/* Clear tail bits in last word beyond nbits. */
	if (nbits % BITS_PER_LONG)
		bitmap[(halfwords - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr32);

/**
 * bitmap_to_arr32 - copy the contents of bitmap to a u32 array of bits
 *	@buf: array of u32 (in host byte order), the dest bitmap
 *	@bitmap: array of unsigned longs, the source bitmap
 *	@nbits: number of bits in @bitmap
 */
void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits)
{
	unsigned int i, halfwords;

	/* Number of 32-bit elements needed to hold @nbits bits */
	halfwords = DIV_ROUND_UP(nbits, 32);
	/*
	 * Each iteration splits one 64-bit word: low half to the even element,
	 * high half to the following odd element if it is still in range.
	 */
	for (i = 0; i < halfwords; i++) {
		buf[i] = (u32) (bitmap[i/2] & UINT_MAX);
		if (++i < halfwords)
			buf[i] = (u32) (bitmap[i/2] >> 32);
	}

	/* Clear tail bits in last element of array beyond nbits. */
	if (nbits % BITS_PER_LONG)
		buf[halfwords - 1] &= (u32) (UINT_MAX >> ((-nbits) & 31));
}
EXPORT_SYMBOL(bitmap_to_arr32);
#endif

#if BITS_PER_LONG == 32
/**
 * bitmap_from_arr64 - copy the contents of u64 array of bits to bitmap
 *	@bitmap: array of unsigned longs, the destination bitmap
 *	@buf: array of u64 (in host byte order), the source bitmap
 *	@nbits: number of bits in @bitmap
 */
void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits)
{
	int n;

	/*
	 * Consume one u64 per iteration: its low half always produces a word,
	 * its high half only when more than 32 bits remain to be copied.
	 */
	for (n = nbits; n > 0; n -= 64) {
		u64 val = *buf++;

		*bitmap++ = val;
		if (n > 32)
			*bitmap++ = val >> 32;
	}

	/*
	 * Clear tail bits in the last word beyond nbits.
	 *
	 * Negative index is OK because here we point to the word next
	 * to the last word of the bitmap, except for nbits == 0, which
	 * is tested implicitly.
	 */
	if (nbits % BITS_PER_LONG)
		bitmap[-1] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr64);

/**
 * bitmap_to_arr64 - copy the contents of bitmap to a u64 array of bits
 *	@buf: array of u64 (in host byte order), the dest bitmap
 *	@bitmap: array of unsigned longs, the source bitmap
 *	@nbits: number of bits in @bitmap
 */
void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits)
{
	const unsigned long *end = bitmap + BITS_TO_LONGS(nbits);

	/*
	 * Pack two bitmap words per u64: the first one is the low half, the
	 * second one (if there is one left) the high half.
	 */
	while (bitmap < end) {
		*buf = *bitmap++;
		if (bitmap < end)
			*buf |= (u64)(*bitmap++) << 32;
		buf++;
	}

	/* Clear tail bits in the last element of array beyond nbits. */
	if (nbits % 64)
		buf[-1] &= GENMASK_ULL((nbits - 1) % 64, 0);
}
EXPORT_SYMBOL(bitmap_to_arr64);
#endif
27 27 3 3 21 21 5 5 29 29 29 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 // SPDX-License-Identifier: GPL-2.0-only /* * pcrypt - Parallel crypto wrapper. 
* * Copyright (C) 2009 secunet Security Networks AG * Copyright (C) 2009 Steffen Klassert <steffen.klassert@secunet.com> */ #include <crypto/algapi.h> #include <crypto/internal/aead.h> #include <linux/atomic.h> #include <linux/err.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/kobject.h> #include <linux/cpu.h> #include <crypto/pcrypt.h> static struct padata_instance *pencrypt; static struct padata_instance *pdecrypt; static struct kset *pcrypt_kset; struct pcrypt_instance_ctx { struct crypto_aead_spawn spawn; struct padata_shell *psenc; struct padata_shell *psdec; atomic_t tfm_count; }; struct pcrypt_aead_ctx { struct crypto_aead *child; unsigned int cb_cpu; }; static inline struct pcrypt_instance_ctx *pcrypt_tfm_ictx( struct crypto_aead *tfm) { return aead_instance_ctx(aead_alg_instance(tfm)); } static int pcrypt_aead_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(parent); return crypto_aead_setkey(ctx->child, key, keylen); } static int pcrypt_aead_setauthsize(struct crypto_aead *parent, unsigned int authsize) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(parent); return crypto_aead_setauthsize(ctx->child, authsize); } static void pcrypt_aead_serial(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); aead_request_complete(req->base.data, padata->info); } static void pcrypt_aead_done(void *data, int err) { struct aead_request *req = data; struct pcrypt_request *preq = aead_request_ctx(req); struct padata_priv *padata = pcrypt_request_padata(preq); if (err == -EINPROGRESS) return; padata->info = err; padata_do_serial(padata); } static void pcrypt_aead_enc(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); int ret; ret = crypto_aead_encrypt(req); if (ret == 
-EINPROGRESS || ret == -EBUSY) return; padata->info = ret; padata_do_serial(padata); } static int pcrypt_aead_encrypt(struct aead_request *req) { int err; struct pcrypt_request *preq = aead_request_ctx(req); struct aead_request *creq = pcrypt_request_ctx(preq); struct padata_priv *padata = pcrypt_request_padata(preq); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(aead); u32 flags = aead_request_flags(req); struct pcrypt_instance_ctx *ictx; ictx = pcrypt_tfm_ictx(aead); memset(padata, 0, sizeof(struct padata_priv)); padata->parallel = pcrypt_aead_enc; padata->serial = pcrypt_aead_serial; aead_request_set_tfm(creq, ctx->child); aead_request_set_callback(creq, flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, pcrypt_aead_done, req); aead_request_set_crypt(creq, req->src, req->dst, req->cryptlen, req->iv); aead_request_set_ad(creq, req->assoclen); err = padata_do_parallel(ictx->psenc, padata, &ctx->cb_cpu); if (!err) return -EINPROGRESS; if (err == -EBUSY) { /* try non-parallel mode */ return crypto_aead_encrypt(creq); } return err; } static void pcrypt_aead_dec(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); int ret; ret = crypto_aead_decrypt(req); if (ret == -EINPROGRESS || ret == -EBUSY) return; padata->info = ret; padata_do_serial(padata); } static int pcrypt_aead_decrypt(struct aead_request *req) { int err; struct pcrypt_request *preq = aead_request_ctx(req); struct aead_request *creq = pcrypt_request_ctx(preq); struct padata_priv *padata = pcrypt_request_padata(preq); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(aead); u32 flags = aead_request_flags(req); struct pcrypt_instance_ctx *ictx; ictx = pcrypt_tfm_ictx(aead); memset(padata, 0, sizeof(struct padata_priv)); padata->parallel = pcrypt_aead_dec; padata->serial = pcrypt_aead_serial; aead_request_set_tfm(creq, ctx->child); 
aead_request_set_callback(creq, flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, pcrypt_aead_done, req); aead_request_set_crypt(creq, req->src, req->dst, req->cryptlen, req->iv); aead_request_set_ad(creq, req->assoclen); err = padata_do_parallel(ictx->psdec, padata, &ctx->cb_cpu); if (!err) return -EINPROGRESS; if (err == -EBUSY) { /* try non-parallel mode */ return crypto_aead_decrypt(creq); } return err; } static int pcrypt_aead_init_tfm(struct crypto_aead *tfm) { int cpu_index; struct aead_instance *inst = aead_alg_instance(tfm); struct pcrypt_instance_ctx *ictx = aead_instance_ctx(inst); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *cipher; cpu_index = (unsigned int)atomic_inc_return(&ictx->tfm_count) % cpumask_weight(cpu_online_mask); ctx->cb_cpu = cpumask_nth(cpu_index, cpu_online_mask); cipher = crypto_spawn_aead(&ictx->spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; crypto_aead_set_reqsize(tfm, sizeof(struct pcrypt_request) + sizeof(struct aead_request) + crypto_aead_reqsize(cipher)); return 0; } static void pcrypt_aead_exit_tfm(struct crypto_aead *tfm) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } static void pcrypt_free(struct aead_instance *inst) { struct pcrypt_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_aead(&ctx->spawn); padata_free_shell(ctx->psdec); padata_free_shell(ctx->psenc); kfree(inst); } static int pcrypt_init_instance(struct crypto_instance *inst, struct crypto_alg *alg) { if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, "pcrypt(%s)", alg->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; memcpy(inst->alg.cra_name, alg->cra_name, CRYPTO_MAX_ALG_NAME); inst->alg.cra_priority = alg->cra_priority + 100; inst->alg.cra_blocksize = alg->cra_blocksize; inst->alg.cra_alignmask = alg->cra_alignmask; return 0; } static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb, struct crypto_attr_type *algt) { struct 
pcrypt_instance_ctx *ctx; struct aead_instance *inst; struct aead_alg *alg; u32 mask = crypto_algt_inherited_mask(algt); int err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; err = -ENOMEM; ctx = aead_instance_ctx(inst); ctx->psenc = padata_alloc_shell(pencrypt); if (!ctx->psenc) goto err_free_inst; ctx->psdec = padata_alloc_shell(pdecrypt); if (!ctx->psdec) goto err_free_inst; err = crypto_grab_aead(&ctx->spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(&ctx->spawn); err = pcrypt_init_instance(aead_crypto_instance(inst), &alg->base); if (err) goto err_free_inst; inst->alg.base.cra_flags |= CRYPTO_ALG_ASYNC; inst->alg.ivsize = crypto_aead_alg_ivsize(alg); inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(alg); inst->alg.base.cra_ctxsize = sizeof(struct pcrypt_aead_ctx); inst->alg.init = pcrypt_aead_init_tfm; inst->alg.exit = pcrypt_aead_exit_tfm; inst->alg.setkey = pcrypt_aead_setkey; inst->alg.setauthsize = pcrypt_aead_setauthsize; inst->alg.encrypt = pcrypt_aead_encrypt; inst->alg.decrypt = pcrypt_aead_decrypt; inst->free = pcrypt_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: pcrypt_free(inst); } return err; } static int pcrypt_create(struct crypto_template *tmpl, struct rtattr **tb) { struct crypto_attr_type *algt; algt = crypto_get_attr_type(tb); if (IS_ERR(algt)) return PTR_ERR(algt); switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) { case CRYPTO_ALG_TYPE_AEAD: return pcrypt_create_aead(tmpl, tb, algt); } return -EINVAL; } static int pcrypt_sysfs_add(struct padata_instance *pinst, const char *name) { int ret; pinst->kobj.kset = pcrypt_kset; ret = kobject_add(&pinst->kobj, NULL, "%s", name); if (!ret) kobject_uevent(&pinst->kobj, KOBJ_ADD); return ret; } static int pcrypt_init_padata(struct padata_instance **pinst, const char *name) { int ret = -ENOMEM; *pinst = padata_alloc(name); if (!*pinst) return 
ret; ret = pcrypt_sysfs_add(*pinst, name); if (ret) padata_free(*pinst); return ret; } static struct crypto_template pcrypt_tmpl = { .name = "pcrypt", .create = pcrypt_create, .module = THIS_MODULE, }; static int __init pcrypt_init(void) { int err = -ENOMEM; pcrypt_kset = kset_create_and_add("pcrypt", NULL, kernel_kobj); if (!pcrypt_kset) goto err; err = pcrypt_init_padata(&pencrypt, "pencrypt"); if (err) goto err_unreg_kset; err = pcrypt_init_padata(&pdecrypt, "pdecrypt"); if (err) goto err_deinit_pencrypt; return crypto_register_template(&pcrypt_tmpl); err_deinit_pencrypt: padata_free(pencrypt); err_unreg_kset: kset_unregister(pcrypt_kset); err: return err; } static void __exit pcrypt_exit(void) { crypto_unregister_template(&pcrypt_tmpl); padata_free(pencrypt); padata_free(pdecrypt); kset_unregister(pcrypt_kset); } module_init(pcrypt_init); module_exit(pcrypt_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_DESCRIPTION("Parallel crypto wrapper"); MODULE_ALIAS_CRYPTO("pcrypt");
6 6 103 104 104 61 61 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> */ #include <linux/module.h> #include <linux/init.h> #include <linux/file.h> #include <linux/binfmts.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/magic.h> #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsverity.h> #include <keys/system_keyring.h> #include <uapi/linux/fsverity.h> #include "ima.h" #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM static char *ima_appraise_cmdline_default __initdata; core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); void __init ima_appraise_parse_cmdline(void) { const char *str = ima_appraise_cmdline_default; bool sb_state = arch_get_secureboot(); int appraisal_state = 
ima_appraise; if (!str) return; if (strncmp(str, "off", 3) == 0) appraisal_state = 0; else if (strncmp(str, "log", 3) == 0) appraisal_state = IMA_APPRAISE_LOG; else if (strncmp(str, "fix", 3) == 0) appraisal_state = IMA_APPRAISE_FIX; else if (strncmp(str, "enforce", 7) == 0) appraisal_state = IMA_APPRAISE_ENFORCE; else pr_err("invalid \"%s\" appraise option", str); /* If appraisal state was changed, but secure boot is enabled, * keep its default */ if (sb_state) { if (!(appraisal_state & IMA_APPRAISE_ENFORCE)) pr_info("Secure boot enabled: ignoring ima_appraise=%s option", str); } else { ima_appraise = appraisal_state; } } #endif /* * is_ima_appraise_enabled - return appraise status * * Only return enabled, if not in ima_appraise="fix" or "log" modes. */ bool is_ima_appraise_enabled(void) { return ima_appraise & IMA_APPRAISE_ENFORCE; } /* * ima_must_appraise - set appraise flag * * Return 1 to appraise or hash */ int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { struct lsm_prop prop; if (!ima_appraise) return 0; security_current_getlsmprop_subj(&prop); return ima_match_policy(idmap, inode, current_cred(), &prop, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } static int ima_fix_xattr(struct dentry *dentry, struct ima_iint_cache *iint) { int rc, offset; u8 algo = iint->ima_hash->algo; if (algo <= HASH_ALGO_SHA1) { offset = 1; iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST; } else { offset = 0; iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; iint->ima_hash->xattr.ng.algo = algo; } rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, &iint->ima_hash->xattr.data[offset], (sizeof(iint->ima_hash->xattr) - offset) + iint->ima_hash->length, 0); return rc; } /* Return specific func appraised cached result */ enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: return 
iint->ima_mmap_status; case BPRM_CHECK: return iint->ima_bprm_status; case CREDS_CHECK: return iint->ima_creds_status; case FILE_CHECK: case POST_SETATTR: return iint->ima_file_status; case MODULE_CHECK ... MAX_CHECK - 1: default: return iint->ima_read_status; } } static void ima_set_cache_status(struct ima_iint_cache *iint, enum ima_hooks func, enum integrity_status status) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->ima_mmap_status = status; break; case BPRM_CHECK: iint->ima_bprm_status = status; break; case CREDS_CHECK: iint->ima_creds_status = status; break; case FILE_CHECK: case POST_SETATTR: iint->ima_file_status = status; break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->ima_read_status = status; break; } } static void ima_cache_flags(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED); break; case BPRM_CHECK: iint->flags |= (IMA_BPRM_APPRAISED | IMA_APPRAISED); break; case CREDS_CHECK: iint->flags |= (IMA_CREDS_APPRAISED | IMA_APPRAISED); break; case FILE_CHECK: case POST_SETATTR: iint->flags |= (IMA_FILE_APPRAISED | IMA_APPRAISED); break; case MODULE_CHECK ... 
MAX_CHECK - 1: default: iint->flags |= (IMA_READ_APPRAISED | IMA_APPRAISED); break; } } enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len) { struct signature_v2_hdr *sig; enum hash_algo ret; if (!xattr_value || xattr_len < 2) /* return default hash algo */ return ima_hash_algo; switch (xattr_value->type) { case IMA_VERITY_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 3 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case EVM_IMA_XATTR_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 2 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ ret = xattr_value->data[0]; if (ret < HASH_ALGO__LAST) return ret; break; case IMA_XATTR_DIGEST: /* this is for backward compatibility */ if (xattr_len == 21) { unsigned int zero = 0; if (!memcmp(&xattr_value->data[16], &zero, 4)) return HASH_ALGO_MD5; else return HASH_ALGO_SHA1; } else if (xattr_len == 17) return HASH_ALGO_MD5; break; } /* return default hash algo */ return ima_hash_algo; } int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { int ret; ret = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, (char **)xattr_value, xattr_len, GFP_NOFS); if (ret == -EOPNOTSUPP) ret = 0; return ret; } /* * xattr_verify - verify xattr digest or signature * * Verify whether the hash or signature matches the file contents. * * Return 0 on success, error code otherwise. 
*/ static int xattr_verify(enum ima_hooks func, struct ima_iint_cache *iint, struct evm_ima_xattr_data *xattr_value, int xattr_len, enum integrity_status *status, const char **cause) { struct signature_v2_hdr *sig; int rc = -EINVAL, hash_start = 0; int mask; switch (xattr_value->type) { case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ hash_start = 1; fallthrough; case IMA_XATTR_DIGEST: if (*status != INTEGRITY_PASS_IMMUTABLE) { if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) *cause = "verity-signature-required"; else *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } clear_bit(IMA_DIGSIG, &iint->atomic_flags); } else { set_bit(IMA_DIGSIG, &iint->atomic_flags); } if (xattr_len - sizeof(xattr_value->type) - hash_start >= iint->ima_hash->length) /* * xattr length may be longer. md5 hash in previous * version occupied 20 bytes in xattr, instead of 16 */ rc = memcmp(&xattr_value->data[hash_start], iint->ima_hash->digest, iint->ima_hash->length); else rc = -EINVAL; if (rc) { *cause = "invalid-hash"; *status = INTEGRITY_FAIL; break; } *status = INTEGRITY_PASS; break; case EVM_IMA_XATTR_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); mask = IMA_DIGSIG_REQUIRED | IMA_VERITY_REQUIRED; if ((iint->flags & mask) == mask) { *cause = "verity-signature-required"; *status = INTEGRITY_FAIL; break; } sig = (typeof(sig))xattr_value; if (sig->version > 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } if ((iint->flags & IMA_SIGV3_REQUIRED) && sig->version != 3) { *cause = "IMA-sigv3-required"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length, iint->ima_hash->algo); if (rc == -EOPNOTSUPP) { *status = INTEGRITY_UNKNOWN; break; } if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, 
(const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length, iint->ima_hash->algo); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; case IMA_VERITY_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); if (iint->flags & IMA_DIGSIG_REQUIRED) { if (!(iint->flags & IMA_VERITY_REQUIRED)) { *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } } sig = (typeof(sig))xattr_value; if (sig->version != 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length, iint->ima_hash->algo); if (rc == -EOPNOTSUPP) { *status = INTEGRITY_UNKNOWN; break; } else if (rc) { *cause = "invalid-verity-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; default: *status = INTEGRITY_UNKNOWN; *cause = "unknown-ima-data"; break; } return rc; } /* * modsig_verify - verify modsig signature * * Verify whether the signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, enum integrity_status *status, const char **cause) { int rc; rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig); if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM, modsig); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } return rc; } /* * ima_check_blacklist - determine if the binary is blacklisted. * * Add the hash of the blacklisted binary to the measurement list, based * on policy. * * Returns -EPERM if the hash is blacklisted. 
*/ int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { enum hash_algo hash_algo; const u8 *digest = NULL; u32 digestsize = 0; int rc = 0; if (!(iint->flags & IMA_CHECK_BLACKLIST)) return 0; if (iint->flags & IMA_MODSIG_ALLOWED && modsig) { ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize); rc = is_binary_blacklisted(digest, digestsize); } else if (iint->flags & IMA_DIGSIG_REQUIRED && iint->ima_hash) rc = is_binary_blacklisted(iint->ima_hash->digest, iint->ima_hash->length); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL, false, NULL, 0); return rc; } /* * ima_appraise_measurement - appraise file measurement * * Call evm_verifyxattr() to verify the integrity of 'security.ima'. * Assuming success, compare the xattr hash with the collected measurement. * * Return 0 on success, error code otherwise */ int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, bool bprm_is_check) { static const char op[] = "appraise_data"; int audit_msgno = AUDIT_INTEGRITY_DATA; const char *cause = "unknown"; struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len; bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig; /* If not appraising a modsig, we need an xattr. */ if (!(inode->i_opflags & IOP_XATTR) && !try_modsig) return INTEGRITY_UNKNOWN; /* * Unlike any of the other LSM hooks where the kernel enforces file * integrity, enforcing file integrity for the bprm_creds_for_exec() * LSM hook with the AT_EXECVE_CHECK flag is left up to the discretion * of the script interpreter(userspace). Differentiate kernel and * userspace enforced integrity audit messages. 
*/ if (bprm_is_check) audit_msgno = AUDIT_INTEGRITY_USERSPACE; /* If reading the xattr failed and there's no modsig, error out. */ if (rc <= 0 && !try_modsig) { if (rc && rc != -ENODATA) goto out; if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) cause = "verity-signature-required"; else cause = "IMA-signature-required"; } else { cause = "missing-hash"; } status = INTEGRITY_NOLABEL; if (file->f_mode & FMODE_CREATED) iint->flags |= IMA_NEW_FILE; if ((iint->flags & IMA_NEW_FILE) && (!(iint->flags & IMA_DIGSIG_REQUIRED) || (inode->i_size == 0))) status = INTEGRITY_PASS; goto out; } status = evm_verifyxattr(dentry, XATTR_NAME_IMA, xattr_value, rc < 0 ? 0 : rc); switch (status) { case INTEGRITY_PASS: case INTEGRITY_PASS_IMMUTABLE: case INTEGRITY_UNKNOWN: break; case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. */ /* It's fine not to have xattrs when using a modsig. */ if (try_modsig) break; fallthrough; case INTEGRITY_NOLABEL: /* No security.evm xattr. */ cause = "missing-HMAC"; goto out; case INTEGRITY_FAIL_IMMUTABLE: set_bit(IMA_DIGSIG, &iint->atomic_flags); cause = "invalid-fail-immutable"; goto out; case INTEGRITY_FAIL: /* Invalid HMAC/signature. */ cause = "invalid-HMAC"; goto out; default: WARN_ONCE(true, "Unexpected integrity status %d\n", status); } if (xattr_value) rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, &cause); /* * If we have a modsig and either no imasig or the imasig's key isn't * known, then try verifying the modsig. */ if (try_modsig && (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG || rc == -ENOKEY)) rc = modsig_verify(func, modsig, &status, &cause); out: /* * File signatures on some filesystems can not be properly verified. * When such filesystems are mounted by an untrusted mounter or on a * system not willing to accept such a risk, fail the file signature * verification. 
*/ if ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) && ((inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) || (iint->flags & IMA_FAIL_UNVERIFIABLE_SIGS))) { status = INTEGRITY_FAIL; cause = "unverifiable-signature"; integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else if (status != INTEGRITY_PASS) { /* Fix mode, but don't replace file signatures. */ if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig && (!xattr_value || xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; } else if (status == INTEGRITY_NOLABEL) { if (!evm_fix_hmac(dentry, XATTR_NAME_IMA, (const char *)xattr_value, xattr_len)) status = INTEGRITY_PASS; } /* * Permit new files with file/EVM portable signatures, but * without data. */ if (inode->i_size == 0 && iint->flags & IMA_NEW_FILE && test_bit(IMA_DIGSIG, &iint->atomic_flags)) { status = INTEGRITY_PASS; } integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else { ima_cache_flags(iint, func); } ima_set_cache_status(iint, func, status); return status; } /* * ima_update_xattr - update 'security.ima' hash value */ void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { struct dentry *dentry = file_dentry(file); int rc = 0; /* do not collect and update hash for digital signatures */ if (test_bit(IMA_DIGSIG, &iint->atomic_flags)) return; if ((iint->ima_file_status != INTEGRITY_PASS) && !(iint->flags & IMA_HASH)) return; rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) return; inode_lock(file_inode(file)); ima_fix_xattr(dentry, iint); inode_unlock(file_inode(file)); } /** * ima_inode_post_setattr - reflect file metadata changes * @idmap: idmap of the mount the inode was found from * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * Changes to a dentry's metadata might result in needing to appraise. 
* * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ static void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { struct inode *inode = d_backing_inode(dentry); struct ima_iint_cache *iint; int action; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode) || !(inode->i_opflags & IOP_XATTR)) return; action = ima_must_appraise(idmap, inode, MAY_ACCESS, POST_SETATTR); iint = ima_iint_find(inode); if (iint) { set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags); if (!action) clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); } } /* * ima_protect_xattr - protect 'security.ima' * * Ensure that not just anyone can modify or remove 'security.ima'. */ static int ima_protect_xattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { if (strcmp(xattr_name, XATTR_NAME_IMA) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 1; } return 0; } /* * ima_reset_appraise_flags - reset ima_iint_cache flags * * @digsig: whether to clear/set IMA_DIGSIG flag, tristate values * 0: clear IMA_DIGSIG * 1: set IMA_DIGSIG * -1: don't change IMA_DIGSIG * */ static void ima_reset_appraise_flags(struct inode *inode, int digsig) { struct ima_iint_cache *iint; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode)) return; iint = ima_iint_find(inode); if (!iint) return; iint->measured_pcrs = 0; set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags); if (digsig == 1) set_bit(IMA_DIGSIG, &iint->atomic_flags); else if (digsig == 0) clear_bit(IMA_DIGSIG, &iint->atomic_flags); } /** * validate_hash_algo() - Block setxattr with unsupported hash algorithms * @dentry: object of the setxattr() * @xattr_value: userland supplied xattr value * @xattr_value_len: length of xattr_value * * The xattr value is mapped to its hash algorithm, and this algorithm * must be built in the kernel for the setxattr to be allowed. 
* * Emit an audit message when the algorithm is invalid. * * Return: 0 on success, else an error. */ static int validate_hash_algo(struct dentry *dentry, const struct evm_ima_xattr_data *xattr_value, size_t xattr_value_len) { char *path = NULL, *pathbuf = NULL; enum hash_algo xattr_hash_algo; const char *errmsg = "unavailable-hash-algorithm"; unsigned int allowed_hashes; xattr_hash_algo = ima_get_hash_algo(xattr_value, xattr_value_len); allowed_hashes = atomic_read(&ima_setxattr_allowed_hash_algorithms); if (allowed_hashes) { /* success if the algorithm is allowed in the ima policy */ if (allowed_hashes & (1U << xattr_hash_algo)) return 0; /* * We use a different audit message when the hash algorithm * is denied by a policy rule, instead of not being built * in the kernel image */ errmsg = "denied-hash-algorithm"; } else { if (likely(xattr_hash_algo == ima_hash_algo)) return 0; /* allow any xattr using an algorithm built in the kernel */ if (crypto_has_alg(hash_algo_name[xattr_hash_algo], 0, 0)) return 0; } pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) return -EACCES; path = dentry_path(dentry, pathbuf, PATH_MAX); integrity_audit_msg(AUDIT_INTEGRITY_DATA, d_inode(dentry), path, "set_data", errmsg, -EACCES, 0); kfree(pathbuf); return -EACCES; } static int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xvalue = xattr_value; int digsig = 0; int result; int err; result = ima_protect_xattr(dentry, xattr_name, xattr_value, xattr_value_len); if (result == 1) { if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST)) return -EINVAL; err = validate_hash_algo(dentry, xvalue, xattr_value_len); if (err) return err; digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG); } else if (!strcmp(xattr_name, XATTR_NAME_EVM) && xattr_value_len > 0) { digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG); } else { digsig = -1; } if (result == 1 || 
evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (evm_revalidate_status(acl_name)) ima_reset_appraise_flags(d_backing_inode(dentry), -1); return 0; } static int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { int result, digsig = -1; result = ima_protect_xattr(dentry, xattr_name, NULL, 0); if (result == 1 || evm_revalidate_status(xattr_name)) { if (!strcmp(xattr_name, XATTR_NAME_IMA)) digsig = 0; ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ima_inode_set_acl(idmap, dentry, acl_name, NULL); } static struct security_hook_list ima_appraise_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_post_setattr, ima_inode_post_setattr), LSM_HOOK_INIT(inode_setxattr, ima_inode_setxattr), LSM_HOOK_INIT(inode_set_acl, ima_inode_set_acl), LSM_HOOK_INIT(inode_removexattr, ima_inode_removexattr), LSM_HOOK_INIT(inode_remove_acl, ima_inode_remove_acl), }; void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { security_add_hooks(ima_appraise_hooks, ARRAY_SIZE(ima_appraise_hooks), lsmid); }
167 167 34 167 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
/* SPDX-License-Identifier: GPL-2.0 */

/* Trace events for runtime PM; all events below live in the "rpm" system. */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM rpm

#if !defined(_TRACE_RUNTIME_POWER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_RUNTIME_POWER_H

#include <linux/ktime.h>
#include <linux/tracepoint.h>

struct device;

/*
 * The rpm_internal events are used for tracing some important
 * runtime pm internal functions.
 *
 * Each record holds the device name, the caller-supplied flags and a
 * snapshot of several dev->power fields taken when the event fires.
 */
DECLARE_EVENT_CLASS(rpm_internal,

	TP_PROTO(struct device *dev, int flags),

	TP_ARGS(dev, flags),

	TP_STRUCT__entry(
		__string( name, dev_name(dev) )
		__field( int, flags )
		__field( int , usage_count )
		__field( int , disable_depth )
		__field( int , runtime_auto )
		__field( int , request_pending )
		__field( int , irq_safe )
		__field( int , child_count )
	),

	TP_fast_assign(
		__assign_str(name);
		__entry->flags = flags;
		/* usage_count and child_count are atomics; read them as such */
		__entry->usage_count = atomic_read(
			&dev->power.usage_count);
		__entry->disable_depth = dev->power.disable_depth;
		__entry->runtime_auto = dev->power.runtime_auto;
		__entry->request_pending = dev->power.request_pending;
		__entry->irq_safe = dev->power.irq_safe;
		__entry->child_count = atomic_read(
			&dev->power.child_count);
	),

	TP_printk("%s flags-%x cnt-%-2d dep-%-2d auto-%-1d p-%-1d"
			" irq-%-1d child-%d",
			__get_str(name), __entry->flags,
			__entry->usage_count,
			__entry->disable_depth,
			__entry->runtime_auto,
			__entry->request_pending,
			__entry->irq_safe,
			__entry->child_count
		 )
);

/* The four events below all share the rpm_internal record layout. */
DEFINE_EVENT(rpm_internal, rpm_suspend,

	TP_PROTO(struct device *dev, int flags),

	TP_ARGS(dev, flags)
);
DEFINE_EVENT(rpm_internal, rpm_resume,

	TP_PROTO(struct device *dev, int flags),

	TP_ARGS(dev, flags)
);
DEFINE_EVENT(rpm_internal, rpm_idle,

	TP_PROTO(struct device *dev, int flags),

	TP_ARGS(dev, flags)
);
DEFINE_EVENT(rpm_internal, rpm_usage,

	TP_PROTO(struct device *dev, int flags),

	TP_ARGS(dev, flags)
);

/*
 * rpm_return_int records the device name, an instruction pointer (printed
 * symbolically with %pS) and an integer return value.
 */
TRACE_EVENT(rpm_return_int,
	TP_PROTO(struct device *dev, unsigned long ip, int ret),
	TP_ARGS(dev, ip, ret),

	TP_STRUCT__entry(
		__string( name, dev_name(dev))
		__field( unsigned long, ip )
		__field( int, ret )
	),

	TP_fast_assign(
		__assign_str(name);
		__entry->ip = ip;
		__entry->ret = ret;
	),

	TP_printk("%pS:%s ret=%d", (void *)__entry->ip, __get_str(name),
		__entry->ret)
);

/*
 * Table of rpm_status values and their printable names. It is expanded twice
 * below with different EM()/EMe() definitions; EMe() marks the last entry.
 */
#define RPM_STATUS_STRINGS \
	EM(RPM_INVALID, "RPM_INVALID") \
	EM(RPM_ACTIVE, "RPM_ACTIVE") \
	EM(RPM_RESUMING, "RPM_RESUMING") \
	EM(RPM_SUSPENDED, "RPM_SUSPENDED") \
	EMe(RPM_SUSPENDING, "RPM_SUSPENDING")

/* Enums require being exported to userspace, for user tool parsing. */
#undef EM
#undef EMe
#define EM(a, b) TRACE_DEFINE_ENUM(a);
#define EMe(a, b) TRACE_DEFINE_ENUM(a);

RPM_STATUS_STRINGS

/*
 * Now redefine the EM() and EMe() macros to map the enums to the strings that
 * will be printed in the output.
 */
#undef EM
#undef EMe
#define EM(a, b) { a, b },
#define EMe(a, b) { a, b }

/*
 * rpm_status records the device name and an rpm_status value; the value is
 * printed by name through __print_symbolic() and RPM_STATUS_STRINGS.
 */
TRACE_EVENT(rpm_status,
	TP_PROTO(struct device *dev, enum rpm_status status),
	TP_ARGS(dev, status),

	TP_STRUCT__entry(
		__string(name, dev_name(dev))
		__field(int, status)
	),

	TP_fast_assign(
		__assign_str(name);
		__entry->status = status;
	),

	TP_printk("%s status=%s", __get_str(name),
		__print_symbolic(__entry->status, RPM_STATUS_STRINGS))
);

#endif /* _TRACE_RUNTIME_POWER_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
13 13 8 7 12 12 12 12 12 12 12 12 8 8 8 8 4 4 4 4 4 4 4 4 4 4 4 760 760 5 412 411 412 409 10 412 90 14 410 5 5 5 5 8 8 8 8 8 8 8 2 8 8 8 8 8 8 8 6 2 1 2 1 2 2 2 6 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 
485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/swap_state.c * * Copyright (C) 1991, 1992, 
1993, 1994 Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/folio_batch.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "swap_table.h"
#include "swap.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_folio_list.
 */
static const struct address_space_operations swap_aops = {
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
};

/* The address_space used for the swap cache; it only carries swap_aops. */
struct address_space swap_space __read_mostly = {
	.a_ops		= &swap_aops,
};

/* Runtime switch for VMA based readahead, exposed through sysfs below. */
static bool enable_vma_readahead __read_mostly = true;

/* Upper bound on the page_cluster order used for the VMA readahead window. */
#define SWAP_RA_ORDER_CEILING	5

/*
 * Encoding of the per-VMA swap_readahead_info value (see SWAP_RA_VAL()):
 * the page-aligned bits hold an address, and the in-page bits are split in
 * two -- the low SWAP_RA_WIN_SHIFT bits hold the hit count, the remaining
 * in-page bits hold the readahead window size.
 */
#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

/* Accessors for the three fields packed into a readahead value. */
#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

/* Pack an address, a window size and a hit count into one value. */
#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

/* Global (non-VMA) readahead hit counter, consumed by swapin_nr_pages(). */
static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

/* Print the swap cache size and the free/total swap amounts via printk. */
void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Free swap = %ldkB\n", K(get_nr_swap_pages()));
	printk("Total swap = %lukB\n", K(total_swap_pages));
}

/**
 * swap_cache_get_folio - Looks up a folio in the swap cache.
 * @entry: swap entry used for the lookup.
 *
 * A found folio will be returned unlocked and with its refcount increased.
 *
 * Context: Caller must ensure @entry is valid and protect the swap device
 * with reference count or locks.
 * Return: Returns the found folio on success, NULL otherwise. The caller
 * must lock and check if the folio still matches the swap entry before
 * use (e.g., folio_matches_swap_entry).
 */
struct folio *swap_cache_get_folio(swp_entry_t entry)
{
	unsigned long swp_tb;
	struct folio *folio;

	for (;;) {
		swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
					swp_cluster_offset(entry));
		if (!swp_tb_is_folio(swp_tb))
			return NULL;
		folio = swp_tb_to_folio(swp_tb);
		/* If taking a reference fails, re-read the slot and retry. */
		if (likely(folio_try_get(folio)))
			return folio;
	}

	/* Not reached: the loop above only exits through a return. */
	return NULL;
}

/**
 * swap_cache_has_folio - Check if a swap slot has cache.
 * @entry: swap entry indicating the slot.
 *
 * Context: Caller must ensure @entry is valid and protect the swap
 * device with reference count or locks.
 * Return: true if the swap table entry of the slot is a folio. No reference
 * is taken.
 */
bool swap_cache_has_folio(swp_entry_t entry)
{
	unsigned long swp_tb;

	swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
				swp_cluster_offset(entry));
	return swp_tb_is_folio(swp_tb);
}

/**
 * swap_cache_get_shadow - Looks up a shadow in the swap cache.
 * @entry: swap entry used for the lookup.
 *
 * Context: Caller must ensure @entry is valid and protect the swap device
 * with reference count or locks.
 * Return: Returns either NULL or an XA_VALUE (shadow).
*/
void *swap_cache_get_shadow(swp_entry_t entry)
{
	unsigned long tb = swap_table_get(__swap_entry_to_cluster(entry),
					  swp_cluster_offset(entry));

	return swp_tb_is_shadow(tb) ? swp_tb_to_shadow(tb) : NULL;
}

/*
 * __swap_cache_add_folio - install @folio into the slots starting at @entry.
 * @ci: the swap cluster that holds the slots.
 * @folio: the folio to install; expected to be locked, swap backed and not
 *         yet in the swap cache (each is checked with VM_WARN_ON_ONCE_FOLIO).
 * @entry: the first swap entry covered by @folio.
 *
 * Every covered slot is rewritten to point at the folio while keeping the
 * swap count already stored in the slot. The folio gains one reference per
 * page, is flagged as swap cache, gets @entry recorded in folio->swap, and
 * the NR_FILE_PAGES / NR_SWAPCACHE statistics are raised by the page count.
 */
void __swap_cache_add_folio(struct swap_cluster_info *ci,
			    struct folio *folio, swp_entry_t entry)
{
	const unsigned long nr = folio_nr_pages(folio);
	const unsigned long pfn = folio_pfn(folio);
	unsigned int slot = swp_cluster_offset(entry);
	const unsigned int slot_end = slot + nr;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);

	do {
		unsigned long prev = __swap_table_get(ci, slot);

		VM_WARN_ON_ONCE(swp_tb_is_folio(prev));
		__swap_table_set(ci, slot,
				 pfn_to_swp_tb(pfn, __swp_tb_get_count(prev)));
		slot++;
	} while (slot < slot_end);

	folio_ref_add(folio, nr);
	folio_set_swapcache(folio);
	folio->swap = entry;

	node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
	lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
}

/**
 * swap_cache_add_folio - Add a folio into the swap cache.
 * @folio: The folio to be added.
 * @entry: The swap entry corresponding to the folio.
 * @shadowp: If not NULL, receives on success the last shadow found in the
 *           covered slots, or NULL when none was found.
 *
 * Context: Caller must ensure @entry is valid and protect the swap device
 * with reference count or locks.
 * Return: 0 on success, -EEXIST if a covered slot already holds a folio,
 * -ENOENT if the cluster has no table or a covered slot has no swap count.
*/
static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
				void **shadowp)
{
	const unsigned long nr = folio_nr_pages(folio);
	struct swap_info_struct *si = __swap_entry_to_info(entry);
	unsigned int slot = swp_cluster_offset(entry);
	const unsigned int slot_end = slot + nr;
	struct swap_cluster_info *ci;
	void *found = NULL;
	int err = 0;

	ci = swap_cluster_lock(si, swp_offset(entry));
	if (unlikely(!ci->table)) {
		err = -ENOENT;
		goto unlock;
	}

	/* Validate every covered slot before any of them is modified. */
	do {
		unsigned long tb = __swap_table_get(ci, slot);

		if (unlikely(swp_tb_is_folio(tb))) {
			err = -EEXIST;
			goto unlock;
		}
		if (unlikely(!__swp_tb_get_count(tb))) {
			err = -ENOENT;
			goto unlock;
		}
		if (swp_tb_is_shadow(tb))
			found = swp_tb_to_shadow(tb);
	} while (++slot < slot_end);

	__swap_cache_add_folio(ci, folio, entry);
unlock:
	swap_cluster_unlock(ci);
	/* The shadow is only reported once the folio has been added. */
	if (!err && shadowp)
		*shadowp = found;
	return err;
}

/**
 * __swap_cache_del_folio - Removes a folio from the swap cache.
 * @ci: The locked swap cluster.
 * @folio: The folio.
 * @entry: The first swap entry that the folio corresponds to.
 * @shadow: shadow value to be filled in the swap cache.
 *
 * Removes a folio from the swap cache and fills a shadow in place.
 * This won't put the folio's refcount. The caller has to do that.
 *
 * Context: Caller must ensure the folio is locked and in the swap cache
 * using the index of @entry, and lock the cluster that holds the entries.
*/
void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
			    swp_entry_t entry, void *shadow)
{
	int count;
	unsigned long old_tb;
	struct swap_info_struct *si;
	unsigned int ci_start, ci_off, ci_end;
	bool folio_swapped = false, need_free = false;
	unsigned long nr_pages = folio_nr_pages(folio);

	VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);

	si = __swap_entry_to_info(entry);
	ci_start = swp_cluster_offset(entry);
	ci_end = ci_start + nr_pages;
	ci_off = ci_start;
	/*
	 * Replace the folio in every covered slot by the shadow, keeping the
	 * slot's swap count, and note whether some slots still have a count
	 * (folio_swapped) and whether some have none (need_free).
	 */
	do {
		old_tb = __swap_table_get(ci, ci_off);
		WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
			     swp_tb_to_folio(old_tb) != folio);
		count = __swp_tb_get_count(old_tb);
		if (count)
			folio_swapped = true;
		else
			need_free = true;
		/* If shadow is NULL, we set an empty shadow. */
		__swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count));
	} while (++ci_off < ci_end);

	folio->swap.val = 0;
	folio_clear_swapcache(folio);
	node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
	lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);

	if (!folio_swapped) {
		/* No slot has a swap count left: free the whole range at once. */
		__swap_cluster_free_entries(si, ci, ci_start, nr_pages);
	} else if (need_free) {
		/* Mixed case: free only the slots whose swap count is zero. */
		ci_off = ci_start;
		do {
			if (!__swp_tb_get_count(__swap_table_get(ci, ci_off)))
				__swap_cluster_free_entries(si, ci, ci_off, 1);
		} while (++ci_off < ci_end);
	}
}

/**
 * swap_cache_del_folio - Removes a folio from the swap cache.
 * @folio: The folio.
 *
 * Same as __swap_cache_del_folio, but handles lock and refcount. The
 * caller must ensure the folio is either clean or has a swap count
 * equal to zero, or it may cause data loss.
 *
 * Context: Caller must ensure the folio is locked and in the swap cache.
*/ void swap_cache_del_folio(struct folio *folio) { struct swap_cluster_info *ci; swp_entry_t entry = folio->swap; ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); __swap_cache_del_folio(ci, folio, entry, NULL); swap_cluster_unlock(ci); folio_ref_sub(folio, folio_nr_pages(folio)); } /** * __swap_cache_replace_folio - Replace a folio in the swap cache. * @ci: The locked swap cluster. * @old: The old folio to be replaced. * @new: The new folio. * * Replace an existing folio in the swap cache with a new folio. The * caller is responsible for setting up the new folio's flag and swap * entries. Replacement will take the new folio's swap entry value as * the starting offset to override all slots covered by the new folio. * * Context: Caller must ensure both folios are locked, and lock the * cluster that holds the old folio to be replaced. */ void __swap_cache_replace_folio(struct swap_cluster_info *ci, struct folio *old, struct folio *new) { swp_entry_t entry = new->swap; unsigned long nr_pages = folio_nr_pages(new); unsigned int ci_off = swp_cluster_offset(entry); unsigned int ci_end = ci_off + nr_pages; unsigned long pfn = folio_pfn(new); unsigned long old_tb; VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); VM_WARN_ON_ONCE(!entry.val); /* Swap cache still stores N entries instead of a high-order entry */ do { old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); } while (++ci_off < ci_end); /* * If the old folio is partially replaced (e.g., splitting a large * folio, the old folio is shrunk, and new split sub folios replace * the shrunk part), ensure the new folio doesn't overlap it. 
*/ if (IS_ENABLED(CONFIG_DEBUG_VM) && folio_order(old) != folio_order(new)) { ci_off = swp_cluster_offset(old->swap); ci_end = ci_off + folio_nr_pages(old); while (ci_off++ < ci_end) WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old); } } /* * If we are the only user, then try to free up the swap cache. * * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside * folio_free_swap() _with_ the lock. * - Marcelo */ void free_swap_cache(struct folio *folio) { if (folio_test_swapcache(folio) && !folio_mapped(folio) && folio_trylock(folio)) { folio_free_swap(folio); folio_unlock(folio); } } /* * Freeing a folio and also freeing any swap cache associated with * this folio if it is the last user. */ void free_folio_and_swap_cache(struct folio *folio) { free_swap_cache(folio); if (!is_huge_zero_folio(folio)) folio_put(folio); } /* * Passed an array of pages, drop them all from swapcache and then release * them. They are removed from the LRU and freed if this is their last use. */ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { struct folio_batch folios; unsigned int refs[FOLIO_BATCH_SIZE]; folio_batch_init(&folios); for (int i = 0; i < nr; i++) { struct folio *folio = page_folio(encoded_page_ptr(pages[i])); free_swap_cache(folio); refs[folios.nr] = 1; if (unlikely(encoded_page_flags(pages[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) refs[folios.nr] = encoded_nr_pages(pages[++i]); if (folio_batch_add(&folios, folio) == 0) folios_put_refs(&folios, refs); } if (folios.nr) folios_put_refs(&folios, refs); } static inline bool swap_use_vma_readahead(void) { return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); } /** * swap_update_readahead - Update the readahead statistics of VMA or globally. * @folio: the swap cache folio that just got hit. * @vma: the VMA that should be updated, could be NULL for global update. 
* @addr: the addr that triggered the swapin, ignored if @vma is NULL.
 */
void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
			   unsigned long addr)
{
	bool readahead, vma_ra = swap_use_vma_readahead();

	/*
	 * At the moment, we don't support PG_readahead for anon THP
	 * so let's bail out rather than confusing the readahead stat.
	 */
	if (unlikely(folio_test_large(folio)))
		return;

	/* Consume the readahead flag: a set flag counts as a readahead hit. */
	readahead = folio_test_clear_readahead(folio);
	if (vma && vma_ra) {
		unsigned long ra_val;
		int win, hits;

		/* Keep the window, bump the hits (saturating), store @addr. */
		ra_val = GET_SWAP_RA_VAL(vma);
		win = SWAP_RA_WIN(ra_val);
		hits = SWAP_RA_HITS(ra_val);
		if (readahead)
			hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
		atomic_long_set(&vma->swap_readahead_info,
				SWAP_RA_VAL(addr, win, hits));
	}

	if (readahead) {
		count_vm_event(SWAP_RA_HIT);
		/* Without VMA readahead the hit feeds the global counter. */
		if (!vma || !vma_ra)
			atomic_inc(&swapin_readahead_hits);
	}
}

/**
 * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache.
 * @entry: swap entry to be bound to the folio.
 * @folio: folio to be added.
 * @gfp: memory allocation flags for charge, can be 0 if @charged is true.
 * @charged: if the folio is already charged.
 *
 * Update the swap_map and add folio as swap cache, typically before swapin.
 * All swap slots covered by the folio must have a non-zero swap count.
 *
 * Context: Caller must protect the swap device with reference count or locks.
 * Return: Returns the folio being added on success. Returns the existing folio
 * if @entry is already cached. Returns NULL if raced with swapin or swapoff.
*/
static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
						  struct folio *folio,
						  gfp_t gfp, bool charged)
{
	struct folio *swapcache = NULL;
	void *shadow;
	int ret;

	__folio_set_locked(folio);
	__folio_set_swapbacked(folio);

	if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry))
		goto failed;

	/* Retry until the folio is added or an existing folio is found. */
	for (;;) {
		ret = swap_cache_add_folio(folio, entry, &shadow);
		if (!ret)
			break;

		/*
		 * Large order allocation needs special handling on
		 * race: if a smaller folio exists in cache, swapin needs
		 * to fallback to order 0, and doing a swap cache lookup
		 * might return a folio that is irrelevant to the faulting
		 * entry because @entry is aligned down. Just return NULL.
		 */
		if (ret != -EEXIST || folio_test_large(folio))
			goto failed;

		/* Found the cached folio: return it through the failed path. */
		swapcache = swap_cache_get_folio(entry);
		if (swapcache)
			goto failed;
	}

	memcg1_swapin(entry, folio_nr_pages(folio));
	/* shadow was written by the successful swap_cache_add_folio() call. */
	if (shadow)
		workingset_refault(folio, shadow);

	/* Caller will initiate read into locked folio */
	folio_add_lru(folio);
	return folio;

failed:
	/* swapcache is NULL here unless an existing folio was looked up. */
	folio_unlock(folio);
	return swapcache;
}

/**
 * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache.
 * @entry: the swapped out swap entry to be bound to the folio.
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 * @new_page_allocated: sets true if allocation happened, false otherwise
 *
 * Allocate a folio in the swap cache for one swap slot, typically before
 * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by
 * @entry must have a non-zero swap count (swapped out).
 * Currently only supports order 0.
 *
 * Context: Caller must protect the swap device with reference count or locks.
 * Return: Returns the existing folio if @entry is cached already. Returns
 * NULL if failed due to -ENOMEM or @entry have a swap count < 1.
*/
struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
				     struct mempolicy *mpol, pgoff_t ilx,
				     bool *new_page_allocated)
{
	struct swap_info_struct *si = __swap_entry_to_info(entry);
	struct folio *cached, *fresh, *added;

	*new_page_allocated = false;

	/* The readahead path may race with others: look in the cache first. */
	cached = swap_cache_get_folio(entry);
	if (cached)
		return cached;

	/* Readahead may hit unused or bad slots; nothing to allocate then. */
	if (!swap_entry_swapped(si, entry))
		return NULL;

	/* Order-0 folio that will be inserted into the swap cache. */
	fresh = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
	if (!fresh)
		return NULL;

	/* Yields our folio, a folio that already exists, or NULL. */
	added = __swap_cache_prepare_and_add(entry, fresh, gfp_mask, false);
	if (added != fresh) {
		folio_put(fresh);
		return added;
	}

	*new_page_allocated = true;
	return fresh;
}

/**
 * swapin_folio - swap in one or more entries without doing readahead.
 * @entry: first swap entry to swap in
 * @folio: a newly allocated folio that is already charged
 *
 * Adds @folio to the swap cache and, when that succeeds, reads the swap
 * content into it. For a large @folio the entry is first rounded down to
 * the folio size.
 *
 * Return: @folio on success. If @folio is large and this raced with another
 * swapin, NULL is returned so the caller can fall back to order 0. Otherwise,
 * if another folio was already in the swap cache, that folio is returned.
 */
struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
{
	unsigned long nr_pages = folio_nr_pages(folio);
	pgoff_t first = swp_offset(entry);
	struct folio *swapcache;

	entry = swp_entry(swp_type(entry), round_down(first, nr_pages));
	swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
	if (swapcache == folio)
		swap_read_folio(folio, NULL);

	return swapcache;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
* A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr,
		struct swap_iocb **plug)
{
	struct swap_info_struct *si;
	bool page_allocated;
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	/* Hold a reference on the swap device for the whole operation. */
	si = get_swap_device(entry);
	if (!si)
		return NULL;

	mpol = get_vma_policy(vma, addr, 0, &ilx);
	folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
				       &page_allocated);
	mpol_cond_put(mpol);

	/* Only a folio allocated by this call needs to be read in. */
	if (page_allocated)
		swap_read_folio(folio, plug);

	put_swap_device(si);
	return folio;
}

/*
 * Compute the readahead window size in pages.
 * @prev_offset: previously recorded position.
 * @offset: current position.
 * @hits: readahead hits since the last computation.
 * @max_pages: upper bound for the window.
 * @prev_win: previous window size.
 *
 * Note that the "don't shrink too fast" floor of @prev_win / 2 is applied
 * after the @max_pages cap, so the result can exceed @max_pages when
 * @prev_win / 2 is larger.
 */
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		/* Round up to a power of two, with a minimum of 4. */
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}

/*
 * Window size for cluster readahead at @offset. Uses and resets the global
 * swapin_readahead_hits counter and keeps the previous offset and window in
 * function-local static variables. Returns 1 when page_cluster allows no
 * readahead.
 */
static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
				  max_pages,
				  atomic_read(&last_readahead_pages));
	/* The reference offset is only updated when there were no hits. */
	if (!hits)
		WRITE_ONCE(prev_offset, offset);
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * Note: it is intentional that the same NUMA policy and interleave index
 * are used for every page of the readahead: neighbouring pages on swap
 * are fairly likely to have been swapped out from the same node.
*/
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				     struct mempolicy *mpol, pgoff_t ilx)
{
	struct folio *folio;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = __swap_entry_to_info(entry);
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	bool page_allocated;

	/* A window of a single page means no readahead at all. */
	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		folio = swap_cache_alloc_folio(
				swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx,
				&page_allocated);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, &splug);
			/* Only the neighbours count as readahead pages. */
			if (offset != entry_offset) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	/* The page was likely read above, so no need for plugging here */
	folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
				       &page_allocated);
	if (unlikely(page_allocated))
		swap_read_folio(folio, NULL);
	return folio;
}

/*
 * Compute the VMA readahead window for the fault described by @vmf.
 * @start and @end receive the address range [*start, *end) to scan; they are
 * only written when the returned window is larger than 1. The range is
 * clamped to the VMA and to the PMD-sized region containing the fault
 * address. The new window (with hits reset to 0) and the fault address are
 * stored back into vma->swap_readahead_info.
 *
 * Returns the window size in pages; 1 means no readahead.
 */
static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
			   unsigned long *end)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	unsigned long faddr, prev_faddr, left, right;
	unsigned int max_win, hits, prev_win, win;

	max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING);
	if (max_win == 1)
		return 1;

	faddr = vmf->address;
	ra_val = GET_SWAP_RA_VAL(vma);
	prev_faddr = SWAP_RA_ADDR(ra_val);
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits,
				max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0));
	if (win == 1)
		return 1;

	/*
	 * Fault one page after the previous one: window starts at the fault.
	 * Fault one page before the previous one: window ends at the fault.
	 * Otherwise: window is placed around the fault address.
	 */
	if (faddr == prev_faddr + PAGE_SIZE)
		left = faddr;
	else if (prev_faddr == faddr + PAGE_SIZE)
		left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE;
	else
		left = faddr - (((win - 1) / 2) << PAGE_SHIFT);
	right = left + (win << PAGE_SHIFT);
	/* left is clamped only after right has been derived from it. */
	if ((long)left < 0)
		left = 0;
	*start = max3(left, vma->vm_start, faddr & PMD_MASK);
	*end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE);

	return win;
}

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @targ_entry: swap entry of the targeted memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 * @vmf: fault information
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
*
 */
static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
		struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	struct folio *folio;
	pte_t *pte = NULL, pentry;
	int win;
	unsigned long start, end, addr;
	pgoff_t ilx;
	bool page_allocated;

	/* start and end are only valid when the window is larger than 1. */
	win = swap_vma_ra_win(vmf, &start, &end);
	if (win == 1)
		goto skip;

	/* Interleave index matching the first address of the window. */
	ilx = targ_ilx - PFN_DOWN(vmf->address - start);

	blk_start_plug(&plug);
	for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
		struct swap_info_struct *si = NULL;
		softleaf_t entry;

		/*
		 * pte is NULL on the first pass and after each unmap below, in
		 * which case the table is mapped again for the current addr;
		 * otherwise it advances to the next entry.
		 */
		if (!pte++) {
			pte = pte_offset_map(vmf->pmd, addr);
			if (!pte)
				break;
		}
		pentry = ptep_get_lockless(pte);
		entry = softleaf_from_pte(pentry);

		if (!softleaf_is_swap(entry))
			continue;
		pte_unmap(pte);
		pte = NULL;
		/*
		 * Readahead entry may come from a device that we are not
		 * holding a reference to, try to grab a reference, or skip.
		 */
		if (swp_type(entry) != swp_type(targ_entry)) {
			si = get_swap_device(entry);
			if (!si)
				continue;
		}
		folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
					       &page_allocated);
		if (si)
			put_swap_device(si);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, &splug);
			/* Only pages other than the faulting one are readahead. */
			if (addr != vmf->address) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	if (pte)
		pte_unmap(pte);
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();
skip:
	/* The folio was likely read above, so no need for plugging here */
	folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx,
				       &page_allocated);
	if (unlikely(page_allocated))
		swap_read_folio(folio, NULL);
	return folio;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * It's a main entry function for swap readahead.
By the configuration,
 * it will read ahead blocks by cluster-based(ie, physical disk based)
 * or vma-based(ie, virtual address based on faulty address) readahead.
 */
struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	struct mempolicy *mpol;
	struct folio *folio;
	pgoff_t ilx;

	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
	if (swap_use_vma_readahead())
		folio = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
	else
		folio = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
	mpol_cond_put(mpol);

	return folio;
}

#ifdef CONFIG_SYSFS
/* sysfs read handler: reports the state of enable_vma_readahead. */
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead));
}

/* sysfs write handler: parses a boolean into enable_vma_readahead. */
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	int err = kstrtobool(buf, &enable_vma_readahead);

	if (!err)
		return count;
	return err;
}

static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static const struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

/*
 * Create the "swap" kobject under mm_kobj and register its attribute group.
 * On success the swap address space is also marked as not using writeback
 * tags.
 */
static int __init swap_init(void)
{
	struct kobject *swap_kobj = kobject_create_and_add("swap", mm_kobj);
	int err;

	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		kobject_put(swap_kobj);
		return err;
	}

	/* Swap cache writeback is LRU based, no tags for it */
	mapping_set_no_writeback_tags(&swap_space);
	return 0;
}
subsys_initcall(swap_init);
#endif
1 1 1 1 1 1 2743 5 5 2 3 12 2 12 2 11 3 5 16 17 16 17 17 12 5 25 25 21 19 19 1 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Frame handler other utility functions for HSR and PRP. 
*/

#include "hsr_slave.h"
#include <linux/etherdevice.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include "hsr_main.h"
#include "hsr_device.h"
#include "hsr_forward.h"
#include "hsr_framereg.h"

/* Return true when @protocol is neither the PRP nor the HSR ethertype.
 * hsr_handle_frame() reaches a check of this kind through
 * hsr->proto_ops->invalid_dan_ingress_frame.
 */
bool hsr_invalid_dan_ingress_frame(__be16 protocol)
{
	return (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR));
}

/* Receive hook that hsr_portdev_setup() registers on port devices.
 *
 * Returns RX_HANDLER_PASS (skb left untouched) for loopback packets, skbs
 * without a MAC header, devices that have no HSR port, and frames rejected
 * by the proto_ops ingress check. In every other case the skb is either
 * freed here or handed to hsr_forward_skb(), and RX_HANDLER_CONSUMED is
 * returned.
 */
static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct hsr_port *port;
	struct hsr_priv *hsr;
	__be16 protocol;

	/* Packets from dev_loopback_xmit() do not have L2 header, bail out */
	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
		return RX_HANDLER_PASS;

	if (!skb_mac_header_was_set(skb)) {
		WARN_ONCE(1, "%s: skb invalid", __func__);
		return RX_HANDLER_PASS;
	}

	port = hsr_port_get_rcu(skb->dev);
	if (!port)
		goto finish_pass;
	hsr = port->hsr;

	if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) {
		/* Directly kill frames sent by ourselves */
		kfree_skb(skb);
		goto finish_consume;
	}

	/* For HSR, only tagged frames are expected (unless the device offloads
	 * HSR tag removal), but for PRP there could be non tagged frames as
	 * well from Single attached nodes (SANs).
	 */
	protocol = eth_hdr(skb)->h_proto;

	/* The ingress filter is skipped for devices with hardware tag removal
	 * and for the interlink port, and only applies when the protocol
	 * implementation provides the callback.
	 */
	if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) &&
	    port->type != HSR_PT_INTERLINK &&
	    hsr->proto_ops->invalid_dan_ingress_frame &&
	    hsr->proto_ops->invalid_dan_ingress_frame(protocol))
		goto finish_pass;

	/* Expose the Ethernet header again and re-anchor the MAC header at
	 * the new start of data.
	 */
	skb_push(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) ||
	    protocol == htons(ETH_P_HSR)) {
		/* Drop frames too short to hold Ethernet header plus tag */
		if (!pskb_may_pull(skb, ETH_HLEN + HSR_HLEN)) {
			kfree_skb(skb);
			goto finish_consume;
		}

		skb_set_network_header(skb, ETH_HLEN + HSR_HLEN);
	}
	skb_reset_mac_len(skb);

	/* Only the frames received over the interlink port will assign a
	 * sequence number and require synchronisation vs other sender.
	 */
	if (port->type == HSR_PT_INTERLINK) {
		spin_lock_bh(&hsr->seqnr_lock);
		hsr_forward_skb(skb, port);
		spin_unlock_bh(&hsr->seqnr_lock);
	} else {
		hsr_forward_skb(skb, port);
	}

finish_consume:
	return RX_HANDLER_CONSUMED;

finish_pass:
	return RX_HANDLER_PASS;
}

/* Return true when @dev's rx_handler is the HSR receive hook above. */
bool hsr_port_exists(const struct net_device *dev)
{
	return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame;
}

/* Validate that @dev may be added as a non-master HSR port.
 *
 * Returns 0 when acceptable. Otherwise sets a message on @extack and returns
 * -EINVAL (loopback/non-Ethernet device, HSR master, existing HSR port,
 * VLAN device) or -EOPNOTSUPP (device flagged IFF_DONT_BRIDGE).
 */
static int hsr_check_dev_ok(struct net_device *dev,
			    struct netlink_ext_ack *extack)
{
	/* Don't allow HSR on non-ethernet like devices */
	if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
	    dev->addr_len != ETH_ALEN) {
		NL_SET_ERR_MSG_MOD(extack, "Cannot use loopback or non-ethernet device as HSR slave.");
		return -EINVAL;
	}

	/* Don't allow enslaving hsr devices */
	if (is_hsr_master(dev)) {
		NL_SET_ERR_MSG_MOD(extack, "Cannot create trees of HSR devices.");
		return -EINVAL;
	}

	if (hsr_port_exists(dev)) {
		NL_SET_ERR_MSG_MOD(extack, "This device is already a HSR slave.");
		return -EINVAL;
	}

	if (is_vlan_dev(dev)) {
		NL_SET_ERR_MSG_MOD(extack, "HSR on top of VLAN is not yet supported in this driver.");
		return -EINVAL;
	}

	if (dev->priv_flags & IFF_DONT_BRIDGE) {
		NL_SET_ERR_MSG_MOD(extack, "This device does not support bridging.");
		return -EOPNOTSUPP;
	}

	/* HSR over bonded devices has not been tested, but I'm not sure it
	 * won't work...
	 */

	return 0;
}

/* Setup device to be added to the HSR bridge.
 *
 * Steps, in order: enable promiscuous mode (unless forwarding is offloaded),
 * link @dev below the master's net_device, register hsr_handle_frame() as
 * rx_handler with @port as its data, then disable LRO. On failure the steps
 * already taken are undone in reverse order and the error is returned.
 */
static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev,
			     struct hsr_port *port,
			     struct netlink_ext_ack *extack)
{
	struct netdev_lag_upper_info lag_upper_info;
	struct net_device *hsr_dev;
	struct hsr_port *master;
	int res;

	/* Don't use promiscuous mode for offload since L2 frame forward
	 * happens at the offloaded hardware.
	 */
	if (!port->hsr->fwd_offloaded) {
		res = dev_set_promiscuity(dev, 1);
		if (res)
			return res;
	}

	master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
	hsr_dev = master->dev;

	lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_BROADCAST;
	lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN;

	res = netdev_master_upper_dev_link(dev, hsr_dev, NULL, &lag_upper_info, extack);
	if (res)
		goto fail_upper_dev_link;

	res = netdev_rx_handler_register(dev, hsr_handle_frame, port);
	if (res)
		goto fail_rx_handler;
	dev_disable_lro(dev);

	return 0;

fail_rx_handler:
	netdev_upper_dev_unlink(dev, hsr_dev);
fail_upper_dev_link:
	/* Mirror the conditional promiscuity increment taken above */
	if (!port->hsr->fwd_offloaded)
		dev_set_promiscuity(dev, -1);

	return res;
}

/* Create a port of @type for @dev and add it to @hsr's port list.
 *
 * Non-master devices are first validated with hsr_check_dev_ok() and, once
 * the port is on the list, configured with hsr_portdev_setup(). On success
 * the master device's features and MTU are refreshed.
 *
 * Returns 0 on success, -EBUSY if a port of @type already exists, -ENOMEM
 * if the port cannot be allocated, or the error from the validation/setup
 * helpers.
 */
int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
		 enum hsr_port_type type, struct netlink_ext_ack *extack)
{
	struct hsr_port *port, *master;
	int res;

	if (type != HSR_PT_MASTER) {
		res = hsr_check_dev_ok(dev, extack);
		if (res)
			return res;
	}

	port = hsr_port_get_hsr(hsr, type);
	if (port)
		return -EBUSY;	/* This port already exists */

	port = kzalloc_obj(*port);
	if (!port)
		return -ENOMEM;

	port->hsr = hsr;
	port->dev = dev;
	port->type = type;
	/* Remember the current address; hsr_del_port() restores it for the
	 * PRP_V1 slave B port.
	 */
	ether_addr_copy(port->original_macaddress, dev->dev_addr);

	list_add_tail_rcu(&port->port_list, &hsr->ports);

	if (type != HSR_PT_MASTER) {
		res = hsr_portdev_setup(hsr, dev, port, extack);
		if (res)
			goto fail_dev_setup;
	}

	master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
	netdev_update_features(master->dev);
	dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));

	return 0;

fail_dev_setup:
	/* The port was already published on the RCU list: unlink it and
	 * release it through kfree_rcu().
	 */
	list_del_rcu(&port->port_list);
	kfree_rcu(port, rcu);
	return res;
}

/* Remove @port from its HSR instance and release it via kfree_rcu().
 *
 * For non-master ports this also refreshes the master's features and MTU,
 * unregisters the rx_handler, drops the promiscuity reference (unless
 * forwarding is offloaded), unlinks the device from the master and, for the
 * PRP_V1 slave B port, restores the MAC address saved in hsr_add_port() and
 * raises NETDEV_CHANGEADDR.
 */
void hsr_del_port(struct hsr_port *port)
{
	struct hsr_priv *hsr;
	struct hsr_port *master;

	hsr = port->hsr;
	master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
	list_del_rcu(&port->port_list);

	if (port != master) {
		netdev_update_features(master->dev);
		dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));
		netdev_rx_handler_unregister(port->dev);
		if (!port->hsr->fwd_offloaded)
			dev_set_promiscuity(port->dev, -1);
		netdev_upper_dev_unlink(port->dev, master->dev);
		if (hsr->prot_version == PRP_V1 &&
		    port->type == HSR_PT_SLAVE_B) {
			eth_hw_addr_set(port->dev, port->original_macaddress);
			call_netdevice_notifiers(NETDEV_CHANGEADDR, port->dev);
		}
	}

	kfree_rcu(port, rcu);
}
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_COREDUMP_H #define _LINUX_SCHED_COREDUMP_H #include <linux/mm_types.h> #define SUID_DUMP_DISABLE 0 /* No setuid dumping */ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm) { /* * By convention, dumpable bits are contained in first 32 bits of the * bitmap, so we can simply access this first unsigned long directly. */ return __mm_flags_get_word(mm); } static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value) { __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value); } extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things * that are using this for checking for privilege transitions, it must * test against SUID_DUMP_USER rather than treating it as a boolean * value. */ static inline int __get_dumpable(unsigned long mm_flags) { return mm_flags & MMF_DUMPABLE_MASK; } static inline int get_dumpable(struct mm_struct *mm) { unsigned long flags = __mm_flags_get_dumpable(mm); return __get_dumpable(flags); } #endif /* _LINUX_SCHED_COREDUMP_H */
470 238 471 472 397 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 // SPDX-License-Identifier: GPL-2.0 /* * memfd_create system call and file sealing support * * Code was originally included in shmem.c, and broken out to facilitate * use by hugetlbfs as well as tmpfs. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/khugepaged.h> #include <linux/syscalls.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/memfd.h> #include <linux/pid_namespace.h> #include <uapi/linux/memfd.h> #include "swap.h" /* * We need a tag: a new tag would expand every xa_node by 8 bytes, * so reuse a tag which we firmly believe is never set or cleared on tmpfs * or hugetlbfs because they are memory only filesystems. */ #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ static bool memfd_folio_has_extra_refs(struct folio *folio) { return folio_ref_count(folio) != folio_expected_ref_count(folio); } static void memfd_tag_pins(struct xa_state *xas) { struct folio *folio; int latency = 0; lru_add_drain(); xas_lock_irq(xas); xas_for_each(xas, folio, ULONG_MAX) { if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) xas_set_mark(xas, MEMFD_TAG_PINNED); if (++latency < XA_CHECK_SCHED) continue; latency = 0; xas_pause(xas); xas_unlock_irq(xas); cond_resched(); xas_lock_irq(xas); } xas_unlock_irq(xas); } /* * This is a helper function used by memfd_pin_user_pages() in GUP (gup.c). * It is mainly called to allocate a folio in a memfd when the caller * (memfd_pin_folios()) cannot find a folio in the page cache at a given * index in the mapping. */ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { #ifdef CONFIG_HUGETLB_PAGE struct folio *folio; gfp_t gfp_mask; if (is_file_hugepages(memfd)) { /* * The folio would most likely be accessed by a DMA driver, * therefore, we have zone memory constraints where we can * alloc from. 
Also, the folio will be pinned for an indefinite * amount of time, so it is not expected to be migrated away. */ struct inode *inode = file_inode(memfd); struct hstate *h = hstate_file(memfd); int err = -ENOMEM; long nr_resv; gfp_mask = htlb_alloc_mask(h); gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE); idx >>= huge_page_order(h); nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, EMPTY_VMA_FLAGS); if (nr_resv < 0) return ERR_PTR(nr_resv); folio = alloc_hugetlb_folio_reserve(h, numa_node_id(), NULL, gfp_mask); if (folio) { u32 hash; /* * Zero the folio to prevent information leaks to userspace. * Use folio_zero_user() which is optimized for huge/gigantic * pages. Pass 0 as addr_hint since this is not a faulting path * and we don't have a user virtual address yet. */ folio_zero_user(folio, 0); /* * Mark the folio uptodate before adding to page cache, * as required by filemap.c and other hugetlb paths. */ __folio_mark_uptodate(folio); /* * Serialize hugepage allocation and instantiation to prevent * races with concurrent allocations, as required by all other * callers of hugetlb_add_to_page_cache(). */ hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); err = hugetlb_add_to_page_cache(folio, memfd->f_mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); if (err) { folio_put(folio); goto err_unresv; } hugetlb_set_folio_subpool(folio, subpool_inode(inode)); folio_unlock(folio); return folio; } err_unresv: if (nr_resv > 0) hugetlb_unreserve_pages(inode, idx, idx + 1, 0); return ERR_PTR(err); } #endif return shmem_read_folio(memfd->f_mapping, idx); } /* * Setting SEAL_WRITE requires us to verify there's no pending writer. However, * via get_user_pages(), drivers might have some pending I/O without any active * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios * and see whether it has an elevated ref-count. If so, we tag them and wait for * them to be dropped. 
* The caller must guarantee that no new user will acquire writable references * to those folios to avoid races. */ static int memfd_wait_for_pins(struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, 0); struct folio *folio; int error, scan; memfd_tag_pins(&xas); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { int latency = 0; if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; if (!scan) lru_add_drain_all(); else if (schedule_timeout_killable((HZ << scan) / 200)) scan = LAST_SCAN; xas_set(&xas, 0); xas_lock_irq(&xas); xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) { bool clear = true; if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still * found folios pinned. */ if (scan == LAST_SCAN) error = -EBUSY; else clear = false; } if (clear) xas_clear_mark(&xas, MEMFD_TAG_PINNED); if (++latency < XA_CHECK_SCHED) continue; latency = 0; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); } return error; } static unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; #ifdef CONFIG_HUGETLBFS if (is_file_hugepages(file)) return &HUGETLBFS_I(file_inode(file))->seals; #endif return NULL; } #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_EXEC | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ F_SEAL_WRITE | \ F_SEAL_FUTURE_WRITE) int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); unsigned int *file_seals; int error; /* * SEALING * Sealing allows multiple parties to share a tmpfs or hugetlbfs file * but restrict access to a specific subset of file operations. Seals * can only be added, but never removed. This way, mutually untrusted * parties can share common memory regions with a well-defined policy. * A malicious peer can thus never perform unwanted operations on a * shared object. 
* * Seals are only supported on special tmpfs or hugetlbfs files and * always affect the whole underlying inode. Once a seal is set, it * may prevent some kinds of access to the file. Currently, the * following seals are defined: * SEAL_SEAL: Prevent further seals from being set on this file * SEAL_SHRINK: Prevent the file from shrinking * SEAL_GROW: Prevent the file from growing * SEAL_WRITE: Prevent write access to the file * SEAL_EXEC: Prevent modification of the exec bits in the file mode * * As we don't require any trust relationship between two parties, we * must prevent seals from being removed. Therefore, sealing a file * only adds a given set of seals to the file, it never touches * existing seals. Furthermore, the "setting seals"-operation can be * sealed itself, which basically prevents any further seal from being * added. * * Semantics of sealing are only defined on volatile files. Only * anonymous tmpfs and hugetlbfs files support sealing. More * importantly, seals are never written to disk. Therefore, there's * no plan to support it on other file types. */ if (!(file->f_mode & FMODE_WRITE)) return -EPERM; if (seals & ~(unsigned int)F_ALL_SEALS) return -EINVAL; inode_lock(inode); file_seals = memfd_file_seals_ptr(file); if (!file_seals) { error = -EINVAL; goto unlock; } if (*file_seals & F_SEAL_SEAL) { error = -EPERM; goto unlock; } /* * SEAL_EXEC implies SEAL_WRITE, making W^X from the start. */ if (seals & F_SEAL_EXEC && inode->i_mode & 0111) seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { error = mapping_deny_writable(file->f_mapping); if (error) goto unlock; error = memfd_wait_for_pins(file->f_mapping); if (error) { mapping_allow_writable(file->f_mapping); goto unlock; } } *file_seals |= seals; error = 0; unlock: inode_unlock(inode); return error; } int memfd_get_seals(struct file *file) { unsigned int *seals = memfd_file_seals_ptr(file); return seals ? 
*seals : -EINVAL; } long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { long error; switch (cmd) { case F_ADD_SEALS: error = memfd_add_seals(file, arg); break; case F_GET_SEALS: error = memfd_get_seals(file); break; default: error = -EINVAL; break; } return error; } #define MFD_NAME_PREFIX "memfd:" #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC) static int check_sysctl_memfd_noexec(unsigned int *flags) { #ifdef CONFIG_SYSCTL struct pid_namespace *ns = task_active_pid_ns(current); int sysctl = pidns_memfd_noexec_scope(ns); if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) *flags |= MFD_NOEXEC_SEAL; else *flags |= MFD_EXEC; } if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { pr_err_ratelimited( "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n", current->comm, task_pid_nr(current), sysctl); return -EACCES; } #endif return 0; } static inline bool is_write_sealed(unsigned int seals) { return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); } static int check_write_seal(vm_flags_t *vm_flags_ptr) { vm_flags_t vm_flags = *vm_flags_ptr; vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE); /* If a private mapping then writability is irrelevant. */ if (!(mask & VM_SHARED)) return 0; /* * New PROT_WRITE and MAP_SHARED mmaps are not allowed when * write seals are active. */ if (mask & VM_WRITE) return -EPERM; /* * This is a read-only mapping, disallow mprotect() from making a * write-sealed mapping writable in future. */ *vm_flags_ptr &= ~VM_MAYWRITE; return 0; } int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr) { int err = 0; unsigned int *seals_ptr = memfd_file_seals_ptr(file); unsigned int seals = seals_ptr ? 
*seals_ptr : 0; if (is_write_sealed(seals)) err = check_write_seal(vm_flags_ptr); return err; } static int sanitize_flags(unsigned int *flags_ptr) { unsigned int flags = *flags_ptr; if (!(flags & MFD_HUGETLB)) { if (flags & ~MFD_ALL_FLAGS) return -EINVAL; } else { /* Allow huge page size encoding in flags. */ if (flags & ~(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) return -EINVAL; } /* Invalid if both EXEC and NOEXEC_SEAL are set.*/ if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) return -EINVAL; return check_sysctl_memfd_noexec(flags_ptr); } static char *alloc_name(const char __user *uname) { int error; char *name; long len; name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) return ERR_PTR(-ENOMEM); memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN); /* returned length does not include terminating zero */ len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1); if (len < 0) { error = -EFAULT; goto err_name; } else if (len > MFD_NAME_MAX_LEN) { error = -EINVAL; goto err_name; } return name; err_name: kfree(name); return ERR_PTR(error); } struct file *memfd_alloc_file(const char *name, unsigned int flags) { unsigned int *file_seals; struct file *file; struct inode *inode; int err = 0; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT), HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); } else { file = shmem_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT)); } if (IS_ERR(file)) return file; inode = file_inode(file); err = security_inode_init_security_anon(inode, &QSTR(MEMFD_ANON_NAME), NULL); if (err) { fput(file); file = ERR_PTR(err); return file; } file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; if (flags & MFD_NOEXEC_SEAL) { inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); if (file_seals) { *file_seals &= ~F_SEAL_SEAL; *file_seals |= F_SEAL_EXEC; } } else if (flags & MFD_ALLOW_SEALING) { /* MFD_EXEC and 
MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); if (file_seals) *file_seals &= ~F_SEAL_SEAL; } return file; } SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { char *name __free(kfree) = NULL; unsigned int fd_flags; int error; error = sanitize_flags(&flags); if (error < 0) return error; name = alloc_name(uname); if (IS_ERR(name)) return PTR_ERR(name); fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0; return FD_ADD(fd_flags, memfd_alloc_file(name, flags)); }
2493 24 2160 2160 2167 2156 13 13 13 7 7 5 8 13 13 13 13 13 1328 1327 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 
510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #include <linux/dcache.h> #include <linux/fs.h> #include <linux/gfp.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/srcu.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" /* * Clear all of the marks on an inode when it is being evicted from core */ void __fsnotify_inode_delete(struct inode *inode) { fsnotify_clear_marks_by_inode(inode); } EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); void __fsnotify_vfsmount_delete(struct vfsmount *mnt) { fsnotify_clear_marks_by_mount(mnt); } void __fsnotify_mntns_delete(struct mnt_namespace *mntns) { fsnotify_clear_marks_by_mntns(mntns); } void fsnotify_sb_delete(struct super_block *sb) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); /* Were any marks ever added to any object on this sb? 
*/ if (!sbinfo) return; fsnotify_unmount_inodes(sbinfo); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ wait_var_event(fsnotify_sb_watched_objects(sb), !atomic_long_read(fsnotify_sb_watched_objects(sb))); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT)); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT)); } void fsnotify_sb_free(struct super_block *sb) { if (sb->s_fsnotify_info) { WARN_ON_ONCE(!list_empty(&sb->s_fsnotify_info->inode_conn_list)); kfree(sb->s_fsnotify_info); } } /* * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. */ void fsnotify_set_children_dentry_flags(struct inode *inode) { struct dentry *alias; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ for_each_alias(alias, inode) { struct dentry *child; /* run all of the children of the original inode and fix their * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); hlist_for_each_entry(child, &alias->d_children, d_sib) { if (!child->d_inode) continue; spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); } /* * Lazily clear false positive PARENT_WATCHED flag for child whose parent had * stopped watching children. 
*/ static void fsnotify_clear_child_dentry_flag(struct inode *pinode, struct dentry *dentry) { spin_lock(&dentry->d_lock); /* * d_lock is a sufficient barrier to prevent observing a non-watched * parent state from before the fsnotify_set_children_dentry_flags() * or fsnotify_update_flags() call that had set PARENT_WATCHED. */ if (!fsnotify_inode_watches_children(pinode)) dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&dentry->d_lock); } /* Are inode/sb/mount interested in parent and name info with this event? */ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = 0; /* We only send parent/name to inode/sb/mount for events on non-dir */ if (mask & FS_ISDIR) return false; /* * All events that are possible on child can also may be reported with * parent/name info to inode/sb/mount. Otherwise, a watching parent * could result in events reported with unexpected name info to sb/mount. */ BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT); /* Did either inode/sb/mount subscribe for events with parent/name? */ marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_sb->s_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask(mnt_mask); /* Did they subscribe for this event with parent/name info? */ return mask & marks_mask; } /* Are there any inode/mount/sb objects that watch for these events? 
*/ static inline __u32 fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask | READ_ONCE(inode->i_sb->s_fsnotify_mask); return mask & marks_mask & ALL_FSNOTIFY_EVENTS; } /* Report pre-content event with optional range info */ int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count) { struct file_range range; /* Report page aligned range only when pos is known */ if (!ppos) return fsnotify_path(path, FS_PRE_ACCESS); range.path = path; range.pos = PAGE_ALIGN_DOWN(*ppos); range.count = PAGE_ALIGN(*ppos + count) - range.pos; return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range, FSNOTIFY_EVENT_FILE_RANGE); } /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with * parent and name info. * * Notify only the child without name info if parent is not watching and * inode/sb/mount are not interested in events with parent and name info. */ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { const struct path *path = fsnotify_data_path(data, data_type); __u32 mnt_mask = path ? READ_ONCE(real_mount(path->mnt)->mnt_fsnotify_mask) : 0; struct inode *inode = d_inode(dentry); struct dentry *parent; bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; bool parent_needed, parent_interested; __u32 p_mask; struct inode *p_inode = NULL; struct name_snapshot name; struct qstr *file_name = NULL; int ret = 0; /* Optimize the likely case of nobody watching this path */ if (likely(!parent_watched && !fsnotify_object_watched(inode, mnt_mask, mask))) return 0; parent = NULL; parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask); if (!parent_watched && !parent_needed) goto notify; /* Does parent inode care about events on children? 
*/ parent = dget_parent(dentry); p_inode = parent->d_inode; p_mask = fsnotify_inode_watches_children(p_inode); if (unlikely(parent_watched && !p_mask)) fsnotify_clear_child_dentry_flag(p_inode, dentry); /* * Include parent/name in notification either if some notification * groups require parent info or the parent is interested in this event. * The parent interest in ACCESS/MODIFY events does not apply to special * files, where read/write are not on the filesystem of the parent and * events can provide an undesirable side-channel for information * exfiltration. */ parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS && !(data_type == FSNOTIFY_EVENT_PATH && d_is_special(dentry) && (mask & (FS_ACCESS | FS_MODIFY))); if (parent_needed || parent_interested) { /* When notifying parent, child should be passed as data */ WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type)); /* Notify both parent and child with child name info */ take_dentry_name_snapshot(&name, dentry); file_name = &name.name; if (parent_interested) mask |= FS_EVENT_ON_CHILD; } notify: ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0); if (file_name) release_dentry_name_snapshot(&name); dput(parent); return ret; } EXPORT_SYMBOL_GPL(__fsnotify_parent); static int fsnotify_handle_inode_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct inode *inode = fsnotify_data_inode(data, data_type); const struct fsnotify_ops *ops = group->ops; if (WARN_ON_ONCE(!ops->handle_inode_event)) return 0; if (WARN_ON_ONCE(!inode && !dir)) return 0; if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; /* Check interest of this mark in case event was sent with two marks */ if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)) return 0; return 
ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie);
}

/*
 * Default event delivery, used by send_to_group() when the group does not
 * provide its own ->handle_event() callback.
 *
 * The event is passed to fsnotify_handle_inode_event() first for the parent
 * mark (if one was selected, always with a zero cookie) and then for the
 * inode mark (if one was selected).  A non-zero return from the parent
 * delivery is returned immediately and the inode mark is not notified.
 *
 * Returns 0 without delivering anything if a sb or vfsmount mark was
 * selected (this also triggers a one-time warning), or if an FS_RENAME
 * event moved the object to a different parent directory.
 */
static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
				 const void *data, int data_type,
				 struct inode *dir, const struct qstr *name,
				 u32 cookie, struct fsnotify_iter_info *iter_info)
{
	struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
	struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info);
	int ret;

	if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) ||
	    WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info)))
		return 0;

	/*
	 * For FS_RENAME, 'dir' is old dir and 'data' is new dentry.
	 * The only ->handle_inode_event() backend that supports FS_RENAME is
	 * dnotify, where it means file was renamed within same parent.
	 */
	if (mask & FS_RENAME) {
		struct dentry *moved = fsnotify_data_dentry(data, data_type);

		if (dir != moved->d_parent->d_inode)
			return 0;
	}

	if (parent_mark) {
		ret = fsnotify_handle_inode_event(group, parent_mark, mask,
						  data, data_type, dir, name, 0);
		if (ret)
			return ret;
	}

	if (!inode_mark)
		return 0;

	/*
	 * Some events can be sent on both parent dir and child marks (e.g.
	 * FS_ATTRIB).  If both parent dir and child are watching, report the
	 * event once to parent dir with name (if interested) and once to child
	 * without name (if interested).
	 *
	 * In any case regardless whether the parent is watching or not, the
	 * child watcher is expecting an event without the FS_EVENT_ON_CHILD
	 * flag. The file name is expected if and only if this is a directory
	 * event.
	 */
	mask &= ~FS_EVENT_ON_CHILD;
	if (!(mask & ALL_FSNOTIFY_DIRENT_EVENTS)) {
		dir = NULL;
		name = NULL;
	}

	return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type,
					   dir, name, cookie);
}

/*
 * Deliver one event to the group whose marks are currently selected for
 * reporting in @iter_info.
 *
 * On FS_MODIFY, the ignore mask of every reported mark that does not carry
 * FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY is cleared first.  The masks and
 * effective ignore masks of the reported marks are then OR-ed together and
 * the event is dropped (return 0) unless at least one requested event bit
 * is watched and not ignored.
 *
 * Returns 0 if nothing was reported, otherwise the return value of the
 * group's ->handle_event() callback or of fsnotify_handle_event().
 */
static int send_to_group(__u32 mask, const void *data, int data_type,
			 struct inode *dir, const struct qstr *file_name,
			 u32 cookie, struct fsnotify_iter_info *iter_info)
{
	struct fsnotify_group *group = NULL;
	__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
	__u32 marks_mask = 0;
	__u32 marks_ignore_mask = 0;
	bool is_dir = mask & FS_ISDIR;
	struct fsnotify_mark *mark;
	int type;

	/* No mark type was selected for reporting in this iteration step */
	if (!iter_info->report_mask)
		return 0;

	/* clear ignored on inode modification */
	if (mask & FS_MODIFY) {
		fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
			if (!(mark->flags &
			      FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
				mark->ignore_mask = 0;
		}
	}

	/* Are any of the group marks interested in this event? */
	fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
		group = mark->group;
		marks_mask |= mark->mask;
		marks_ignore_mask |=
			fsnotify_effective_ignore_mask(mark, is_dir, type);
	}

	pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
		 __func__, group, mask, marks_mask, marks_ignore_mask,
		 data, data_type, dir, cookie);

	if (!(test_mask & marks_mask & ~marks_ignore_mask))
		return 0;

	/* A group-specific handler takes precedence over the default one */
	if (group->ops->handle_event) {
		return group->ops->handle_event(group, mask, data, data_type, dir,
						file_name, cookie, iter_info);
	}

	return fsnotify_handle_event(group, mask, data, data_type, dir,
				     file_name, cookie, iter_info);
}

/*
 * Return the first mark on the list of the connector that *@connp points
 * to, or NULL if there is no connector or its list is empty.  Both pointers
 * are read with srcu_dereference() against fsnotify_mark_srcu.
 */
static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp)
{
	struct fsnotify_mark_connector *conn;
	struct hlist_node *node = NULL;

	conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
	if (conn)
		node = srcu_dereference(conn->list.first, &fsnotify_mark_srcu);

	return hlist_entry_safe(node, struct fsnotify_mark, obj_list);
}

/*
 * Return the mark that follows @mark on its object list, or NULL if @mark
 * is NULL or is the last entry.  The next pointer is read with
 * srcu_dereference() against fsnotify_mark_srcu.
 */
struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
{
	struct hlist_node *node = NULL;

	if (mark)
		node = srcu_dereference(mark->obj_list.next,
					&fsnotify_mark_srcu);

	return hlist_entry_safe(node, struct fsnotify_mark, obj_list);
}

/*
 * iter_info is a multi head priority queue of marks.
 * Pick a subset of marks from queue heads, all with the same group
 * and set the report_mask to a subset of the selected marks.
 * Returns false if there are no more groups to iterate.
 */
static bool fsnotify_iter_select_report_types(
		struct fsnotify_iter_info *iter_info)
{
	struct fsnotify_group *max_prio_group = NULL;
	struct fsnotify_mark *mark;
	int type;

	/* Choose max prio group among groups of all queue heads */
	fsnotify_foreach_iter_type(type) {
		mark = iter_info->marks[type];
		if (mark &&
		    fsnotify_compare_groups(max_prio_group, mark->group) > 0)
			max_prio_group = mark->group;
	}

	/* Every queue head is empty: iteration is finished */
	if (!max_prio_group)
		return false;

	/* Set the report mask for marks from same group as max prio group */
	iter_info->current_group = max_prio_group;
	iter_info->report_mask = 0;
	fsnotify_foreach_iter_type(type) {
		mark = iter_info->marks[type];
		if (mark && mark->group == iter_info->current_group) {
			/*
			 * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode
			 * is watching children and interested in this event,
			 * which is an event possible on child.
			 * But is *this mark* watching children?
			 */
			if (type == FSNOTIFY_ITER_TYPE_PARENT &&
			    !(mark->mask & FS_EVENT_ON_CHILD) &&
			    !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD))
				continue;

			fsnotify_iter_set_report_type(iter_info, type);
		}
	}

	return true;
}

/*
 * Pop from iter_info multi head queue, the marks that belong to the group of
 * current iteration step.
 */
static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
{
	struct fsnotify_mark *mark;
	int type;

	/*
	 * We cannot use fsnotify_foreach_iter_mark_type() here because we
	 * may need to advance a mark of type X that belongs to current_group
	 * but was not selected for reporting.
	 */
	fsnotify_foreach_iter_type(type) {
		mark = iter_info->marks[type];
		if (mark && mark->group == iter_info->current_group)
			iter_info->marks[type] =
				fsnotify_next_mark(iter_info->marks[type]);
	}
}

/*
 * fsnotify - This is the main call to fsnotify.
 *
 * The VFS calls into hook specific functions in linux/fsnotify.h.
 * Those functions then in turn call here.  Here will call out to all of the
 * registered fsnotify_group.  Those groups can then use the notification event
 * in whatever means they feel necessary.
 *
 * @mask:	event type and flags
 * @data:	object that event happened on
 * @data_type:	type of object for fsnotify_data_XXX() accessors
 * @dir:	optional directory associated with event -
 *		if @file_name is not NULL, this is the directory that
 *		@file_name is relative to
 * @file_name:	optional file name associated with event
 * @inode:	optional inode associated with event -
 *		If @dir and @inode are both non-NULL, event may be
 *		reported to both.
 * @cookie:	inotify rename cookie
 *
 * Returns 0, except for events in ALL_FSNOTIFY_PERM_EVENTS where the first
 * non-zero result from a group stops the iteration and is returned.
 */
int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
	     const struct qstr *file_name, struct inode *inode, u32 cookie)
{
	const struct path *path = fsnotify_data_path(data, data_type);
	struct super_block *sb = fsnotify_data_sb(data, data_type);
	const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type);
	struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL;
	struct fsnotify_iter_info iter_info = {};
	struct mount *mnt = NULL;
	struct inode *inode2 = NULL;
	struct dentry *moved;
	/* Only assigned, and only read, when inode2 is non-NULL */
	int inode2_type;
	int ret = 0;
	__u32 test_mask, marks_mask = 0;

	if (path)
		mnt = real_mount(path->mnt);

	if (!inode) {
		/* Dirent event - report on TYPE_INODE to dir */
		inode = dir;
		/* For FS_RENAME, inode is old_dir and inode2 is new_dir */
		if (mask & FS_RENAME) {
			moved = fsnotify_data_dentry(data, data_type);
			inode2 = moved->d_parent->d_inode;
			inode2_type = FSNOTIFY_ITER_TYPE_INODE2;
		}
	} else if (mask & FS_EVENT_ON_CHILD) {
		/*
		 * Event on child - report on TYPE_PARENT to dir if it is
		 * watching children and on TYPE_INODE to child.
		 */
		inode2 = dir;
		inode2_type = FSNOTIFY_ITER_TYPE_PARENT;
	}

	/*
	 * Optimization: srcu_read_lock() has a memory barrier which can
	 * be expensive.  It protects walking the *_fsnotify_marks lists.
	 * However, if we do not walk the lists, we do not have to do
	 * SRCU because we have no references to any objects and do not
	 * need SRCU to keep them "alive".
	 */
	if ((!sbinfo || !sbinfo->sb_marks) &&
	    (!mnt || !mnt->mnt_fsnotify_marks) &&
	    (!inode || !inode->i_fsnotify_marks) &&
	    (!inode2 || !inode2->i_fsnotify_marks) &&
	    (!mnt_data || !mnt_data->ns->n_fsnotify_marks))
		return 0;

	/* Union of the event masks of every object the event may be reported on */
	if (sb)
		marks_mask |= READ_ONCE(sb->s_fsnotify_mask);
	if (mnt)
		marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask);
	if (inode)
		marks_mask |= READ_ONCE(inode->i_fsnotify_mask);
	if (inode2)
		marks_mask |= READ_ONCE(inode2->i_fsnotify_mask);
	if (mnt_data)
		marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask);

	/*
	 * If this is a modify event we may need to clear some ignore masks.
	 * In that case, the object with ignore masks will have the FS_MODIFY
	 * event in its mask.
	 * Otherwise, return if none of the marks care about this type of event.
	 */
	test_mask = (mask & ALL_FSNOTIFY_EVENTS);
	if (!(test_mask & marks_mask))
		return 0;

	iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);

	/* Seed each queue head with the first mark of the matching object */
	if (sbinfo) {
		iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
			fsnotify_first_mark(&sbinfo->sb_marks);
	}
	if (mnt) {
		iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] =
			fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
	}
	if (inode) {
		iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] =
			fsnotify_first_mark(&inode->i_fsnotify_marks);
	}
	if (inode2) {
		iter_info.marks[inode2_type] =
			fsnotify_first_mark(&inode2->i_fsnotify_marks);
	}
	if (mnt_data) {
		iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] =
			fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks);
	}

	/*
	 * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
	 * ignore masks are properly reflected for mount/sb mark notifications.
	 * That's why this traversal is so complicated...
	 */
	while (fsnotify_iter_select_report_types(&iter_info)) {
		ret = send_to_group(mask, data, data_type, dir, file_name,
				    cookie, &iter_info);

		/* Only permission events propagate a group's result */
		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
			goto out;

		fsnotify_iter_next(&iter_info);
	}
	ret = 0;
out:
	srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx);

	return ret;
}
EXPORT_SYMBOL_GPL(fsnotify);

#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
/*
 * At open time we check fsnotify_sb_has_priority_watchers(), call the open perm
 * hook and set the FMODE_NONOTIFY_ mode bits accordingly.
 * Later, fsnotify permission hooks do not check if there are permission event
 * watches, but that there were permission event watches at open time.
 *
 * Returns 0, or the non-zero result of fsnotify_path() for the
 * FS_OPEN_EXEC_PERM / FS_OPEN_PERM events sent from here.
 */
int fsnotify_open_perm_and_set_mode(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry, *parent;
	struct super_block *sb = dentry->d_sb;
	__u32 mnt_mask, p_mask = 0;

	/* Is it a file opened by fanotify? */
	if (FMODE_FSNOTIFY_NONE(file->f_mode))
		return 0;

	/*
	 * Permission events is a super set of pre-content events, so if there
	 * are no permission event watchers, there are also no pre-content event
	 * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit.
	 */
	if (likely(!fsnotify_sb_has_priority_watchers(sb,
						FSNOTIFY_PRIO_CONTENT))) {
		file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);
		return 0;
	}

	/*
	 * OK, there are some permission event watchers. Check if anybody is
	 * watching for permission events on *this* file.
	 */
	mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
	p_mask = fsnotify_object_watched(d_inode(dentry), mnt_mask,
					 ALL_FSNOTIFY_PERM_EVENTS);
	if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
		parent = dget_parent(dentry);
		p_mask |= fsnotify_inode_watches_children(d_inode(parent));
		dput(parent);
	}

	/*
	 * Legacy FAN_ACCESS_PERM events have very high performance overhead,
	 * so unlikely to be used in the wild. If they are used there will be
	 * no optimizations at all.
	 */
	if (unlikely(p_mask & FS_ACCESS_PERM)) {
		/* Enable all permission and pre-content events */
		file_set_fsnotify_mode(file, 0);
		goto open_perm;
	}

	/*
	 * Pre-content events are only supported on regular files.
	 * If there are pre-content event watchers and no permission access
	 * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that.
	 * That is the common case with HSM service.
	 */
	if (d_is_reg(dentry) && (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)) {
		file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM);
		goto open_perm;
	}

	/* Nobody watching permission and pre-content events on this file */
	file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);

open_perm:
	/*
	 * Send open perm events depending on object masks and regardless of
	 * FMODE_NONOTIFY_PERM.
	 */
	if (file->f_flags & __FMODE_EXEC && p_mask & FS_OPEN_EXEC_PERM) {
		int ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM);

		if (ret)
			return ret;
	}

	if (p_mask & FS_OPEN_PERM)
		return fsnotify_path(&file->f_path, FS_OPEN_PERM);

	return 0;
}
#endif

/*
 * Report the mount event @mask for @mnt to the marks of mount namespace @ns.
 *
 * The namespace and the mount's unique id are packed into a struct
 * fsnotify_mnt which is passed to fsnotify() as FSNOTIFY_EVENT_MNT data.
 * Warns once and returns if @ns is NULL.
 */
void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt)
{
	struct fsnotify_mnt data = {
		.ns = ns,
		.mnt_id = real_mount(mnt)->mnt_id_unique,
	};

	if (WARN_ON_ONCE(!ns))
		return;

	/*
	 * This is an optimization as well as making sure fsnotify_init() has
	 * been called.
	 */
	if (!ns->n_fsnotify_marks)
		return;

	fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0);
}

/*
 * Core initcall: set up fsnotify_mark_srcu (panics if that fails) and the
 * connector caches.  The BUILD_BUG_ON breaks the build when the number of
 * bits set in ALL_FSNOTIFY_BITS is not 26.
 */
static __init int fsnotify_init(void)
{
	int ret;

	BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26);

	ret = init_srcu_struct(&fsnotify_mark_srcu);
	if (ret)
		panic("initializing fsnotify_mark_srcu");

	fsnotify_init_connector_caches();

	return 0;
}
core_initcall(fsnotify_init);
126 2 123 6 121 4 121 91 90 3 3 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 
519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/ip.h> #include <linux/sctp.h> #include <net/ip.h> #include <net/ip6_checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <net/sctp/checksum.h> #include <net/ip_vs.h> static int sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, unsigned int sctphoff); static int sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { struct ip_vs_service *svc; struct sctp_chunkhdr _schunkh, *sch; struct sctphdr *sh, _sctph; __be16 _ports[2], *ports = NULL; if (likely(!ip_vs_iph_icmp(iph))) { sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); if (sh) { sch = skb_header_pointer(skb, iph->len + sizeof(_sctph), sizeof(_schunkh), &_schunkh); if (sch) { if (sch->type == SCTP_CID_ABORT || !(sysctl_sloppy_sctp(ipvs) || sch->type == SCTP_CID_INIT)) return 1; ports = &sh->source; } } } else { ports = skb_header_pointer( skb, iph->len, sizeof(_ports), &_ports); } if (!ports) { *verdict = NF_DROP; return 0; } if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->daddr, ports[1]); else svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->saddr, ports[0]); if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ *verdict = NF_DROP; return 0; } /* * Let the virtual server select a real server for the * incoming connection, and create a connection entry. 
*/ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; return 0; } } /* NF_ACCEPT */ return 1; } static void sctp_nat_csum(struct sk_buff *skb, struct sctphdr *sctph, unsigned int sctphoff) { sctph->checksum = sctp_compute_cksum(skb, sctphoff); skb->ip_summed = CHECKSUM_UNNECESSARY; } static int sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct sctphdr *sctph; unsigned int sctphoff = iph->len; bool payload_csum = false; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!sctp_csum_check(cp->af, skb, pp, sctphoff)) return 0; /* Call application helper if needed */ ret = ip_vs_app_pkt_out(cp, skb, iph); if (ret == 0) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 2) payload_csum = true; } sctph = (void *) skb_network_header(skb) + sctphoff; /* Only update csum if we really have to */ if (sctph->source != cp->vport || payload_csum || skb->ip_summed == CHECKSUM_PARTIAL) { sctph->source = cp->vport; if (!skb_is_gso(skb)) sctp_nat_csum(skb, sctph, sctphoff); } else { skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct sctphdr *sctph; unsigned int sctphoff = iph->len; bool payload_csum = false; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!sctp_csum_check(cp->af, skb, pp, sctphoff)) return 0; 
/* Call application helper if needed */ ret = ip_vs_app_pkt_in(cp, skb, iph); if (ret == 0) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 2) payload_csum = true; } sctph = (void *) skb_network_header(skb) + sctphoff; /* Only update csum if we really have to */ if (sctph->dest != cp->dport || payload_csum || (skb->ip_summed == CHECKSUM_PARTIAL && !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) { sctph->dest = cp->dport; if (!skb_is_gso(skb)) sctp_nat_csum(skb, sctph, sctphoff); } else if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, unsigned int sctphoff) { struct sctphdr *sh; __le32 cmp, val; sh = (struct sctphdr *)(skb->data + sctphoff); cmp = sh->checksum; val = sctp_compute_cksum(skb, sctphoff); if (val != cmp) { /* CRC failure, dump it. */ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } return 1; } enum ipvs_sctp_event_t { IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */ IP_VS_SCTP_INIT, IP_VS_SCTP_INIT_ACK, IP_VS_SCTP_COOKIE_ECHO, IP_VS_SCTP_COOKIE_ACK, IP_VS_SCTP_SHUTDOWN, IP_VS_SCTP_SHUTDOWN_ACK, IP_VS_SCTP_SHUTDOWN_COMPLETE, IP_VS_SCTP_ERROR, IP_VS_SCTP_ABORT, IP_VS_SCTP_EVENT_LAST }; /* RFC 2960, 3.2 Chunk Field Descriptions */ static __u8 sctp_events[] = { [SCTP_CID_DATA] = IP_VS_SCTP_DATA, [SCTP_CID_INIT] = IP_VS_SCTP_INIT, [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK, [SCTP_CID_SACK] = IP_VS_SCTP_DATA, [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA, [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA, [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT, [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN, [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK, [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR, [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO, [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK, [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA, [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA, [SCTP_CID_SHUTDOWN_COMPLETE] = 
IP_VS_SCTP_SHUTDOWN_COMPLETE, }; /* SCTP States: * See RFC 2960, 4. SCTP Association State Diagram * * New states (not in diagram): * - INIT1 state: use shorter timeout for dropped INIT packets * - REJECTED state: use shorter timeout if INIT is rejected with ABORT * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging * * The states are as seen in real server. In the diagram, INIT1, INIT, * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state. * * States as per packets from client (C) and server (S): * * Setup of client connection: * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK * * Setup of server connection: * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK */ #define sNO IP_VS_SCTP_S_NONE #define sI1 IP_VS_SCTP_S_INIT1 #define sIN IP_VS_SCTP_S_INIT #define sCS IP_VS_SCTP_S_COOKIE_SENT #define sCR IP_VS_SCTP_S_COOKIE_REPLIED #define sCW IP_VS_SCTP_S_COOKIE_WAIT #define sCO IP_VS_SCTP_S_COOKIE #define sCE IP_VS_SCTP_S_COOKIE_ECHOED #define sES IP_VS_SCTP_S_ESTABLISHED #define sSS IP_VS_SCTP_S_SHUTDOWN_SENT #define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED #define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT #define sRJ IP_VS_SCTP_S_REJECTED #define sCL IP_VS_SCTP_S_CLOSED static const __u8 sctp_states [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = { { /* INPUT */ /* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ /* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, /* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_e 
*/{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL}, /* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, /* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL}, /* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL}, /* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL}, /* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, }, { /* OUTPUT */ /* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ /* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW}, /* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, /* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL}, /* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL}, /* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, }, { /* INPUT-ONLY */ /* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ /* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, /* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, /* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, 
sSR, sSS, sSR, sSA, sRJ, sCL}, /* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL}, /* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL}, /* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, }, }; #define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ) /* Timeout table[state] */ static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { [IP_VS_SCTP_S_NONE] = 2 * HZ, [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ, [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE_WAIT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ, [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ, [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_LAST] = 2 * HZ, }; static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = { [IP_VS_SCTP_S_NONE] = "NONE", [IP_VS_SCTP_S_INIT1] = "INIT1", [IP_VS_SCTP_S_INIT] = "INIT", [IP_VS_SCTP_S_COOKIE_SENT] = "C-SENT", [IP_VS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED", [IP_VS_SCTP_S_COOKIE_WAIT] = "C-WAIT", [IP_VS_SCTP_S_COOKIE] = "COOKIE", [IP_VS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED", [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED", [IP_VS_SCTP_S_SHUTDOWN_SENT] = "S-SENT", [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED", [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT", [IP_VS_SCTP_S_REJECTED] = "REJECTED", [IP_VS_SCTP_S_CLOSED] = "CLOSED", [IP_VS_SCTP_S_LAST] = "BUG!", }; static const char *sctp_state_name(int state) { if (state >= IP_VS_SCTP_S_LAST) return "ERR!"; if (sctp_state_name_table[state]) return sctp_state_name_table[state]; return "?"; } 
static inline void set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, const struct sk_buff *skb) { struct sctp_chunkhdr _sctpch, *sch; unsigned char chunk_type; int event, next_state; int ihl, cofs; #ifdef CONFIG_IP_VS_IPV6 ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); #else ihl = ip_hdrlen(skb); #endif cofs = ihl + sizeof(struct sctphdr); sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch); if (sch == NULL) return; chunk_type = sch->type; /* * Section 3: Multiple chunks can be bundled into one SCTP packet * up to the MTU size, except for the INIT, INIT ACK, and * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with * any other chunk in a packet. * * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be * bundled with an ABORT, but they MUST be placed before the ABORT * in the SCTP packet or they will be ignored by the receiver. */ if ((sch->type == SCTP_CID_COOKIE_ECHO) || (sch->type == SCTP_CID_COOKIE_ACK)) { int clen = ntohs(sch->length); if (clen >= sizeof(_sctpch)) { sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4), sizeof(_sctpch), &_sctpch); if (sch && sch->type == SCTP_CID_ABORT) chunk_type = sch->type; } } event = (chunk_type < sizeof(sctp_events)) ? sctp_events[chunk_type] : IP_VS_SCTP_DATA; /* Update direction to INPUT_ONLY if necessary * or delete NO_OUTPUT flag if output packet detected */ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { if (direction == IP_VS_DIR_OUTPUT) cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; else direction = IP_VS_DIR_INPUT_ONLY; } next_state = sctp_states[direction][event][cp->state]; if (next_state != cp->state) { struct ip_vs_dest *dest = cp->dest; IP_VS_DBG_BUF(8, "%s %s %s:%d->" "%s:%d state: %s->%s conn->refcnt:%d\n", pd->pp->name, ((direction == IP_VS_DIR_OUTPUT) ? 
"output " : "input "), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), sctp_state_name(cp->state), sctp_state_name(next_state), refcount_read(&cp->refcnt)); if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && (next_state != IP_VS_SCTP_S_ESTABLISHED)) { atomic_dec(&dest->activeconns); atomic_inc(&dest->inactconns); cp->flags |= IP_VS_CONN_F_INACTIVE; } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && (next_state == IP_VS_SCTP_S_ESTABLISHED)) { atomic_inc(&dest->activeconns); atomic_dec(&dest->inactconns); cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } if (next_state == IP_VS_SCTP_S_ESTABLISHED) ip_vs_control_assure_ct(cp); } if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = next_state]; else /* What to do ? */ cp->timeout = sctp_timeouts[cp->state = next_state]; } static void sctp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { spin_lock_bh(&cp->lock); set_sctp_state(pd, cp, direction, skb); spin_unlock_bh(&cp->lock); } static inline __u16 sctp_app_hashkey(__be16 port) { return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port) & SCTP_APP_TAB_MASK; } static int sctp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); hash = sctp_app_hashkey(port); list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]); atomic_inc(&pd->appcnt); out: return ret; } static void sctp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); } static int sctp_app_conn_bind(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app 
*inc; int result = 0; /* Default binding: bind app only for NAT */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return 0; /* Lookup application incarnations and bind the right one */ hash = sctp_app_hashkey(cp->vport); list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" "%s:%u to app %s on port %u\n", __func__, IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); break; } } return result; } /* --------------------------------------------- * timeouts is netns related now. * --------------------------------------------- */ static int __ip_vs_sctp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, sizeof(sctp_timeouts)); if (!pd->timeout_table) return -ENOMEM; return 0; } static void __ip_vs_sctp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } struct ip_vs_protocol ip_vs_protocol_sctp = { .name = "SCTP", .protocol = IPPROTO_SCTP, .num_states = IP_VS_SCTP_S_LAST, .dont_defrag = 0, .init = NULL, .exit = NULL, .init_netns = __ip_vs_sctp_init, .exit_netns = __ip_vs_sctp_exit, .register_app = sctp_register_app, .unregister_app = sctp_unregister_app, .conn_schedule = sctp_conn_schedule, .conn_in_get = ip_vs_conn_in_get_proto, .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = sctp_snat_handler, .dnat_handler = sctp_dnat_handler, .state_name = sctp_state_name, .state_transition = sctp_state_transition, .app_conn_bind = sctp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, };
86 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SCTP kernel reference Implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This file is part of the SCTP kernel reference Implementation
 *
 * Various protocol defined structures.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Or submit a bug report through the following website:
 *    http://www.sf.net/projects/lksctp
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Jon Grimm <jgrimm@us.ibm.com>
 *    Xingang Guo <xingang.guo@intel.com>
 *    randall@sctp.chicago.il.us
 *    kmorneau@cisco.com
 *    qxie1@email.mot.com
 *    Sridhar Samudrala <sri@us.ibm.com>
 *    Kevin Gao <kevin.gao@intel.com>
 *
 * Any bugs reported given to us we will try to fix... any fixes shared will
 * be incorporated into the next SCTP release.
 */
#ifndef __LINUX_SCTP_H__
#define __LINUX_SCTP_H__

#include <linux/in.h>		/* We need in_addr. */
#include <linux/in6.h>		/* We need in6_addr. */
#include <linux/skbuff.h>

#include <uapi/linux/sctp.h>

/* Section 3.1. SCTP Common Header Format */
struct sctphdr {
	__be16 source;
	__be16 dest;
	__be32 vtag;
	__le32 checksum;
};

/* Return the skb's transport header pointer, viewed as an SCTP common header. */
static inline struct sctphdr *sctp_hdr(const struct sk_buff *skb)
{
	return (struct sctphdr *)skb_transport_header(skb);
}

/* Section 3.2. Chunk Field Descriptions. */
struct sctp_chunkhdr {
	__u8 type;
	__u8 flags;
	__be16 length;
};

/* Section 3.2. Chunk Type Values.
 * [Chunk Type] identifies the type of information contained in the Chunk
 * Value field. It takes a value from 0 to 254. The value of 255 is
 * reserved for future use as an extension field.
 */
enum sctp_cid {
	SCTP_CID_DATA = 0,
	SCTP_CID_INIT = 1,
	SCTP_CID_INIT_ACK = 2,
	SCTP_CID_SACK = 3,
	SCTP_CID_HEARTBEAT = 4,
	SCTP_CID_HEARTBEAT_ACK = 5,
	SCTP_CID_ABORT = 6,
	SCTP_CID_SHUTDOWN = 7,
	SCTP_CID_SHUTDOWN_ACK = 8,
	SCTP_CID_ERROR = 9,
	SCTP_CID_COOKIE_ECHO = 10,
	SCTP_CID_COOKIE_ACK = 11,
	SCTP_CID_ECN_ECNE = 12,
	SCTP_CID_ECN_CWR = 13,
	SCTP_CID_SHUTDOWN_COMPLETE = 14,

	/* AUTH Extension Section 4.1 */
	SCTP_CID_AUTH = 0x0F,

	/* sctp ndata 5.1. I-DATA */
	SCTP_CID_I_DATA = 0x40,

	/* PR-SCTP Sec 3.2 */
	SCTP_CID_FWD_TSN = 0xC0,

	/* Use hex, as defined in ADDIP sec. 3.1 */
	SCTP_CID_ASCONF = 0xC1,
	SCTP_CID_I_FWD_TSN = 0xC2,
	SCTP_CID_ASCONF_ACK = 0x80,
	SCTP_CID_RECONF = 0x82,
	SCTP_CID_PAD = 0x84,
}; /* enum */

/* Section 3.2
 * Chunk Types are encoded such that the highest-order two bits specify
 * the action that must be taken if the processing endpoint does not
 * recognize the Chunk Type.
 */
enum {
	SCTP_CID_ACTION_DISCARD = 0x00,
	SCTP_CID_ACTION_DISCARD_ERR = 0x40,
	SCTP_CID_ACTION_SKIP = 0x80,
	SCTP_CID_ACTION_SKIP_ERR = 0xc0,
};

enum { SCTP_CID_ACTION_MASK = 0xc0, };

/* This flag is used in Chunk Flags for ABORT and SHUTDOWN COMPLETE.
 *
 * 3.3.7 Abort Association (ABORT) (6):
 *    The T bit is set to 0 if the sender had a TCB that it destroyed.
 *    If the sender did not have a TCB it should set this bit to 1.
 */
enum { SCTP_CHUNK_FLAG_T = 0x01 };

/*
 * Set the T bit
 *
 *      0                   1                   2                   3
 *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |   Type = 14   |Reserved     |T|      Length = 4               |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * Chunk Flags: 8 bits
 *
 *   Reserved: 7 bits
 *     Set to 0 on transmit and ignored on receipt.
 *
 *   T bit: 1 bit
 *     The T bit is set to 0 if the sender had a TCB that it destroyed. If
 *     the sender did NOT have a TCB it should set this bit to 1.
 *
 * Note: Special rules apply to this chunk for verification, please
 * see Section 8.5.1 for details.
 */

#define sctp_test_T_bit(c) ((c)->chunk_hdr->flags & SCTP_CHUNK_FLAG_T)

/* RFC 2960
 * Section 3.2.1 Optional/Variable-length Parameter Format.
 */

struct sctp_paramhdr {
	__be16 type;
	__be16 length;
};

enum sctp_param {

	/* RFC 2960 Section 3.3.5 */
	SCTP_PARAM_HEARTBEAT_INFO = cpu_to_be16(1),
	/* RFC 2960 Section 3.3.2.1 */
	SCTP_PARAM_IPV4_ADDRESS = cpu_to_be16(5),
	SCTP_PARAM_IPV6_ADDRESS = cpu_to_be16(6),
	SCTP_PARAM_STATE_COOKIE = cpu_to_be16(7),
	SCTP_PARAM_UNRECOGNIZED_PARAMETERS = cpu_to_be16(8),
	SCTP_PARAM_COOKIE_PRESERVATIVE = cpu_to_be16(9),
	SCTP_PARAM_HOST_NAME_ADDRESS = cpu_to_be16(11),
	SCTP_PARAM_SUPPORTED_ADDRESS_TYPES = cpu_to_be16(12),
	SCTP_PARAM_ECN_CAPABLE = cpu_to_be16(0x8000),

	/* AUTH Extension Section 3 */
	SCTP_PARAM_RANDOM = cpu_to_be16(0x8002),
	SCTP_PARAM_CHUNKS = cpu_to_be16(0x8003),
	SCTP_PARAM_HMAC_ALGO = cpu_to_be16(0x8004),

	/* Add-IP: Supported Extensions, Section 4.2 */
	SCTP_PARAM_SUPPORTED_EXT = cpu_to_be16(0x8008),

	/* PR-SCTP Sec 3.1 */
	SCTP_PARAM_FWD_TSN_SUPPORT = cpu_to_be16(0xc000),

	/* Add-IP Extension. Section 3.2 */
	SCTP_PARAM_ADD_IP = cpu_to_be16(0xc001),
	SCTP_PARAM_DEL_IP = cpu_to_be16(0xc002),
	SCTP_PARAM_ERR_CAUSE = cpu_to_be16(0xc003),
	SCTP_PARAM_SET_PRIMARY = cpu_to_be16(0xc004),
	SCTP_PARAM_SUCCESS_REPORT = cpu_to_be16(0xc005),
	SCTP_PARAM_ADAPTATION_LAYER_IND = cpu_to_be16(0xc006),

	/* RE-CONFIG. Section 4 */
	SCTP_PARAM_RESET_OUT_REQUEST = cpu_to_be16(0x000d),
	SCTP_PARAM_RESET_IN_REQUEST = cpu_to_be16(0x000e),
	SCTP_PARAM_RESET_TSN_REQUEST = cpu_to_be16(0x000f),
	SCTP_PARAM_RESET_RESPONSE = cpu_to_be16(0x0010),
	SCTP_PARAM_RESET_ADD_OUT_STREAMS = cpu_to_be16(0x0011),
	SCTP_PARAM_RESET_ADD_IN_STREAMS = cpu_to_be16(0x0012),
}; /* enum */

/* RFC 2960 Section 3.2.1
 *  The Parameter Types are encoded such that the highest-order two bits
 *  specify the action that must be taken if the processing endpoint does
 *  not recognize the Parameter Type.
 *
 */
enum {
	SCTP_PARAM_ACTION_DISCARD = cpu_to_be16(0x0000),
	SCTP_PARAM_ACTION_DISCARD_ERR = cpu_to_be16(0x4000),
	SCTP_PARAM_ACTION_SKIP = cpu_to_be16(0x8000),
	SCTP_PARAM_ACTION_SKIP_ERR = cpu_to_be16(0xc000),
};

enum { SCTP_PARAM_ACTION_MASK = cpu_to_be16(0xc000), };

/* RFC 2960 Section 3.3.1 Payload Data (DATA) (0) */

struct sctp_datahdr {
	__be32 tsn;
	__be16 stream;
	__be16 ssn;
	__u32 ppid;
};

struct sctp_data_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_datahdr data_hdr;
};

struct sctp_idatahdr {
	__be32 tsn;
	__be16 stream;
	__be16 reserved;
	__be32 mid;
	union {
		__u32 ppid;
		__be32 fsn;
	};
};

struct sctp_idata_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_idatahdr data_hdr;
};

/* DATA Chunk Specific Flags */
enum {
	SCTP_DATA_MIDDLE_FRAG = 0x00,
	SCTP_DATA_LAST_FRAG = 0x01,
	SCTP_DATA_FIRST_FRAG = 0x02,
	SCTP_DATA_NOT_FRAG = 0x03,
	SCTP_DATA_UNORDERED = 0x04,
	SCTP_DATA_SACK_IMM = 0x08,
};
enum { SCTP_DATA_FRAG_MASK = 0x03, };


/* RFC 2960 Section 3.3.2 Initiation (INIT) (1)
 *
 *  This chunk is used to initiate a SCTP association between two
 *  endpoints.
 */
struct sctp_inithdr {
	__be32 init_tag;
	__be32 a_rwnd;
	__be16 num_outbound_streams;
	__be16 num_inbound_streams;
	__be32 initial_tsn;
	/* __u8 params[]; */
};

struct sctp_init_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_inithdr init_hdr;
};


/* Section 3.3.2.1. IPv4 Address Parameter (5) */
struct sctp_ipv4addr_param {
	struct sctp_paramhdr param_hdr;
	struct in_addr addr;
};

/* Section 3.3.2.1. IPv6 Address Parameter (6) */
struct sctp_ipv6addr_param {
	struct sctp_paramhdr param_hdr;
	struct in6_addr addr;
};

/* Section 3.3.2.1 Cookie Preservative (9) */
struct sctp_cookie_preserve_param {
	struct sctp_paramhdr param_hdr;
	__be32 lifespan_increment;
};

/* Section 3.3.2.1 Host Name Address (11) */
struct sctp_hostname_param {
	struct sctp_paramhdr param_hdr;
	uint8_t hostname[];
};

/* Section 3.3.2.1 Supported Address Types (12) */
struct sctp_supported_addrs_param {
	struct sctp_paramhdr param_hdr;
	__be16 types[];
};

/* ADDIP Section 3.2.6 Adaptation Layer Indication */
struct sctp_adaptation_ind_param {
	struct sctp_paramhdr param_hdr;
	__be32 adaptation_ind;
};

/* ADDIP Section 4.2.7 Supported Extensions Parameter */
struct sctp_supported_ext_param {
	struct sctp_paramhdr param_hdr;
	__u8 chunks[];
};

/* AUTH Section 3.1 Random */
struct sctp_random_param {
	struct sctp_paramhdr param_hdr;
	__u8 random_val[];
};

/* AUTH Section 3.2 Chunk List */
struct sctp_chunks_param {
	struct sctp_paramhdr param_hdr;
	__u8 chunks[];
};

/* AUTH Section 3.3 HMAC Algorithm */
struct sctp_hmac_algo_param {
	struct sctp_paramhdr param_hdr;
	__be16 hmac_ids[];
};

/* RFC 2960.  Section 3.3.3 Initiation Acknowledgement (INIT ACK) (2):
 *   The INIT ACK chunk is used to acknowledge the initiation of an SCTP
 *   association.
 */
struct sctp_initack_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_inithdr init_hdr;
};

/* Section 3.3.3.1 State Cookie (7) */
struct sctp_cookie_param {
	struct sctp_paramhdr p;
	__u8 body[];
};

/* Section 3.3.3.1 Unrecognized Parameters (8) */
struct sctp_unrecognized_param {
	struct sctp_paramhdr param_hdr;
	struct sctp_paramhdr unrecognized;
};



/*
 * 3.3.4 Selective Acknowledgement (SACK) (3):
 *
 *  This chunk is sent to the peer endpoint to acknowledge received DATA
 *  chunks and to inform the peer endpoint of gaps in the received
 *  subsequences of DATA chunks as represented by their TSNs.
 */

struct sctp_gap_ack_block {
	__be16 start;
	__be16 end;
};

union sctp_sack_variable {
	struct sctp_gap_ack_block gab;
	__be32 dup;
};

struct sctp_sackhdr {
	__be32 cum_tsn_ack;
	__be32 a_rwnd;
	__be16 num_gap_ack_blocks;
	__be16 num_dup_tsns;
	/* union sctp_sack_variable variable[]; */
};

struct sctp_sack_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_sackhdr sack_hdr;
};


/* RFC 2960.  Section 3.3.5 Heartbeat Request (HEARTBEAT) (4):
 *
 *  An endpoint should send this chunk to its peer endpoint to probe the
 *  reachability of a particular destination transport address defined in
 *  the present association.
 */

struct sctp_heartbeathdr {
	struct sctp_paramhdr info;
};

struct sctp_heartbeat_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_heartbeathdr hb_hdr;
};


/* PAD chunk could be bundled with heartbeat chunk to probe pmtu */
struct sctp_pad_chunk {
	struct sctp_chunkhdr uh;
};


/* For the abort and shutdown ACK we must carry the init tag in the
 * common header. Just the common header is all that is needed with a
 * chunk descriptor.
 */
struct sctp_abort_chunk {
	struct sctp_chunkhdr uh;
};


/* For the graceful shutdown we must carry the tag (in common header)
 * and the highest consecutive acking value.
 */
struct sctp_shutdownhdr {
	__be32 cum_tsn_ack;
};

struct sctp_shutdown_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_shutdownhdr shutdown_hdr;
};

/* RFC 2960.  Section 3.3.10 Operation Error (ERROR) (9) */

struct sctp_errhdr {
	__be16 cause;
	__be16 length;
	/* __u8 variable[]; */
};

struct sctp_operr_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_errhdr err_hdr;
};

/* RFC 2960 3.3.10 - Operation Error
 *
 * Cause Code: 16 bits (unsigned integer)
 *
 *     Defines the type of error conditions being reported.
 *    Cause Code
 *     Value           Cause Code
 *     ---------      ----------------
 *      1              Invalid Stream Identifier
 *      2              Missing Mandatory Parameter
 *      3              Stale Cookie Error
 *      4              Out of Resource
 *      5              Unresolvable Address
 *      6              Unrecognized Chunk Type
 *      7              Invalid Mandatory Parameter
 *      8              Unrecognized Parameters
 *      9              No User Data
 *     10              Cookie Received While Shutting Down
 */
enum sctp_error {

	SCTP_ERROR_NO_ERROR = cpu_to_be16(0x00),
	SCTP_ERROR_INV_STRM = cpu_to_be16(0x01),
	SCTP_ERROR_MISS_PARAM = cpu_to_be16(0x02),
	SCTP_ERROR_STALE_COOKIE = cpu_to_be16(0x03),
	SCTP_ERROR_NO_RESOURCE = cpu_to_be16(0x04),
	SCTP_ERROR_DNS_FAILED = cpu_to_be16(0x05),
	SCTP_ERROR_UNKNOWN_CHUNK = cpu_to_be16(0x06),
	SCTP_ERROR_INV_PARAM = cpu_to_be16(0x07),
	SCTP_ERROR_UNKNOWN_PARAM = cpu_to_be16(0x08),
	SCTP_ERROR_NO_DATA = cpu_to_be16(0x09),
	SCTP_ERROR_COOKIE_IN_SHUTDOWN = cpu_to_be16(0x0a),


	/* SCTP Implementation Guide:
	 *  11  Restart of an association with new addresses
	 *  12  User Initiated Abort
	 *  13  Protocol Violation
	 *  14  Restart of an Association with New Encapsulation Port
	 */

	SCTP_ERROR_RESTART = cpu_to_be16(0x0b),
	SCTP_ERROR_USER_ABORT = cpu_to_be16(0x0c),
	SCTP_ERROR_PROTO_VIOLATION = cpu_to_be16(0x0d),
	SCTP_ERROR_NEW_ENCAP_PORT = cpu_to_be16(0x0e),

	/* ADDIP Section 3.3  New Error Causes
	 *
	 * Four new Error Causes are added to the SCTP Operational Errors,
	 * primarily for use in the ASCONF-ACK chunk.
	 *
	 * Value          Cause Code
	 * ---------      ----------------
	 * 0x00A0          Request to Delete Last Remaining IP Address.
	 * 0x00A1          Operation Refused Due to Resource Shortage.
	 * 0x00A2          Request to Delete Source IP Address.
	 * 0x00A3          Association Aborted due to illegal ASCONF-ACK
	 * 0x00A4          Request refused - no authorization.
	 */
	SCTP_ERROR_DEL_LAST_IP = cpu_to_be16(0x00A0),
	SCTP_ERROR_RSRC_LOW = cpu_to_be16(0x00A1),
	SCTP_ERROR_DEL_SRC_IP = cpu_to_be16(0x00A2),
	SCTP_ERROR_ASCONF_ACK = cpu_to_be16(0x00A3),
	SCTP_ERROR_REQ_REFUSED = cpu_to_be16(0x00A4),

	/* AUTH Section 4.  New Error Cause
	 *
	 * This section defines a new error cause that will be sent if an AUTH
	 * chunk is received with an unsupported HMAC identifier.
	 * The table below illustrates the new error cause.
	 *
	 * Cause Code      Error Cause Name
	 * --------------------------------------------------------------
	 * 0x0105          Unsupported HMAC Identifier
	 */
	SCTP_ERROR_UNSUP_HMAC = cpu_to_be16(0x0105)
};



/* RFC 2960.  Appendix A.  Explicit Congestion Notification.
 *   Explicit Congestion Notification Echo (ECNE) (12)
 */
struct sctp_ecnehdr {
	__be32 lowest_tsn;
};

struct sctp_ecne_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_ecnehdr ence_hdr;
};

/* RFC 2960.  Appendix A.  Explicit Congestion Notification.
 *   Congestion Window Reduced (CWR) (13)
 */
struct sctp_cwrhdr {
	__be32 lowest_tsn;
};

/* PR-SCTP
 * 3.2 Forward Cumulative TSN Chunk Definition (FORWARD TSN)
 *
 * Forward Cumulative TSN chunk has the following format:
 *
 *        0                   1                   2                   3
 *        0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *      |   Type = 192  |  Flags = 0x00 |        Length = Variable      |
 *      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *      |                      New Cumulative TSN                       |
 *      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *      |         Stream-1              |       Stream Sequence-1       |
 *      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *      \                                                               /
 *      /                                                               \
 *      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *      |         Stream-N              |       Stream Sequence-N       |
 *      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *      Chunk Flags:
 *
 *        Set to all zeros on transmit and ignored on receipt.
 *
 *      New Cumulative TSN: 32 bit u_int
 *
 *       This indicates the new cumulative TSN to the data receiver. Upon
 *       the reception of this value, the data receiver MUST consider
 *       any missing TSNs earlier than or equal to this value as received
 *       and stop reporting them as gaps in any subsequent SACKs.
 *
 *      Stream-N: 16 bit u_int
 *
 *       This field holds a stream number that was skipped by this
 *       FWD-TSN.
 *
 *      Stream Sequence-N: 16 bit u_int
 *       This field holds the sequence number associated with the stream
 *       that was skipped. The stream sequence field holds the largest stream
 *       sequence number in this stream being skipped. The receiver of
 *       the FWD-TSN's can use the Stream-N and Stream Sequence-N fields
 *       to enable delivery of any stranded TSN's that remain on the stream
 *       re-ordering queues. This field MUST NOT report TSN's corresponding
 *       to DATA chunk that are marked as unordered. For ordered DATA
 *       chunks this field MUST be filled in.
 */
struct sctp_fwdtsn_skip {
	__be16 stream;
	__be16 ssn;
};

struct sctp_fwdtsn_hdr {
	__be32 new_cum_tsn;
	/* struct sctp_fwdtsn_skip skip[]; */
};

struct sctp_fwdtsn_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_fwdtsn_hdr fwdtsn_hdr;
};

struct sctp_ifwdtsn_skip {
	__be16 stream;
	__u8 reserved;
	__u8 flags;
	__be32 mid;
};

struct sctp_ifwdtsn_hdr {
	__be32 new_cum_tsn;
	/* struct sctp_ifwdtsn_skip skip[]; */
};

struct sctp_ifwdtsn_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_ifwdtsn_hdr fwdtsn_hdr;
};

/* ADDIP
 * Section 3.1.1 Address Configuration Change Chunk (ASCONF)
 *
 *	Serial Number: 32 bits (unsigned integer)
 *	This value represents a Serial Number for the ASCONF Chunk. The
 *	valid range of Serial Number is from 0 to 2^32-1.
 *	Serial Numbers wrap back to 0 after reaching 2^32 -1.
 *
 *	Address Parameter: 8 or 20 bytes (depending on type)
 *	The address is an address of the sender of the ASCONF chunk,
 *	the address MUST be considered part of the association by the
 *	peer endpoint. This field may be used by the receiver of the
 *	ASCONF to help in finding the association. This parameter MUST
 *	be present in every ASCONF message i.e. it is a mandatory TLV
 *	parameter.
 *
 *	ASCONF Parameter: TLV format
 *	Each Address configuration change is represented by a TLV
 *	parameter as defined in Section 3.2. One or more requests may
 *	be present in an ASCONF Chunk.
 *
 * Section 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK)
 *
 *	Serial Number: 32 bits (unsigned integer)
 *	This value represents the Serial Number for the received ASCONF
 *	Chunk that is acknowledged by this chunk. This value is copied
 *	from the received ASCONF Chunk.
 *
 *	ASCONF Parameter Response: TLV format
 *	The ASCONF Parameter Response is used in the ASCONF-ACK to
 *	report status of ASCONF processing.
 */
struct sctp_addip_param {
	struct sctp_paramhdr param_hdr;
	__be32 crr_id;
};

struct sctp_addiphdr {
	__be32 serial;
	/* __u8 params[]; */
};

struct sctp_addip_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_addiphdr addip_hdr;
};

/* AUTH
 * Section 4.1 Authentication Chunk (AUTH)
 *
 *   This chunk is used to hold the result of the HMAC calculation.
 *
 *    0                   1                   2                   3
 *    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *   | Type = 0x0F   |   Flags=0     |             Length            |
 *   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *   |     Shared Key Identifier     |   HMAC Identifier             |
 *   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *   |                                                               |
 *   \                             HMAC                              /
 *   /                                                               \
 *   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *   Type: 1 byte (unsigned integer)
 *	This value MUST be set to 0x0F for all AUTH-chunks.
 *
 *   Flags: 1 byte (unsigned integer)
 *	Set to zero on transmit and ignored on receipt.
 *
 *   Length: 2 bytes (unsigned integer)
 *	This value holds the length of the HMAC in bytes plus 8.
 *
 *   Shared Key Identifier: 2 bytes (unsigned integer)
 *	This value describes which endpoint pair shared key is used.
 *
 *   HMAC Identifier: 2 bytes (unsigned integer)
 *	This value describes which message digest is being used. Table 2
 *	shows the currently defined values.
 *
 *    The following Table 2 shows the currently defined values for HMAC
 *    identifiers.
 *
 *	 +-----------------+--------------------------+
 *	 | HMAC Identifier | Message Digest Algorithm |
 *	 +-----------------+--------------------------+
 *	 | 0               | Reserved                 |
 *	 | 1               | SHA-1 defined in [8]     |
 *	 | 2               | Reserved                 |
 *	 | 3               | SHA-256 defined in [8]   |
 *	 +-----------------+--------------------------+
 *
 *
 *   HMAC: n bytes (unsigned integer) This holds the result of the HMAC
 *      calculation.
 */
struct sctp_authhdr {
	__be16 shkey_id;
	__be16 hmac_id;
	/* __u8 hmac[]; */
};

struct sctp_auth_chunk {
	struct sctp_chunkhdr chunk_hdr;
	struct sctp_authhdr auth_hdr;
};

struct sctp_infox {
	struct sctp_info *sctpinfo;
	struct sctp_association *asoc;
};

struct sctp_reconf_chunk {
	struct sctp_chunkhdr chunk_hdr;
	/* __u8 params[]; */
};

struct sctp_strreset_outreq {
	struct sctp_paramhdr param_hdr;
	__be32 request_seq;
	__be32 response_seq;
	__be32 send_reset_at_tsn;
	__be16 list_of_streams[];
};

struct sctp_strreset_inreq {
	struct sctp_paramhdr param_hdr;
	__be32 request_seq;
	__be16 list_of_streams[];
};

struct sctp_strreset_tsnreq {
	struct sctp_paramhdr param_hdr;
	__be32 request_seq;
};

struct sctp_strreset_addstrm {
	struct sctp_paramhdr param_hdr;
	__be32 request_seq;
	__be16 number_of_streams;
	__be16 reserved;
};

enum {
	SCTP_STRRESET_NOTHING_TO_DO = 0x00,
	SCTP_STRRESET_PERFORMED = 0x01,
	SCTP_STRRESET_DENIED = 0x02,
	SCTP_STRRESET_ERR_WRONG_SSN = 0x03,
	SCTP_STRRESET_ERR_IN_PROGRESS = 0x04,
	SCTP_STRRESET_ERR_BAD_SEQNO = 0x05,
	SCTP_STRRESET_IN_PROGRESS = 0x06,
};

struct sctp_strreset_resp {
	struct sctp_paramhdr param_hdr;
	__be32 response_seq;
	__be32 result;
};

struct sctp_strreset_resptsn {
	struct sctp_paramhdr param_hdr;
	__be32 response_seq;
	__be32 result;
	__be32 senders_next_tsn;
	__be32 receivers_next_tsn;
};

enum {
	SCTP_DSCP_SET_MASK = 0x1,
	SCTP_DSCP_VAL_MASK = 0xfc,
	SCTP_FLOWLABEL_SET_MASK = 0x100000,
	SCTP_FLOWLABEL_VAL_MASK = 0xfffff
};

/* UDP Encapsulation
 * draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.html#section-4-4
 *
 *   The error cause indicating a "Restart of an Association with
 *   New Encapsulation Port"
 *
 * 0                   1                   2                   3
 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |        Cause Code = 14        |       Cause Length = 8        |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |   Current Encapsulation Port  |     New Encapsulation Port    |
 * +-------------------------------+-------------------------------+
 */
struct sctp_new_encap_port_hdr {
	__be16 cur_port;
	__be16 new_port;
};

/* Round an int up to the next multiple of 4.  */
#define SCTP_PAD4(s) (((s)+3)&~3)
/* Truncate to the previous multiple of 4.  */
#define SCTP_TRUNC4(s) ((s)&~3)

#endif /* __LINUX_SCTP_H__ */
39 32 43 43 28 28 38 13 25 32 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor security identifier (secid) manipulation fns * * Copyright 2009-2017 Canonical Ltd. * * AppArmor allocates a unique secid for every label used. If a label * is replaced it receives the secid of the label it is replacing. */ #include <linux/errno.h> #include <linux/err.h> #include <linux/gfp.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/xarray.h> #include "include/cred.h" #include "include/lib.h" #include "include/secid.h" #include "include/label.h" #include "include/policy_ns.h" /* * secids - do not pin labels with a refcount. They rely on the label * properly updating/freeing them */ #define AA_FIRST_SECID 2 static DEFINE_XARRAY_FLAGS(aa_secids, XA_FLAGS_LOCK_IRQ | XA_FLAGS_TRACK_FREE); int apparmor_display_secid_mode; /* * TODO: allow policy to reserve a secid range? 
* TODO: add secid pinning * TODO: use secid_update in label replace */ /* * see label for inverse aa_label_to_secid */ struct aa_label *aa_secid_to_label(u32 secid) { return xa_load(&aa_secids, secid); } static int apparmor_label_to_secctx(struct aa_label *label, struct lsm_context *cp) { /* TODO: cache secctx and ref count so we don't have to recreate */ int flags = FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED | FLAG_ABS_ROOT; int len; if (!label) return -EINVAL; if (apparmor_display_secid_mode) flags |= FLAG_SHOW_MODE; if (cp) len = aa_label_asxprint(&cp->context, root_ns, label, flags, GFP_ATOMIC); else len = aa_label_snxprint(NULL, 0, root_ns, label, flags); if (len < 0) return -ENOMEM; if (cp) { cp->len = len; cp->id = LSM_ID_APPARMOR; } return len; } int apparmor_secid_to_secctx(u32 secid, struct lsm_context *cp) { struct aa_label *label = aa_secid_to_label(secid); return apparmor_label_to_secctx(label, cp); } int apparmor_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp) { struct aa_label *label; label = prop->apparmor.label; return apparmor_label_to_secctx(label, cp); } int apparmor_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { struct aa_label *label; label = aa_label_strn_parse(&root_ns->unconfined->label, secdata, seclen, GFP_KERNEL, false, false); if (IS_ERR(label)) return PTR_ERR(label); *secid = label->secid; return 0; } void apparmor_release_secctx(struct lsm_context *cp) { if (cp->id == LSM_ID_APPARMOR) { kfree(cp->context); cp->context = NULL; cp->id = LSM_ID_UNDEF; } } /** * aa_alloc_secid - allocate a new secid for a profile * @label: the label to allocate a secid for * @gfp: memory allocation flags * * Returns: 0 with @label->secid initialized * <0 returns error with @label->secid set to AA_SECID_INVALID */ int aa_alloc_secid(struct aa_label *label, gfp_t gfp) { unsigned long flags; int ret; xa_lock_irqsave(&aa_secids, flags); ret = __xa_alloc(&aa_secids, &label->secid, label, XA_LIMIT(AA_FIRST_SECID, INT_MAX), gfp); 
xa_unlock_irqrestore(&aa_secids, flags); if (ret < 0) { label->secid = AA_SECID_INVALID; return ret; } return 0; } /** * aa_free_secid - free a secid * @secid: secid to free */ void aa_free_secid(u32 secid) { unsigned long flags; xa_lock_irqsave(&aa_secids, flags); __xa_erase(&aa_secids, secid); xa_unlock_irqrestore(&aa_secids, flags); }
35 34 35 27 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 // SPDX-License-Identifier: GPL-2.0 /* Builtin firmware support */ #include <linux/firmware.h> #include "../firmware.h" /* Only if FW_LOADER=y */ #ifdef CONFIG_FW_LOADER struct builtin_fw { char *name; void *data; unsigned long size; }; extern struct builtin_fw __start_builtin_fw[]; extern struct builtin_fw __end_builtin_fw[]; static bool fw_copy_to_prealloc_buf(struct firmware *fw, void *buf, size_t size) { if (!buf) return true; if (size < fw->size) return false; memcpy(buf, fw->data, fw->size); return true; } /** * firmware_request_builtin() - load builtin firmware * @fw: pointer to firmware struct * @name: name of firmware file * * Some use cases in the kernel have a requirement so that no memory allocator * is involved as these calls take place early in boot process. An example is * the x86 CPU microcode loader. In these cases all the caller wants is to see * if the firmware was built-in and if so use it right away. This can be used * for such cases. * * This looks for the firmware in the built-in kernel. Only if the kernel was * built-in with the firmware you are looking for will this return successfully. * * Callers of this API do not need to use release_firmware() as the pointer to * the firmware is expected to be provided locally on the stack of the caller. 
**/ bool firmware_request_builtin(struct firmware *fw, const char *name) { struct builtin_fw *b_fw; if (!fw) return false; for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { if (strcmp(name, b_fw->name) == 0) { fw->size = b_fw->size; fw->data = b_fw->data; return true; } } return false; } EXPORT_SYMBOL_NS_GPL(firmware_request_builtin, "TEST_FIRMWARE"); /** * firmware_request_builtin_buf() - load builtin firmware into optional buffer * @fw: pointer to firmware struct * @name: name of firmware file * @buf: If set this lets you use a pre-allocated buffer so that the built-in * firmware into is copied into. This field can be NULL. It is used by * callers such as request_firmware_into_buf() and * request_partial_firmware_into_buf() * @size: if buf was provided, the max size of the allocated buffer available. * If the built-in firmware does not fit into the pre-allocated @buf this * call will fail. * * This looks for the firmware in the built-in kernel. Only if the kernel was * built-in with the firmware you are looking for will this call possibly * succeed. If you passed a @buf the firmware will be copied into it *iff* the * built-in firmware fits into the pre-allocated buffer size specified in * @size. * * This caller is to be used internally by the firmware_loader only. **/ bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size) { if (!firmware_request_builtin(fw, name)) return false; return fw_copy_to_prealloc_buf(fw, buf, size); } bool firmware_is_builtin(const struct firmware *fw) { struct builtin_fw *b_fw; for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) if (fw->data == b_fw->data) return true; return false; } #endif
14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 // SPDX-License-Identifier: GPL-2.0 /* * Implement the default iomap interfaces * * (C) Copyright 2004 Linus Torvalds */ #include <linux/pci.h> #include <linux/io.h> #include <linux/kmsan-checks.h> #include <linux/export.h> /* * Read/write from/to an (offsettable) iomem cookie. It might be a PIO * access or a MMIO access, these functions don't care. 
The info is
 * encoded in the hardware mapping set up by the mapping functions
 * (or the cookie itself, depending on implementation and hw).
 *
 * The generic routines don't assume any hardware mappings, and just
 * encode the PIO/MMIO as part of the cookie. They coldly assume that
 * the MMIO IO mappings are not in the low address range.
 *
 * Architectures for which this is not true can't use this generic
 * implementation and should do their own copy.
 */

#ifndef HAVE_ARCH_PIO_SIZE
/*
 * We encode the physical PIO addresses (0-0xffff) into the
 * pointer by offsetting them with a constant (0x10000) and
 * assuming that all the low addresses are always PIO. That means
 * we can do some sanity checks on the low bits, and don't
 * need to just take things for granted.
 */
#define PIO_OFFSET	0x10000UL
#define PIO_MASK	0x0ffffUL
#define PIO_RESERVED	0x40000UL
#endif

/*
 * Report an access through a cookie that IO_COND classified as neither
 * PIO nor MMIO.  The static counter limits the warning to the first ten
 * calls; later calls are silent.
 */
static void bad_io_access(unsigned long port, const char *access)
{
	static int count = 10;
	if (count) {
		count--;
		WARN(1, KERN_ERR "Bad IO access at port %#lx (%s)\n", port, access);
	}
}

/*
 * Ugly macros are a way of life.
 *
 * IO_COND() dispatches on the numeric value of the cookie "addr":
 *   - values >= PIO_RESERVED run the "is_mmio" statement;
 *   - values strictly above PIO_OFFSET (and below PIO_RESERVED) are
 *     masked with PIO_MASK into the local "port" and run "is_pio";
 *   - anything else is reported through bad_io_access(), with the
 *     stringified "is_pio" argument as the description.
 * The macro declares a local variable named "port", which the "is_pio"
 * argument is expected to reference.
 */
#define IO_COND(addr, is_pio, is_mmio) do {			\
	unsigned long port = (unsigned long __force)addr;	\
	if (port >= PIO_RESERVED) {				\
		is_mmio;					\
	} else if (port > PIO_OFFSET) {				\
		port &= PIO_MASK;				\
		is_pio;						\
	} else							\
		bad_io_access(port, #is_pio );			\
} while (0)

/* Default big-endian PIO reads: byte-swap the result of inw()/inl(). */
#ifndef pio_read16be
#define pio_read16be(port) swab16(inw(port))
#define pio_read32be(port) swab32(inl(port))
#endif

/* Default big-endian MMIO reads: byte-swap the result of readw()/readl()/readq(). */
#ifndef mmio_read16be
#define mmio_read16be(addr) swab16(readw(addr))
#define mmio_read32be(addr) swab32(readl(addr))
#define mmio_read64be(addr) swab64(readq(addr))
#endif

/*
 * Here and below, we apply __no_kmsan_checks to functions reading data from
 * hardware, to ensure that KMSAN marks their return values as initialized.
 */

/*
 * Read 8 bits through an iomap cookie.  Both valid IO_COND branches
 * return from inside the macro; the trailing all-ones return is reached
 * only after the bad-access branch.
 */
__no_kmsan_checks
unsigned int ioread8(const void __iomem *addr)
{
	IO_COND(addr, return inb(port), return readb(addr));
	return 0xff;
}
/* 16-bit variant of ioread8(); returns 0xffff after a bad access. */
__no_kmsan_checks
unsigned int ioread16(const void __iomem *addr)
{
	IO_COND(addr, return inw(port), return readw(addr));
	return 0xffff;
}
/* 16-bit read using the *_read16be helpers; returns 0xffff after a bad access. */
__no_kmsan_checks
unsigned int ioread16be(const void __iomem *addr)
{
	IO_COND(addr, return pio_read16be(port), return mmio_read16be(addr));
	return 0xffff;
}
/* 32-bit variant of ioread8(); returns 0xffffffff after a bad access. */
__no_kmsan_checks
unsigned int ioread32(const void __iomem *addr)
{
	IO_COND(addr, return inl(port), return readl(addr));
	return 0xffffffff;
}
/* 32-bit read using the *_read32be helpers; returns 0xffffffff after a bad access. */
__no_kmsan_checks
unsigned int ioread32be(const void __iomem *addr)
{
	IO_COND(addr, return pio_read32be(port), return mmio_read32be(addr));
	return 0xffffffff;
}
EXPORT_SYMBOL(ioread8);
EXPORT_SYMBOL(ioread16);
EXPORT_SYMBOL(ioread16be);
EXPORT_SYMBOL(ioread32);
EXPORT_SYMBOL(ioread32be);

#ifdef CONFIG_64BIT
/*
 * 64-bit PIO reads are composed from two 32-bit port reads.  The
 * _lo_hi/_hi_lo suffix names which half is read first.
 */

/* Read the low word at "port" first, then the high word at port + 4. */
static u64 pio_read64_lo_hi(unsigned long port)
{
	u64 lo, hi;

	lo = inl(port);
	hi = inl(port + sizeof(u32));

	return lo | (hi << 32);
}

/* Read the high word at port + 4 first, then the low word at "port". */
static u64 pio_read64_hi_lo(unsigned long port)
{
	u64 lo, hi;

	hi = inl(port + sizeof(u32));
	lo = inl(port);

	return lo | (hi << 32);
}

/* Big-endian layout: low word lives at port + 4 and is read first. */
static u64 pio_read64be_lo_hi(unsigned long port)
{
	u64 lo, hi;

	lo = pio_read32be(port + sizeof(u32));
	hi = pio_read32be(port);

	return lo | (hi << 32);
}

/* Big-endian layout: high word lives at "port" and is read first. */
static u64 pio_read64be_hi_lo(unsigned long port)
{
	u64 lo, hi;

	hi = pio_read32be(port);
	lo = pio_read32be(port + sizeof(u32));

	return lo | (hi << 32);
}

/*
 * 64-bit cookie reads.  The MMIO branch is a single readq() (or
 * mmio_read64be()) in every variant; only the PIO branch differs in the
 * order of the two 32-bit accesses.  All-ones is returned after a bad
 * access.
 */
__no_kmsan_checks
u64 __ioread64_lo_hi(const void __iomem *addr)
{
	IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr));
	return 0xffffffffffffffffULL;
}

__no_kmsan_checks
u64 __ioread64_hi_lo(const void __iomem *addr)
{
	IO_COND(addr, return pio_read64_hi_lo(port), return readq(addr));
	return 0xffffffffffffffffULL;
}

__no_kmsan_checks
u64 __ioread64be_lo_hi(const void __iomem *addr)
{
	IO_COND(addr, return pio_read64be_lo_hi(port),
		return mmio_read64be(addr));
	return 0xffffffffffffffffULL;
}

__no_kmsan_checks
u64 __ioread64be_hi_lo(const void __iomem *addr)
{
	IO_COND(addr, return pio_read64be_hi_lo(port),
		return mmio_read64be(addr));
	return 0xffffffffffffffffULL;
}

EXPORT_SYMBOL(__ioread64_lo_hi);
EXPORT_SYMBOL(__ioread64_hi_lo);
EXPORT_SYMBOL(__ioread64be_lo_hi);
EXPORT_SYMBOL(__ioread64be_hi_lo);

#endif /* CONFIG_64BIT */

/* Default big-endian PIO writes: byte-swap the value before outw()/outl(). */
#ifndef pio_write16be
#define pio_write16be(val,port) outw(swab16(val),port)
#define pio_write32be(val,port) outl(swab32(val),port)
#endif

/* Default big-endian MMIO writes: byte-swap the value before writew()/writel()/writeq(). */
#ifndef mmio_write16be
#define mmio_write16be(val,port) writew(swab16(val),port)
#define mmio_write32be(val,port) writel(swab32(val),port)
#define mmio_write64be(val,port) writeq(swab64(val),port)
#endif

/*
 * Single-value cookie writes.  Each one has KMSAN check the value and
 * then dispatches through IO_COND to the port or MMIO accessor of the
 * matching width.  A bad cookie results only in the bad_io_access()
 * report; nothing is written.
 */
void iowrite8(u8 val, void __iomem *addr)
{
	/* Make sure uninitialized memory isn't copied to devices. */
	kmsan_check_memory(&val, sizeof(val));
	IO_COND(addr, outb(val,port), writeb(val, addr));
}
void iowrite16(u16 val, void __iomem *addr)
{
	/* Make sure uninitialized memory isn't copied to devices. */
	kmsan_check_memory(&val, sizeof(val));
	IO_COND(addr, outw(val,port), writew(val, addr));
}
void iowrite16be(u16 val, void __iomem *addr)
{
	/* Make sure uninitialized memory isn't copied to devices. */
	kmsan_check_memory(&val, sizeof(val));
	IO_COND(addr, pio_write16be(val,port), mmio_write16be(val, addr));
}
void iowrite32(u32 val, void __iomem *addr)
{
	/* Make sure uninitialized memory isn't copied to devices. */
	kmsan_check_memory(&val, sizeof(val));
	IO_COND(addr, outl(val,port), writel(val, addr));
}
void iowrite32be(u32 val, void __iomem *addr)
{
	/* Make sure uninitialized memory isn't copied to devices. */
	kmsan_check_memory(&val, sizeof(val));
	IO_COND(addr, pio_write32be(val,port), mmio_write32be(val, addr));
}
EXPORT_SYMBOL(iowrite8);
EXPORT_SYMBOL(iowrite16);
EXPORT_SYMBOL(iowrite16be);
EXPORT_SYMBOL(iowrite32);
EXPORT_SYMBOL(iowrite32be);

#ifdef CONFIG_64BIT
/*
 * 64-bit PIO writes are split into two 32-bit port writes; the suffix
 * names which half goes out first.
 */

/* Write the low word to "port" first, then the high word to port + 4. */
static void pio_write64_lo_hi(u64 val, unsigned long port)
{
	outl(val, port);
	outl(val >> 32, port + sizeof(u32));
}

/* Write the high word to port + 4 first, then the low word to "port". */
static void pio_write64_hi_lo(u64 val, unsigned long port)
{
	outl(val >> 32, port + sizeof(u32));
	outl(val, port);
}

/* Big-endian layout: low word goes to port + 4 and is written first. */
static void pio_write64be_lo_hi(u64 val, unsigned long port)
{
	pio_write32be(val, port + sizeof(u32));
	pio_write32be(val >> 32, port);
}

/* Big-endian layout: high word goes to "port" and is written first. */
static void pio_write64be_hi_lo(u64 val, unsigned long port)
{
	pio_write32be(val >> 32, port);
	pio_write32be(val, port + sizeof(u32));
}

/*
 * 64-bit cookie writes.  The MMIO branch is a single writeq() (or
 * mmio_write64be()); only the PIO branch differs between variants.
 */
void __iowrite64_lo_hi(u64 val, void __iomem *addr)
{
	/* Make sure uninitialized memory isn't copied to devices. */
	kmsan_check_memory(&val, sizeof(val));
	IO_COND(addr, pio_write64_lo_hi(val, port),
		writeq(val, addr));
}

void __iowrite64_hi_lo(u64 val, void __iomem *addr)
{
	/* Make sure uninitialized memory isn't copied to devices. */
	kmsan_check_memory(&val, sizeof(val));
	IO_COND(addr, pio_write64_hi_lo(val, port),
		writeq(val, addr));
}

void __iowrite64be_lo_hi(u64 val, void __iomem *addr)
{
	/* Make sure uninitialized memory isn't copied to devices. */
	kmsan_check_memory(&val, sizeof(val));
	IO_COND(addr, pio_write64be_lo_hi(val, port),
		mmio_write64be(val, addr));
}

void __iowrite64be_hi_lo(u64 val, void __iomem *addr)
{
	/* Make sure uninitialized memory isn't copied to devices. */
	kmsan_check_memory(&val, sizeof(val));
	IO_COND(addr, pio_write64be_hi_lo(val, port),
		mmio_write64be(val, addr));
}

EXPORT_SYMBOL(__iowrite64_lo_hi);
EXPORT_SYMBOL(__iowrite64_hi_lo);
EXPORT_SYMBOL(__iowrite64be_lo_hi);
EXPORT_SYMBOL(__iowrite64be_hi_lo);

#endif /* CONFIG_64BIT */

/*
 * These are the "repeat MMIO read/write" functions.
 * Note the "__raw" accesses, since we don't want to
 * convert to CPU byte order. We write in "IO byte
 * order" (we also don't have IO barriers).
 */
#ifndef mmio_insb
/*
 * Each mmio_ins*() helper reads "count" items from the same MMIO
 * address and stores them into consecutive elements of "dst".
 */
static inline void mmio_insb(const void __iomem *addr, u8 *dst, int count)
{
	while (--count >= 0) {
		u8 data = __raw_readb(addr);
		*dst = data;
		dst++;
	}
}
static inline void mmio_insw(const void __iomem *addr, u16 *dst, int count)
{
	while (--count >= 0) {
		u16 data = __raw_readw(addr);
		*dst = data;
		dst++;
	}
}
static inline void mmio_insl(const void __iomem *addr, u32 *dst, int count)
{
	while (--count >= 0) {
		u32 data = __raw_readl(addr);
		*dst = data;
		dst++;
	}
}
#endif

#ifndef mmio_outsb
/*
 * Each mmio_outs*() helper writes "count" consecutive elements of
 * "src" to the same MMIO address.
 */
static inline void mmio_outsb(void __iomem *addr, const u8 *src, int count)
{
	while (--count >= 0) {
		__raw_writeb(*src, addr);
		src++;
	}
}
static inline void mmio_outsw(void __iomem *addr, const u16 *src, int count)
{
	while (--count >= 0) {
		__raw_writew(*src, addr);
		src++;
	}
}
static inline void mmio_outsl(void __iomem *addr, const u32 *src, int count)
{
	while (--count >= 0) {
		__raw_writel(*src, addr);
		src++;
	}
}
#endif

/*
 * Repeated reads into a buffer.  "count" is a number of items, so the
 * KMSAN unpoison length is count times the item size (1, 2 or 4).
 * NOTE(review): the mmio_ins*() helpers take "int count" while these
 * entry points take "unsigned long" - the value is narrowed on the
 * MMIO path.
 */
void ioread8_rep(const void __iomem *addr, void *dst, unsigned long count)
{
	IO_COND(addr, insb(port,dst,count), mmio_insb(addr, dst, count));
	/* KMSAN must treat values read from devices as initialized. */
	kmsan_unpoison_memory(dst, count);
}
void ioread16_rep(const void __iomem *addr, void *dst, unsigned long count)
{
	IO_COND(addr, insw(port,dst,count), mmio_insw(addr, dst, count));
	/* KMSAN must treat values read from devices as initialized. */
	kmsan_unpoison_memory(dst, count * 2);
}
void ioread32_rep(const void __iomem *addr, void *dst, unsigned long count)
{
	IO_COND(addr, insl(port,dst,count), mmio_insl(addr, dst, count));
	/* KMSAN must treat values read from devices as initialized. */
	kmsan_unpoison_memory(dst, count * 4);
}
EXPORT_SYMBOL(ioread8_rep);
EXPORT_SYMBOL(ioread16_rep);
EXPORT_SYMBOL(ioread32_rep);

/*
 * Repeated writes from a buffer.  KMSAN checks the whole source range
 * (count times the item size) before anything is sent to the device.
 */
void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count)
{
	/* Make sure uninitialized memory isn't copied to devices. */
	kmsan_check_memory(src, count);
	IO_COND(addr, outsb(port, src, count), mmio_outsb(addr, src, count));
}
void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count)
{
	/* Make sure uninitialized memory isn't copied to devices. */
	kmsan_check_memory(src, count * 2);
	IO_COND(addr, outsw(port, src, count), mmio_outsw(addr, src, count));
}
void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count)
{
	/* Make sure uninitialized memory isn't copied to devices. */
	kmsan_check_memory(src, count * 4);
	IO_COND(addr, outsl(port, src,count), mmio_outsl(addr, src, count));
}
EXPORT_SYMBOL(iowrite8_rep);
EXPORT_SYMBOL(iowrite16_rep);
EXPORT_SYMBOL(iowrite32_rep);

#ifdef CONFIG_HAS_IOPORT_MAP
/*
 * Create a virtual mapping cookie for an IO port range.
 * Returns NULL when "port" is above PIO_MASK; otherwise the cookie is
 * simply port + PIO_OFFSET.  "nr" is not used by this implementation.
 */
void __iomem *ioport_map(unsigned long port, unsigned int nr)
{
	if (port > PIO_MASK)
		return NULL;
	return (void __iomem *) (unsigned long) (port + PIO_OFFSET);
}

/* Counterpart of ioport_map(); the cookie holds no resources. */
void ioport_unmap(void __iomem *addr)
{
	/* Nothing to do */
}
EXPORT_SYMBOL(ioport_map);
EXPORT_SYMBOL(ioport_unmap);
#endif /* CONFIG_HAS_IOPORT_MAP */

#ifdef CONFIG_PCI
/* Hide the details if this is a MMIO or PIO address space and just do what
 * you expect in the correct way.
 * PIO cookies need no teardown; MMIO cookies are passed to iounmap().
 * "dev" is not used here. */
void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
{
	IO_COND(addr, /* nothing */, iounmap(addr));
}
EXPORT_SYMBOL(pci_iounmap);
#endif /* CONFIG_PCI */
5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. */ #include "rxe.h" #include "rxe_hw_counters.h" static const struct rdma_stat_desc rxe_counter_descs[] = { [RXE_CNT_SENT_PKTS].name = "sent_pkts", [RXE_CNT_RCVD_PKTS].name = "rcvd_pkts", [RXE_CNT_DUP_REQ].name = "duplicate_request", [RXE_CNT_OUT_OF_SEQ_REQ].name = "out_of_seq_request", [RXE_CNT_RCV_RNR].name = "rcvd_rnr_err", [RXE_CNT_SND_RNR].name = "send_rnr_err", [RXE_CNT_RCV_SEQ_ERR].name = "rcvd_seq_err", [RXE_CNT_SENDER_SCHED].name = "ack_deferred", [RXE_CNT_RETRY_EXCEEDED].name = "retry_exceeded_err", [RXE_CNT_RNR_RETRY_EXCEEDED].name = "retry_rnr_exceeded_err", [RXE_CNT_COMP_RETRY].name = "completer_retry_err", [RXE_CNT_SEND_ERR].name = "send_err", [RXE_CNT_LINK_DOWNED].name = "link_downed", [RXE_CNT_RDMA_SEND].name = "rdma_sends", [RXE_CNT_RDMA_RECV].name = "rdma_recvs", }; int rxe_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port, int index) { struct rxe_dev *dev = to_rdev(ibdev); unsigned int cnt; if (!port || !stats) return -EINVAL; for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_descs); cnt++) stats->value[cnt] = atomic64_read(&dev->stats_counters[cnt]); return ARRAY_SIZE(rxe_counter_descs); } struct rdma_hw_stats *rxe_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num) { BUILD_BUG_ON(ARRAY_SIZE(rxe_counter_descs) != RXE_NUM_OF_COUNTERS); return rdma_alloc_hw_stats_struct(rxe_counter_descs, ARRAY_SIZE(rxe_counter_descs), RDMA_HW_STATS_DEFAULT_LIFESPAN); }
214 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
/* SPDX-License-Identifier: GPL-2.0 */
/*
  File: linux/posix_acl.h

  (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/


#ifndef __LINUX_POSIX_ACL_H
#define __LINUX_POSIX_ACL_H

#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <uapi/linux/posix_acl.h>

struct user_namespace;

/*
 * One ACL entry: a tag, a permission mask, and a uid or gid sharing
 * the same storage through the anonymous union.
 */
struct posix_acl_entry {
	short			e_tag;
	unsigned short		e_perm;
	union {
		kuid_t		e_uid;
		kgid_t		e_gid;
	};
};

/*
 * Reference-counted ACL: a fixed header (also addressable as
 * "struct posix_acl_hdr" / member "hdr") followed by a flexible array
 * of a_count entries.
 */
struct posix_acl {
	/* New members MUST be added within the struct_group() macro below. */
	struct_group_tagged(posix_acl_hdr, hdr,
		refcount_t		a_refcount;
		unsigned int		a_count;
		struct rcu_head		a_rcu;
	);
	struct posix_acl_entry	a_entries[] __counted_by(a_count);
};
/* Catch header members accidentally declared outside the group. */
static_assert(offsetof(struct posix_acl, a_entries) == sizeof(struct posix_acl_hdr),
	      "struct member likely outside of struct_group_tagged()");

/*
 * Iterate "pa" over every entry of "acl"; "pe" is set to one past the
 * last entry and serves as the loop bound.
 */
#define FOREACH_ACL_ENTRY(pa, acl, pe) \
	for(pa=(acl)->a_entries, pe=pa+(acl)->a_count; pa<pe; pa++)


/*
 * Duplicate an ACL handle.
 * Takes an extra reference when "acl" is non-NULL and returns the same
 * pointer; a NULL argument is passed through unchanged.
 */
static inline struct posix_acl *
posix_acl_dup(struct posix_acl *acl)
{
	if (acl)
		refcount_inc(&acl->a_refcount);
	return acl;
}

/*
 * Free an ACL handle.
 * Drops one reference; when it was the last one the ACL is handed to
 * kfree_rcu() via its a_rcu member.  NULL is accepted and ignored.
 */
static inline void
posix_acl_release(struct posix_acl *acl)
{
	if (acl && refcount_dec_and_test(&acl->a_refcount))
		kfree_rcu(acl, a_rcu);
}


/* posix_acl.c */

extern void posix_acl_init(struct posix_acl *, int);
extern struct posix_acl *posix_acl_alloc(unsigned int count, gfp_t flags);
extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t);

extern struct posix_acl *get_posix_acl(struct inode *, int);
int set_posix_acl(struct mnt_idmap *, struct dentry *, int,
		  struct posix_acl *);

struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags);

/*
 * With CONFIG_FS_POSIX_ACL the functions below are real declarations;
 * without it, the #else branch supplies inline stubs for a subset of
 * them.
 */
#ifdef CONFIG_FS_POSIX_ACL
int posix_acl_chmod(struct mnt_idmap *, struct dentry *, umode_t);
extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
		struct posix_acl **);
int posix_acl_update_mode(struct mnt_idmap *, struct inode *, umode_t *,
			  struct posix_acl **);

int simple_set_acl(struct mnt_idmap *, struct dentry *,
		   struct posix_acl *, int);
extern int simple_acl_create(struct inode *, struct inode *);

struct posix_acl *get_cached_acl(struct inode *inode, int type);
void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
void forget_cached_acl(struct inode *inode, int type);
void forget_all_cached_acls(struct inode *inode);
int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
int posix_acl_permission(struct mnt_idmap *, struct inode *,
			 const struct posix_acl *, int);

/* Set both ACL slots of the inode (i_acl and i_default_acl) to NULL. */
static inline void cache_no_acl(struct inode *inode)
{
	inode->i_acl = NULL;
	inode->i_default_acl = NULL;
}

int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
		const char *acl_name, struct posix_acl *kacl);
struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap,
			      struct dentry *dentry, const char *acl_name);
int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
		   const char *acl_name);
int posix_acl_listxattr(struct inode *inode, char **buffer,
			ssize_t *remaining_size);
#else
/* Stub: reports success without doing anything. */
static inline int posix_acl_chmod(struct mnt_idmap *idmap,
				  struct dentry *dentry, umode_t mode)
{
	return 0;
}

#define simple_set_acl		NULL

/* Stub: reports success without doing anything. */
static inline int simple_acl_create(struct inode *dir, struct inode *inode)
{
	return 0;
}
/* Stub: nothing to clear in this configuration. */
static inline void cache_no_acl(struct inode *inode)
{
}

/* Stub: hands back NULL for both output ACLs and reports success. */
static inline int posix_acl_create(struct inode *inode, umode_t *mode,
		struct posix_acl **default_acl, struct posix_acl **acl)
{
	*default_acl = *acl = NULL;
	return 0;
}

/* Stub: no cached ACLs to forget. */
static inline void forget_all_cached_acls(struct inode *inode)
{
}

/* Stub: always fails with -EOPNOTSUPP. */
static inline int vfs_set_acl(struct mnt_idmap *idmap,
			      struct dentry *dentry, const char *name,
			      struct posix_acl *acl)
{
	return -EOPNOTSUPP;
}

/* Stub: always returns ERR_PTR(-EOPNOTSUPP). */
static inline struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap,
					    struct dentry *dentry,
					    const char *acl_name)
{
	return ERR_PTR(-EOPNOTSUPP);
}

/* Stub: always fails with -EOPNOTSUPP. */
static inline int vfs_remove_acl(struct mnt_idmap *idmap,
				 struct dentry *dentry, const char *acl_name)
{
	return -EOPNOTSUPP;
}
/* Stub: returns 0 and leaves *buffer and *remaining_size untouched. */
static inline int posix_acl_listxattr(struct inode *inode, char **buffer,
				      ssize_t *remaining_size)
{
	return 0;
}
#endif /* CONFIG_FS_POSIX_ACL */

struct posix_acl *get_inode_acl(struct inode *inode, int type);

#endif  /* __LINUX_POSIX_ACL_H */
12 1208 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * descriptor table internals; you almost certainly want file.h instead.
 */

#ifndef __LINUX_FDTABLE_H
#define __LINUX_FDTABLE_H

#include <linux/posix_types.h>
#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/nospec.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/fs.h>

#include <linux/atomic.h>

/*
 * The default fd array needs to be at least BITS_PER_LONG,
 * as this is the granularity returned by copy_fdset().
 */
#define NR_OPEN_DEFAULT BITS_PER_LONG

/*
 * One descriptor table: the file pointer array, its capacity (max_fds)
 * and the companion bitmaps.  The rcu member is presumably used for
 * deferred freeing - the freeing code is not part of this header.
 */
struct fdtable {
	unsigned int max_fds;
	struct file __rcu **fd;      /* current fd array */
	unsigned long *close_on_exec;
	unsigned long *open_fds;
	unsigned long *full_fds_bits;
	struct rcu_head rcu;
};

/*
 * Open file table structure
 *
 * "fdt" is the RCU-protected pointer to the table currently in use;
 * "fdtab" is a table embedded in this structure, and the *_init bitmaps
 * and fd_array give it NR_OPEN_DEFAULT slots of built-in storage.
 */
struct files_struct {
  /*
   * read mostly part
   */
	atomic_t count;
	bool resize_in_progress;
	wait_queue_head_t resize_wait;

	struct fdtable __rcu *fdt;
	struct fdtable fdtab;
  /*
   * written part on a separate cache line in SMP
   */
	spinlock_t file_lock ____cacheline_aligned_in_smp;
	unsigned int next_fd;
	unsigned long close_on_exec_init[1];
	unsigned long open_fds_init[1];
	unsigned long full_fds_bits_init[1];
	struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

struct file_operations;
struct vfsmount;
struct dentry;

/*
 * RCU dereference that is also accepted by lockdep when the caller
 * holds files->file_lock.
 */
#define rcu_dereference_check_fdtable(files, fdtfd) \
	rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock))

/* Fetch the current fdtable of "files" under the rule above. */
#define files_fdtable(files) \
	rcu_dereference_check_fdtable((files), (files)->fdt)

/*
 * The caller must ensure that fd table isn't shared or hold rcu or file lock
 *
 * Returns the file stored at index "fd", or NULL when "fd" is not below
 * max_fds.  The bounds check is done with a mask instead of a branch.
 */
static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
{
	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
	unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds);
	struct file *needs_masking;

	/*
	 * 'mask' is zero for an out-of-bounds fd, all ones for ok.
	 * 'fd&mask' is 'fd' for ok, or 0 for out of bounds.
	 *
	 * Accessing fdt->fd[0] is ok, but needs masking of the result.
	 */
	needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]);
	return (struct file *)(mask & (unsigned long)needs_masking);
}

/*
 * Same lookup as files_lookup_fd_raw(), with a lockdep warning when
 * files->file_lock is not held.
 */
static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd)
{
	RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock),
			   "suspicious rcu_dereference_check() usage");
	return files_lookup_fd_raw(files, fd);
}

/*
 * Test the close_on_exec bit of "fd" in the current table.  No range
 * check is made on "fd" here.
 */
static inline bool close_on_exec(unsigned int fd, const struct files_struct *files)
{
	return test_bit(fd, files_fdtable(files)->close_on_exec);
}

struct task_struct;

void put_files_struct(struct files_struct *fs);
int unshare_files(void);
/* Pair of descriptor numbers, passed to dup_fd(). */
struct fd_range {
	unsigned int from, to;
};
struct files_struct *dup_fd(struct files_struct *, struct fd_range *) __latent_entropy;
void do_close_on_exec(struct files_struct *);
int iterate_fd(struct files_struct *, unsigned,
		int (*)(const void *, struct file *, unsigned),
		const void *);

extern int close_fd(unsigned int fd);
extern struct file *file_close_fd(unsigned int fd);

extern struct kmem_cache *files_cachep;

#endif /* __LINUX_FDTABLE_H */
27 17 23 6 4 8 27 14 23 25 28 18 1 6 2 25 25 28 30 31 24 6 28 14 8 2 2 2 6 3 3 15 15 6 6 6 4 8 5 1 7 51 49 48 41 15 41 41 11 11 1 7 3 8 2 8 2 5 5 7 3 7 3 9 9 2 171 196 1 1 2 1 195 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 
483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux NET3: IP/IP protocol decoder modified to support
 *		    virtual tunnel interface
 *
 *	Authors:
 *		Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
 */

/*
   This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c

   For comments look at net/ipv4/ip_gre.c --ANK
 */


#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
#include <linux/icmpv6.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/ip_tunnels.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

static struct rtnl_link_ops vti_link_ops __read_mostly;

static unsigned int vti_net_id __read_mostly;
static int vti_tunnel_init(struct net_device *dev);

/*
 * Common receive entry: find the VTI tunnel matching the outer
 * addresses of @skb and pass the packet to xfrm_input().
 *
 * Returns -EINVAL when no tunnel matches (the skb is not freed here),
 * 0 after dropping a packet that fails the inbound policy check, and
 * otherwise whatever xfrm_input() returns.  When @update_skb_dev is
 * true, skb->dev is switched to the tunnel device before decapsulation.
 */
static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
		     int encap_type, bool update_skb_dev)
{
	struct ip_tunnel *tunnel;
	const struct iphdr *iph = ip_hdr(skb);
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
	IP_TUNNEL_DECLARE_FLAGS(flags) = { };

	__set_bit(IP_TUNNEL_NO_KEY_BIT, flags);

	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
				  iph->saddr, iph->daddr, 0);
	if (tunnel) {
		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
			goto drop;

		/* Remember the tunnel so that vti_rcv_cb() can find it. */
		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;

		if (update_skb_dev)
			skb->dev = tunnel->dev;

		return xfrm_input(skb, nexthdr, spi, encap_type);
	}

	return -EINVAL;
drop:
	kfree_skb(skb);
	return 0;
}

/* ->input_handler wrapper: vti_input() without touching skb->dev. */
static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
			   int encap_type)
{
	return vti_input(skb, nexthdr, spi, encap_type, false);
}

/*
 * Record the address family and destination-address offset in the skb
 * control block, then run vti_input() with the protocol taken from the
 * IPv4 header and encap_type 0.
 */
static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev)
{
	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);

	return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev);
}

/* ->handler wrapper: vti_rcv() with SPI 0 and no skb->dev update. */
static int vti_rcv_proto(struct sk_buff *skb)
{
	return vti_rcv(skb, 0, false);
}

/*
 * Callback registered as ->cb_handler; @err carries the status handed
 * in by the caller.
 *
 * Returns 1 when the skb carries no VTI tunnel, 0 on success or after
 * accounting an input error, -EINVAL when no inner mode can be found,
 * and -EPERM when the policy check on the inner packet fails.
 */
static int vti_rcv_cb(struct sk_buff *skb, int err)
{
	unsigned short family;
	struct net_device *dev;
	struct xfrm_state *x;
	const struct xfrm_mode *inner_mode;
	struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
	u32 orig_mark = skb->mark;
	int ret;

	if (!tunnel)
		return 1;

	dev = tunnel->dev;

	if (err) {
		DEV_STATS_INC(dev, rx_errors);
		DEV_STATS_INC(dev, rx_dropped);

		return 0;
	}

	x = xfrm_input_state(skb);

	inner_mode = &x->inner_mode;

	if (x->sel.family == AF_UNSPEC) {
		inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
		if (inner_mode == NULL) {
			XFRM_INC_STATS(dev_net(skb->dev),
				       LINUX_MIB_XFRMINSTATEMODEERROR);
			return -EINVAL;
		}
	}

	family = inner_mode->family;

	/*
	 * The policy check runs with the tunnel input key as the mark;
	 * the original mark is restored right afterwards.
	 */
	skb->mark = be32_to_cpu(tunnel->parms.i_key);
	ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
	skb->mark = orig_mark;

	if (!ret)
		return -EPERM;

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
	skb->dev = dev;
	dev_sw_netstats_rx_add(dev, skb->len);

	return 0;
}

/*
 * Decide whether xfrm state @x is usable for a tunnel with endpoints
 * @dst / @src.  With a zero @dst only the source address is compared.
 */
static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
{
	xfrm_address_t *daddr = (xfrm_address_t *)&dst;
	xfrm_address_t *saddr = (xfrm_address_t *)&src;

	/* if there is no transform then this tunnel is not functional.
	 * Or if the xfrm is not mode tunnel.
	 */
	if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
	    x->props.family != AF_INET)
		return false;

	if (!dst)
		return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET);

	if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET))
		return false;

	return true;
}

/*
 * Transmit @skb through the tunnel using flow @fl.
 *
 * If the skb has no dst, one is looked up for its protocol first.  The
 * dst is then passed through xfrm_lookup_route() and checked against
 * the tunnel endpoints and the path MTU before dst_output() is called.
 * Every path returns NETDEV_TX_OK; on the error paths the skb is freed
 * and tx_errors is incremented.
 */
static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
			    struct flowi *fl)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_parm_kern *parms = &tunnel->parms;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *tdev;	/* Device to other host */
	int pkt_len = skb->len;
	int err;
	int mtu;

	if (!dst) {
		switch (skb->protocol) {
		case htons(ETH_P_IP): {
			struct rtable *rt;

			fl->u.ip4.flowi4_oif = dev->ifindex;
			fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
			rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
			if (IS_ERR(rt)) {
				DEV_STATS_INC(dev, tx_carrier_errors);
				goto tx_error_icmp;
			}
			dst = &rt->dst;
			skb_dst_set(skb, dst);
			break;
		}
#if IS_ENABLED(CONFIG_IPV6)
		case htons(ETH_P_IPV6):
			fl->u.ip6.flowi6_oif = dev->ifindex;
			fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
			dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
			if (dst->error) {
				dst_release(dst);
				dst = NULL;
				DEV_STATS_INC(dev, tx_carrier_errors);
				goto tx_error_icmp;
			}
			skb_dst_set(skb, dst);
			break;
#endif
		default:
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error_icmp;
		}
	}

	/*
	 * Extra reference taken before xfrm_lookup_route().
	 * NOTE(review): presumably that call consumes the reference on
	 * the dst it is given - confirm against its documentation.
	 */
	dst_hold(dst);
	dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0);
	if (IS_ERR(dst)) {
		DEV_STATS_INC(dev, tx_carrier_errors);
		goto tx_error_icmp;
	}

	/* DST_XFRM_QUEUE entries skip the state, loop and MTU checks. */
	if (dst->flags & DST_XFRM_QUEUE)
		goto xmit;

	if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
		DEV_STATS_INC(dev, tx_carrier_errors);
		dst_release(dst);
		goto tx_error_icmp;
	}

	tdev = dst_dev(dst);

	/* The route points back at this device: count it as a collision. */
	if (tdev == dev) {
		dst_release(dst);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	mtu = dst_mtu(dst);
	if (skb->len > mtu) {
		skb_dst_update_pmtu_no_confirm(skb, mtu);
		if (skb->protocol == htons(ETH_P_IP)) {
			/* IPv4 without DF set is transmitted anyway. */
			if (!(ip_hdr(skb)->frag_off & htons(IP_DF)))
				goto xmit;
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				      htonl(mtu));
		} else {
			if (mtu < IPV6_MIN_MTU)
				mtu = IPV6_MIN_MTU;

			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		}

		dst_release(dst);
		goto tx_error;
	}

xmit:
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
	skb_dst_set(skb, dst);
	skb->dev = skb_dst_dev(skb);

	err = dst_output(tunnel->net, skb->sk, skb);
	/* On success, report the original packet length to the stats helper. */
	if (net_xmit_eval(err) == 0)
		err = pkt_len;
	iptunnel_xmit_stats(dev, err);
	return NETDEV_TX_OK;

tx_error_icmp:
	dst_link_failure(skb);
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}

/* This function assumes it is being called from dev_queue_xmit()
 * and that skb is filled properly by that function.
 *
 * It builds the flow for the packet (IPv4 or IPv6 only), overrides the
 * flow mark with the tunnel output key and hands over to vti_xmit().
 * Other protocols, or packets whose header cannot be pulled, are
 * dropped and counted as tx_errors.
 */
static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct flowi fl;

	if (!pskb_inet_may_pull(skb))
		goto tx_err;

	memset(&fl, 0, sizeof(fl));

	switch (skb->protocol) {
	case htons(ETH_P_IP):
		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
		xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET);
		break;
	case htons(ETH_P_IPV6):
		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
		xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6);
		break;
	default:
		goto tx_err;
	}

	/* override mark with tunnel output key */
	fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key);

	return vti_xmit(skb, dev, &fl);

tx_err:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}

/*
 * Error handler registered as ->err_handler.  skb->data is read as the
 * IPv4 header of the packet the error refers to.
 *
 * Returns -1 when no tunnel matches that header, 0 in every other
 * case.  Only ICMP_DEST_UNREACH with code ICMP_FRAG_NEEDED and
 * ICMP_REDIRECT are acted upon, and only when a matching xfrm state
 * exists.
 */
static int vti4_err(struct sk_buff *skb, u32 info)
{
	__be32 spi;
	__u32 mark;
	struct xfrm_state *x;
	struct ip_tunnel *tunnel;
	struct ip_esp_hdr *esph;
	struct ip_auth_hdr *ah ;
	struct ip_comp_hdr *ipch;
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	int protocol = iph->protocol;
	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
	IP_TUNNEL_DECLARE_FLAGS(flags) = { };

	__set_bit(IP_TUNNEL_NO_KEY_BIT, flags);

	/* daddr/saddr are passed in the opposite order to vti_input(). */
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
				  iph->daddr, iph->saddr, 0);
	if (!tunnel)
		return -1;

	mark = be32_to_cpu(tunnel->parms.o_key);

	/* Extract the SPI from the header that follows the IPv4 header. */
	switch (protocol) {
	case IPPROTO_ESP:
		esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
		spi = esph->spi;
		break;
	case IPPROTO_AH:
		ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
		spi = ah->spi;
		break;
	case IPPROTO_COMP:
		ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
		/* The 16-bit CPI is widened into a 32-bit SPI value. */
		spi = htonl(ntohs(ipch->cpi));
		break;
	default:
		return 0;
	}

	switch (icmp_hdr(skb)->type) {
	case ICMP_DEST_UNREACH:
		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
			return 0;
		break;
	case ICMP_REDIRECT:
		break;
	default:
		return 0;
	}

	x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
			      spi, protocol, AF_INET);
	if (!x)
		return 0;

	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
		ipv4_update_pmtu(skb, net, info, 0, protocol);
	else
		ipv4_redirect(skb, net, 0, protocol);
	xfrm_state_put(x);

	return 0;
}

/*
 * ->ndo_tunnel_ctl handler: validate and normalize @p, then delegate
 * to ip_tunnel_ctl().
 *
 * Returns -EINVAL for add/change requests whose IP header template is
 * not version 4 / IPPROTO_IPIP / ihl 5, -EOVERFLOW when the flag sets
 * are not be16-compatible, or the error from ip_tunnel_ctl().
 */
static int
vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
{
	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
	int err = 0;

	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP ||
		    p->iph.ihl != 5)
			return -EINVAL;
	}

	if (!ip_tunnel_flags_is_be16_compat(p->i_flags) ||
	    !ip_tunnel_flags_is_be16_compat(p->o_flags))
		return -EOVERFLOW;

	/* A key is kept only for a direction that has GRE_KEY set. */
	if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY))
		p->i_key = 0;
	if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY))
		p->o_key = 0;

	/* i_flags is replaced by a set holding only the VTI bit. */
	__set_bit(IP_TUNNEL_VTI_BIT, flags);
	ip_tunnel_flags_copy(p->i_flags, flags);

	err = ip_tunnel_ctl(dev, p, cmd);
	if (err)
		return err;

	/* Except for delete, GRE_KEY is OR-ed into both flag sets. */
	if (cmd != SIOCDELTUNNEL) {
		ip_tunnel_flags_from_be16(flags, GRE_KEY);
		ip_tunnel_flags_or(p->i_flags, p->i_flags, flags);
		ip_tunnel_flags_or(p->o_flags, p->o_flags, flags);
	}

	return 0;
}

static const struct net_device_ops vti_netdev_ops = {
	.ndo_init	= vti_tunnel_init,
	.ndo_uninit	= ip_tunnel_uninit,
	.ndo_start_xmit	= vti_tunnel_xmit,
	.ndo_siocdevprivate = ip_tunnel_siocdevprivate,
	.ndo_change_mtu	= ip_tunnel_change_mtu,
	.ndo_get_stats64 = dev_get_tstats64,
	.ndo_get_iflink = ip_tunnel_get_iflink,
	.ndo_tunnel_ctl	= vti_tunnel_ctl,
};

/* rtnl_link_ops ->setup: install the VTI ops and device type. */
static void vti_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &vti_netdev_ops;
	dev->header_ops		= &ip_tunnel_header_ops;
	dev->type		= ARPHRD_TUNNEL;
	ip_tunnel_setup(dev, vti_net_id);
}

/*
 * ->ndo_init: the tunnel's local and remote IPv4 addresses become the
 * device address and broadcast address (4 bytes each), then generic
 * tunnel initialization runs.
 */
static int vti_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	__dev_addr_set(dev, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->flags		= IFF_NOARP;
	dev->addr_len		= 4;
	dev->lltx		= true;
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}

/* Fill the IP header template of the fallback device. */
static void __net_init vti_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	iph->version		= 4;
	iph->protocol		= IPPROTO_IPIP;
	iph->ihl		= 5;
}

/*
 * The ESP, AH and IPcomp handler tables below are identical; they are
 * separate objects, presumably so each can be registered for its own
 * protocol - the registration code lies beyond this view.
 */
static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
	.handler	=	vti_rcv_proto,
	.input_handler	=	vti_input_proto,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	100,
};

static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
	.handler	=	vti_rcv_proto,
	.input_handler	=	vti_input_proto,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	100,
};

static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
	.handler	=	vti_rcv_proto,
	.input_handler	=	vti_input_proto,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	100,
};

#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
/*
 * Receive handler for the xfrm_tunnel tables below: calls vti_input()
 * with IPPROTO_IPIP as next header and the outer source address in the
 * SPI argument.
 */
static int vti_rcv_tunnel(struct sk_buff *skb)
{
	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);

	return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false);
}

static struct xfrm_tunnel vti_ipip_handler __read_mostly = {
	.handler	=	vti_rcv_tunnel,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	0,
};

#if IS_ENABLED(CONFIG_IPV6)
static struct xfrm_tunnel vti_ipip6_handler __read_mostly = {
	.handler	=	vti_rcv_tunnel,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	0,
};
#endif
#endif

/*
 * Per-namespace init: set up the tunnel namespace state under the name
 * "ip_vti0" and, when a fallback device exists, fill its IP header
 * template.
 */
static int __net_init vti_init_net(struct net *net)
{
	int err;
	struct ip_tunnel_net *itn;

	err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
	if (err)
		return err;
	itn = net_generic(net, vti_net_id);
	if (itn->fb_tunnel_dev)
		vti_fb_tunnel_init(itn->fb_tunnel_dev);
	return 0;
}

/* Per-namespace exit: delegates to ip_tunnel_delete_net(). */
static void __net_exit vti_exit_rtnl(struct net *net,
				     struct list_head *dev_to_kill)
{
	ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill);
}

static struct pernet_operations vti_net_ops = {
	.init = vti_init_net,
	.exit_rtnl = vti_exit_rtnl,
	.id   = &vti_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

/* rtnl_link_ops ->validate: accepts every request. */
static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
			       struct netlink_ext_ack *extack)
{
	return 0;
}

/*
 * Translate IFLA_VTI_* attributes into @parms and @fwmark.
 *
 * @parms is zeroed first and its protocol set to IPPROTO_IPIP.  With a
 * NULL @data nothing else is touched.  *@fwmark is written only when
 * IFLA_VTI_FWMARK is present, so the caller's initial value is the
 * default.
 */
static void vti_netlink_parms(struct nlattr *data[],
			      struct ip_tunnel_parm_kern *parms,
			      __u32 *fwmark)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_IPIP;

	if (!data)
		return;

	__set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags);

	if (data[IFLA_VTI_LINK])
		parms->link = nla_get_u32(data[IFLA_VTI_LINK]);

	if (data[IFLA_VTI_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);

	if (data[IFLA_VTI_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);

	if (data[IFLA_VTI_LOCAL])
		parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]);

	if (data[IFLA_VTI_REMOTE])
		parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]);

	if (data[IFLA_VTI_FWMARK])
		*fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]);
}

/*
 * rtnl_link_ops ->newlink: parse the attributes (fwmark defaults to 0)
 * and create the tunnel in params->link_net, falling back to the
 * device's own namespace when that is NULL.
 */
static int vti_newlink(struct net_device *dev,
		       struct rtnl_newlink_params *params,
		       struct netlink_ext_ack *extack)
{
	struct nlattr **data = params->data;
	struct ip_tunnel_parm_kern parms;
	struct nlattr **tb = params->tb;
	__u32 fwmark = 0;

	vti_netlink_parms(data, &parms, &fwmark);
	return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb,
				 &parms, fwmark);
}

/*
 * rtnl_link_ops ->changelink: parse the attributes, keeping the
 * tunnel's current fwmark unless IFLA_VTI_FWMARK is supplied.
 */
static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
			  struct nlattr *data[],
			  struct netlink_ext_ack *extack)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm_kern p;
	__u32 fwmark = t->fwmark;

	vti_netlink_parms(data, &p, &fwmark);
	return ip_tunnel_changelink(dev, tb, &p, fwmark);
}

/*
 * rtnl_link_ops ->get_size: space for the six 4-byte attributes that
 * vti_fill_info() emits.
 */
static size_t vti_get_size(const struct net_device *dev)
{
	return
		/* IFLA_VTI_LINK */
		nla_total_size(4) +
		/* IFLA_VTI_IKEY */
		nla_total_size(4) +
		/* IFLA_VTI_OKEY */
		nla_total_size(4) +
		/* IFLA_VTI_LOCAL */
		nla_total_size(4) +
		/* IFLA_VTI_REMOTE */
		nla_total_size(4) +
		/* IFLA_VTI_FWMARK */
		nla_total_size(4) +
		0;
}

/*
 * rtnl_link_ops ->fill_info: dump the tunnel parameters as netlink
 * attributes.  Returns -EMSGSIZE as soon as one attribute does not
 * fit, 0 otherwise.
 */
static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm_kern *p = &t->parms;

	if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) ||
	    nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key) ||
	    nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr) ||
	    nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr) ||
	    nla_put_u32(skb, IFLA_VTI_FWMARK, t->fwmark))
		return -EMSGSIZE;

	return 0;
}

/*
 * Attribute policy: LINK, IKEY, OKEY and FWMARK are typed as u32;
 * LOCAL and REMOTE carry a length equal to the size of an IPv4 address
 * field.
 */
static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
	[IFLA_VTI_LINK]		= { .type = NLA_U32 },
	[IFLA_VTI_IKEY]		= { .type = NLA_U32 },
	[IFLA_VTI_OKEY]		= { .type = NLA_U32 },
	[IFLA_VTI_LOCAL]	= { .len = sizeof_field(struct iphdr, saddr) },
	[IFLA_VTI_REMOTE]	= { .len = sizeof_field(struct iphdr, daddr) },
	[IFLA_VTI_FWMARK]	= { .type = NLA_U32 },
};

static struct rtnl_link_ops vti_link_ops __read_mostly = {
	.kind		= "vti",
	.maxtype	= IFLA_VTI_MAX,
	.policy		= vti_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= vti_tunnel_setup,
	.validate	= vti_tunnel_validate,
	.newlink	= vti_newlink,
	.changelink	= vti_changelink,
	.dellink        = ip_tunnel_dellink,
	.get_size	= vti_get_size,
	.fill_info	= vti_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static int __init vti_init(void)
{
	const char *msg;
	int err;

	pr_info("IPv4 over 
IPsec tunneling driver\n"); msg = "tunnel device"; err = register_pernet_device(&vti_net_ops); if (err < 0) goto pernet_dev_failed; msg = "tunnel protocols"; err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) msg = "ipip tunnel"; err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; #if IS_ENABLED(CONFIG_IPV6) err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif #endif msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); if (err < 0) goto rtnl_link_failed; return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) #if IS_ENABLED(CONFIG_IPV6) xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); xfrm_tunnel_ipip6_failed: #endif xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: #endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: unregister_pernet_device(&vti_net_ops); pernet_dev_failed: pr_err("vti init: failed to register %s\n", msg); return err; } static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) #if IS_ENABLED(CONFIG_IPV6) xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); #endif xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); #endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); 
unregister_pernet_device(&vti_net_ops); } module_init(vti_init); module_exit(vti_fini); MODULE_DESCRIPTION("Virtual (secure) IP tunneling library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("vti"); MODULE_ALIAS_NETDEV("ip_vti0");
6 25 17 22 22 23 12 15 15 17 13 6 22 2 15 6 15 16 7 15 22 3 6 20 4 1 19 26 26 24 7 8 19 25 1 14 19 9 9 15 7 7 3 7 6 6 5 6 8 8 16 16 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. 
*
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
 */

#include <linux/list.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>

/* Priority handling
 * RFC DRAFT ndata section 3.4
 */

static void sctp_sched_prio_unsched_all(struct sctp_stream *stream);

/* Take one more reference on priority head @p and return it. */
static struct sctp_stream_priorities *sctp_sched_prio_head_get(struct sctp_stream_priorities *p)
{
	p->users++;
	return p;
}

/* Drop one reference on @p (NULL is accepted) and free the head once its
 * user count reaches zero.
 */
static void sctp_sched_prio_head_put(struct sctp_stream_priorities *p)
{
	if (p && --p->users == 0)
		kfree(p);
}

/* Allocate a priority head for @prio with empty lists, no next stream and
 * a single user. Returns NULL if the allocation fails. @stream is not
 * used by this function.
 */
static struct sctp_stream_priorities *sctp_sched_prio_new_head(
			struct sctp_stream *stream, int prio, gfp_t gfp)
{
	struct sctp_stream_priorities *p;

	p = kmalloc_obj(*p, gfp);
	if (!p)
		return NULL;

	INIT_LIST_HEAD(&p->prio_sched);
	INIT_LIST_HEAD(&p->active);
	p->next = NULL;
	p->prio = prio;
	p->users = 1;

	return p;
}

/* Return a referenced priority head for @prio: an existing one when any
 * is found (its user count is bumped), otherwise a freshly allocated one.
 * Returns NULL only when a new head had to be allocated and that failed.
 */
static struct sctp_stream_priorities *sctp_sched_prio_get_head(
			struct sctp_stream *stream, int prio, gfp_t gfp)
{
	struct sctp_stream_priorities *p;
	int i;

	/* Look into scheduled priorities first, as they are sorted and
	 * we can find it fast IF it's scheduled.
	 */
	list_for_each_entry(p, &stream->prio_list, prio_sched) {
		if (p->prio == prio)
			return sctp_sched_prio_head_get(p);
		if (p->prio > prio)
			break;
	}

	/* No luck. So we search on all streams now. */
	for (i = 0; i < stream->outcnt; i++) {
		if (!SCTP_SO(stream, i)->ext)
			continue;

		p = SCTP_SO(stream, i)->ext->prio_head;
		if (!p)
			/* Means all other streams won't be initialized
			 * as well.
			 */
			break;
		if (p->prio == prio)
			return sctp_sched_prio_head_get(p);
	}

	/* If not even there, allocate a new one. */
	return sctp_sched_prio_new_head(stream, prio, gfp);
}

/* Advance p->next to the entry following it on the active list, stepping
 * over the list head (&p->active) itself when the walk lands on it.
 */
static void sctp_sched_prio_next_stream(struct sctp_stream_priorities *p)
{
	struct list_head *pos;

	pos = p->next->prio_list.next;
	if (pos == &p->active)
		pos = pos->next;

	p->next = list_entry(pos, struct sctp_stream_out_ext, prio_list);
}

/* Remove @soute from its priority's active list, if it is on one.
 * Returns true when the stream was scheduled, false otherwise.
 */
static bool sctp_sched_prio_unsched(struct sctp_stream_out_ext *soute)
{
	bool scheduled = false;

	if (!list_empty(&soute->prio_list)) {
		struct sctp_stream_priorities *prio_head = soute->prio_head;

		/* Scheduled */
		scheduled = true;

		if (prio_head->next == soute)
			/* Try to move to the next stream */
			sctp_sched_prio_next_stream(prio_head);

		list_del_init(&soute->prio_list);

		/* Also unsched the priority if this was the last stream */
		if (list_empty(&prio_head->active)) {
			list_del_init(&prio_head->prio_sched);
			/* If there is no stream left, clear next */
			prio_head->next = NULL;
		}
	}

	return scheduled;
}

/* Put @soute on the active list of its priority head and, when that head
 * had no stream scheduled yet, link the head into stream->prio_list.
 */
static void sctp_sched_prio_sched(struct sctp_stream *stream,
				  struct sctp_stream_out_ext *soute)
{
	struct sctp_stream_priorities *prio, *prio_head;

	prio_head = soute->prio_head;

	/* Nothing to do if already scheduled */
	if (!list_empty(&soute->prio_list))
		return;

	/* Schedule the stream. If there is a next, we schedule the new
	 * one before it, so it's the last in round robin order.
	 * If there isn't, we also have to schedule the priority.
	 */
	if (prio_head->next) {
		list_add(&soute->prio_list, prio_head->next->prio_list.prev);
		return;
	}

	list_add(&soute->prio_list, &prio_head->active);
	prio_head->next = soute;

	/* Insert in front of the first scheduled priority with a larger
	 * prio value; append at the tail when there is none.
	 */
	list_for_each_entry(prio, &stream->prio_list, prio_sched) {
		if (prio->prio > prio_head->prio) {
			list_add(&prio_head->prio_sched, prio->prio_sched.prev);
			return;
		}
	}

	list_add_tail(&prio_head->prio_sched, &stream->prio_list);
}

/* Move stream @sid to priority @prio.
 *
 * Returns 0 on success, including when the stream already has that
 * priority, and -ENOMEM when no priority head could be obtained. A stream
 * that was scheduled is unscheduled first and rescheduled under the new
 * head; the reference on the old head is dropped last.
 */
static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid,
			       __u16 prio, gfp_t gfp)
{
	struct sctp_stream_out *sout = SCTP_SO(stream, sid);
	struct sctp_stream_out_ext *soute = sout->ext;
	struct sctp_stream_priorities *prio_head, *old;
	bool reschedule = false;

	old = soute->prio_head;
	if (old && old->prio == prio)
		return 0;

	prio_head = sctp_sched_prio_get_head(stream, prio, gfp);
	if (!prio_head)
		return -ENOMEM;

	reschedule = sctp_sched_prio_unsched(soute);
	soute->prio_head = prio_head;
	if (reschedule)
		sctp_sched_prio_sched(stream, soute);

	sctp_sched_prio_head_put(old);

	return 0;
}

/* Store the priority of stream @sid in *@value. Always returns 0. */
static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid,
			       __u16 *value)
{
	*value = SCTP_SO(stream, sid)->ext->prio_head->prio;
	return 0;
}

/* Start with an empty list of scheduled priorities. Always returns 0. */
static int sctp_sched_prio_init(struct sctp_stream *stream)
{
	INIT_LIST_HEAD(&stream->prio_list);

	return 0;
}

/* Initialise the scheduling list node of stream @sid and assign it
 * priority 0. Returns the result of sctp_sched_prio_set().
 */
static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid,
				    gfp_t gfp)
{
	INIT_LIST_HEAD(&SCTP_SO(stream, sid)->ext->prio_list);
	return sctp_sched_prio_set(stream, sid, 0, gfp);
}

/* Drop stream @sid's reference on its priority head and clear the pointer. */
static void sctp_sched_prio_free_sid(struct sctp_stream *stream, __u16 sid)
{
	sctp_sched_prio_head_put(SCTP_SO(stream, sid)->ext->prio_head);
	SCTP_SO(stream, sid)->ext->prio_head = NULL;
}

/* Schedule the stream that the first chunk of @msg belongs to. */
static void sctp_sched_prio_enqueue(struct sctp_outq *q,
				    struct sctp_datamsg *msg)
{
	struct sctp_stream *stream;
	struct sctp_chunk *ch;
	__u16 sid;

	ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list);
	sid = sctp_chunk_stream_no(ch);
	stream = &q->asoc->stream;
	sctp_sched_prio_sched(stream, SCTP_SO(stream, sid)->ext);
}

/* Pick the next chunk to send and hand it to sctp_sched_dequeue_common().
 * Returns NULL when q->out_chunk_list is empty.
 */
static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q)
{
	struct sctp_stream *stream = &q->asoc->stream;
	struct sctp_stream_priorities *prio;
	struct sctp_stream_out_ext *soute;
	struct sctp_chunk *ch = NULL;

	/* Bail out quickly if queue is empty */
	if (list_empty(&q->out_chunk_list))
		goto out;

	/* Find which chunk is next. It's easy, it's either the current
	 * one or the first chunk on the next active stream.
	 */
	if (stream->out_curr) {
		soute = stream->out_curr->ext;
	} else {
		prio = list_entry(stream->prio_list.next,
				  struct sctp_stream_priorities, prio_sched);
		soute = prio->next;
	}
	ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list);
	sctp_sched_dequeue_common(q, ch);

out:
	return ch;
}

/* Advance the round robin of @ch's priority and unschedule the stream
 * when its queue is now empty.
 */
static void sctp_sched_prio_dequeue_done(struct sctp_outq *q,
					 struct sctp_chunk *ch)
{
	struct sctp_stream_priorities *prio;
	struct sctp_stream_out_ext *soute;
	__u16 sid;

	/* Last chunk on that msg, move to the next stream on
	 * this priority.
	 */
	sid = sctp_chunk_stream_no(ch);
	soute = SCTP_SO(&q->asoc->stream, sid)->ext;
	prio = soute->prio_head;

	sctp_sched_prio_next_stream(prio);

	if (list_empty(&soute->outq))
		sctp_sched_prio_unsched(soute);
}

/* Schedule every stream that owns a chunk on the association's
 * out_chunk_list; streams without ext data are skipped.
 */
static void sctp_sched_prio_sched_all(struct sctp_stream *stream)
{
	struct sctp_association *asoc;
	struct sctp_stream_out *sout;
	struct sctp_chunk *ch;

	asoc = container_of(stream, struct sctp_association, stream);
	list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) {
		__u16 sid;

		sid = sctp_chunk_stream_no(ch);
		sout = SCTP_SO(stream, sid);
		if (sout->ext)
			sctp_sched_prio_sched(stream, sout->ext);
	}
}

/* Unschedule every stream of every scheduled priority. The _safe
 * iterators are needed because sctp_sched_prio_unsched() unlinks entries
 * from both lists while they are being walked.
 */
static void sctp_sched_prio_unsched_all(struct sctp_stream *stream)
{
	struct sctp_stream_priorities *p, *tmp;
	struct sctp_stream_out_ext *soute, *souttmp;

	list_for_each_entry_safe(p, tmp, &stream->prio_list, prio_sched)
		list_for_each_entry_safe(soute, souttmp, &p->active, prio_list)
			sctp_sched_prio_unsched(soute);
}

/* Callback table for the priority scheduler. */
static const struct sctp_sched_ops sctp_sched_prio = {
	.set = sctp_sched_prio_set,
	.get = sctp_sched_prio_get,
	.init = sctp_sched_prio_init,
	.init_sid = sctp_sched_prio_init_sid,
	.free_sid = sctp_sched_prio_free_sid,
	.enqueue = sctp_sched_prio_enqueue,
	.dequeue = sctp_sched_prio_dequeue,
	.dequeue_done = sctp_sched_prio_dequeue_done,
	.sched_all = sctp_sched_prio_sched_all,
	.unsched_all = sctp_sched_prio_unsched_all,
};

/* Register the priority scheduler under SCTP_SS_PRIO. */
void sctp_sched_ops_prio_init(void)
{
	sctp_sched_ops_register(SCTP_SS_PRIO, &sctp_sched_prio);
}
21894 21883 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 // SPDX-License-Identifier: GPL-2.0-only /* * x86 APERF/MPERF KHz calculation for * /sys/.../cpufreq/scaling_cur_freq * * Copyright (C) 2017 Intel Corp. * Author: Len Brown <len.brown@intel.com> */ #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/ktime.h> #include <linux/math64.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/sched/isolation.h> #include <linux/sched/topology.h> #include <linux/smp.h> #include <linux/syscore_ops.h> #include <asm/cpu.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include <asm/msr.h> #include "cpu.h" struct aperfmperf { seqcount_t seq; unsigned long last_update; u64 acnt; u64 mcnt; u64 aperf; u64 mperf; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { .seq = SEQCNT_ZERO(cpu_samples.seq) }; static void init_counter_refs(void *data) { u64 aperf, mperf; rdmsrq(MSR_IA32_APERF, aperf); rdmsrq(MSR_IA32_MPERF, mperf); this_cpu_write(cpu_samples.aperf, aperf); this_cpu_write(cpu_samples.mperf, mperf); } #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* * APERF/MPERF frequency ratio computation. * * The scheduler wants to do frequency invariant accounting and needs a <1 * ratio to account for the 'current' frequency, corresponding to * freq_curr / freq_max. * * Since the frequency freq_curr on x86 is controlled by micro-controller and * our P-state setting is little more than a request/hint, we need to observe * the effective frequency 'BusyMHz', i.e. the average frequency over a time * interval after discarding idle time. This is given by: * * BusyMHz = delta_APERF / delta_MPERF * freq_base * * where freq_base is the max non-turbo P-state. 
* * The freq_max term has to be set to a somewhat arbitrary value, because we * can't know which turbo states will be available at a given point in time: * it all depends on the thermal headroom of the entire package. We set it to * the turbo level with 4 cores active. * * Benchmarks show that's a good compromise between the 1C turbo ratio * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, * which would ignore the entire turbo range (a conspicuous part, making * freq_curr/freq_max always maxed out). * * An exception to the heuristic above is the Atom uarch, where we choose the * highest turbo level for freq_max since Atom's are generally oriented towards * power efficiency. * * Setting freq_max to anything less than the 1C turbo ratio makes the ratio * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. */ DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; void arch_set_max_freq_ratio(bool turbo_disabled) { arch_max_freq_ratio = turbo_disabled ? 
SCHED_CAPACITY_SCALE : arch_turbo_freq_ratio; } EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); static bool __init turbo_disabled(void) { u64 misc_en; int err; err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en); if (err) return false; return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); } static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { int err; err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq); if (err) return false; err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); if (err) return false; *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ return true; } #define X86_MATCH(vfm) \ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL) static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { X86_MATCH(INTEL_XEON_PHI_KNL), X86_MATCH(INTEL_XEON_PHI_KNM), {} }; static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { X86_MATCH(INTEL_SKYLAKE_X), {} }; static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { X86_MATCH(INTEL_ATOM_GOLDMONT), X86_MATCH(INTEL_ATOM_GOLDMONT_D), X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS), {} }; static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int num_delta_fratio) { int fratio, delta_fratio, found; int err, i; u64 msr; err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; fratio = (msr >> 8) & 0xFF; i = 16; found = 0; do { if (found >= num_delta_fratio) { *turbo_freq = fratio; return true; } delta_fratio = (msr >> (i + 5)) & 0x7; if (delta_fratio) { found += 1; fratio -= delta_fratio; } i += 8; } while (i < 64); return true; } static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) { u64 ratios, counts; u32 group_size; int err, i; err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; 
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios); if (err) return false; err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts); if (err) return false; for (i = 0; i < 64; i += 8) { group_size = (counts >> i) & 0xFF; if (group_size >= size) { *turbo_freq = (ratios >> i) & 0xFF; return true; } } return false; } static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { u64 msr; int err; err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ /* The CPU may have less than 4 cores */ if (!*turbo_freq) *turbo_freq = msr & 0xFF; /* 1C turbo */ return true; } static bool __init intel_set_max_freq_ratio(void) { u64 base_freq, turbo_freq; u64 turbo_ratio; if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; if (x86_match_cpu(has_glm_turbo_ratio_limits) && skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) goto out; if (x86_match_cpu(has_knl_turbo_ratio_limits) && knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) goto out; if (x86_match_cpu(has_skx_turbo_ratio_limits) && skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) goto out; if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; return false; out: /* * Some hypervisors advertise X86_FEATURE_APERFMPERF * but then fill all MSR's with zeroes. * Some CPUs have turbo boost but don't declare any turbo ratio * in MSR_TURBO_RATIO_LIMIT. 
*/ if (!base_freq || !turbo_freq) { pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); return false; } turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); if (!turbo_ratio) { pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); return false; } arch_turbo_freq_ratio = turbo_ratio; arch_set_max_freq_ratio(turbo_disabled()); return true; } #ifdef CONFIG_PM_SLEEP static const struct syscore_ops freq_invariance_syscore_ops = { .resume = init_counter_refs, }; static struct syscore freq_invariance_syscore = { .ops = &freq_invariance_syscore_ops, }; static void register_freq_invariance_syscore(void) { register_syscore(&freq_invariance_syscore); } #else static inline void register_freq_invariance_syscore(void) {} #endif static void freq_invariance_enable(void) { if (static_branch_unlikely(&arch_scale_freq_key)) { WARN_ON_ONCE(1); return; } static_branch_enable_cpuslocked(&arch_scale_freq_key); register_freq_invariance_syscore(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { arch_turbo_freq_ratio = ratio; arch_set_max_freq_ratio(turbo_disabled); freq_invariance_enable(); } static void __init bp_init_freq_invariance(void) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; if (intel_set_max_freq_ratio()) { guard(cpus_read_lock)(); freq_invariance_enable(); } } static void disable_freq_invariance_workfn(struct work_struct *work) { int cpu; static_branch_disable(&arch_scale_freq_key); /* * Set arch_freq_scale to a default value on all cpus * This negates the effect of scaling */ for_each_possible_cpu(cpu) per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE; } static DECLARE_WORK(disable_freq_invariance_work, disable_freq_invariance_workfn); DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; 
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);

/* Per-CPU capacity and frequency ratio used by hybrid capacity scaling. */
struct arch_hybrid_cpu_scale {
	unsigned long capacity;
	unsigned long freq_ratio;
};

static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;

/**
 * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
 *
 * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
 * initialize it and set the static key controlling its code paths.
 *
 * Must be called before arch_set_cpu_capacity().
 *
 * Return: true when scaling is enabled (a warning is issued if it already
 * was), false when the per-CPU allocation fails.
 */
bool arch_enable_hybrid_capacity_scale(void)
{
	int cpu;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
		WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
		return true;
	}

	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
	if (!arch_cpu_scale)
		return false;

	/* Start every CPU at full capacity and the current max ratio. */
	for_each_possible_cpu(cpu) {
		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
	}

	static_branch_enable(&arch_hybrid_cap_scale_key);

	pr_info("Hybrid CPU capacity scaling enabled\n");

	return true;
}

/**
 * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
 * @cpu: Target CPU.
 * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
 * @max_cap: System-wide maximum CPU capacity.
 * @cap_freq: Frequency of @cpu corresponding to @cap.
 * @base_freq: Frequency of @cpu at which MPERF counts.
 *
 * The units in which @cap and @max_cap are expressed do not matter, so long
 * as they are consistent, because the former is effectively divided by the
 * latter.  Analogously for @cap_freq and @base_freq.
 *
 * After calling this function for all CPUs, call arch_rebuild_sched_domains()
 * to let the scheduler know that capacity-aware scheduling can be used going
 * forward.
 */
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
			   unsigned long cap_freq, unsigned long base_freq)
{
	if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
			   div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
			   div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
	} else {
		WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
	}
}

/*
 * Return the stored capacity of @cpu when hybrid capacity scaling is
 * enabled, SCHED_CAPACITY_SCALE otherwise.
 */
unsigned long arch_scale_cpu_capacity(int cpu)
{
	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);

	return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);

/*
 * Update this CPU's arch_freq_scale from the APERF/MPERF deltas:
 *
 *   (acnt << 2*SCHED_CAPACITY_SHIFT) / (mcnt * freq_ratio)
 *
 * clamped to SCHED_CAPACITY_SCALE. An overflow in either operand, a zero
 * divisor or a zero result is treated as an error: a warning is printed
 * and the work item that disables frequency invariance is scheduled.
 */
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale, freq_ratio;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
	else
		freq_ratio = arch_max_freq_ratio;

	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */

/*
 * Sample APERF/MPERF on the current CPU, record the deltas since the
 * previous sample together with the current jiffies, and feed the deltas
 * to scale_freq_tick(). Does nothing without X86_FEATURE_APERFMPERF.
 */
void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrq(MSR_IA32_APERF, aperf);
	rdmsrq(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	/* Publish the sample inside a seqcount write section. */
	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)

/*
 * Estimate the frequency of @cpu as cpu_khz * acnt / mcnt from its last
 * recorded sample. Falls back to cpufreq_quick_get(), or to cpu_khz when
 * that returns 0, if the feature is missing or the sample is unusable.
 */
int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	/* Re-read until the seqcount reports no concurrent write. */
	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}

/*
 * Early initcall: when APERF/MPERF is available, take the reference
 * counter readings and run the frequency invariance setup. Always
 * returns 0.
 */
static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs(NULL);
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);

/* Take the reference counter readings when APERF/MPERF is available. */
void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs(NULL);
}
13 4 15 8 4 4 3 3 11 10 2 9 9 5 5 5 11 11 4 4 6 6 5 2 10 10 2 6 39 39 34 35 34 34 26 26 182 129 14 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2017 Pablo Neira Ayuso <pablo@netfilter.org> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/list.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> struct nft_bitmap_elem { struct nft_elem_priv priv; struct list_head head; struct nft_set_ext ext; }; /* This bitmap uses two bits to represent one element. These two bits determine * the element state in the current and the future generation. * * An element can be in three states. 
The generation cursor is represented using
 * the ^ character, note that this cursor shifts on every successful transaction.
 * If no transaction is going on, we observe all elements are in the following
 * state:
 *
 * 11 = this element is active in the current generation. In case of no updates,
 * ^    it stays active in the next generation.
 * 00 = this element is inactive in the current generation. In case of no
 * ^    updates, it stays inactive in the next generation.
 *
 * On transaction handling, we observe these two temporary states:
 *
 * 01 = this element is inactive in the current generation and it becomes active
 * ^    in the next one. This happens when the element is inserted but the commit
 *      path has not been executed yet, so activation is still pending. On
 *      transaction abortion, the element is removed.
 * 10 = this element is active in the current generation and it becomes inactive
 * ^    in the next one. This happens when the element is deactivated but the
 *      commit path has not been executed yet, so removal is still pending. On
 *      transaction abortion, the next generation bit is reset to restore its
 *      previous state.
 */
struct nft_bitmap {
	struct	list_head	list;		/* list of struct nft_bitmap_elem */
	u16			bitmap_size;
	u8			bitmap[];	/* two state bits per element */
};

/* Translate an element key into its position in the bitmap.
 *
 * The key is read as a u16 when set->klen is 2 and as a u8 otherwise, then
 * doubled because every element occupies two bits. *idx receives the byte
 * index into the bitmap and *off the bit offset within that byte.
 */
static inline void nft_bitmap_location(const struct nft_set *set,
				       const void *key,
				       u32 *idx, u32 *off)
{
	u32 k;

	if (set->klen == 2)
		k = *(u16 *)key;
	else
		k = *(u8 *)key;
	k <<= 1;

	*idx = k / BITS_PER_BYTE;
	*off = k % BITS_PER_BYTE;
}

/* Fetch the two bits that represent the element and check if it is active based
 * on the generation mask.
*/
static inline bool
nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
{
	/* 0x3 << off selects the element's two bits, genmask << off keeps only
	 * the generation bit(s) the caller asked about.
	 */
	return (bitmap[idx] & (0x3 << off)) & (genmask << off);
}

/* Look up @key using the current generation mask. On a hit this returns a
 * pointer to a function-local static (zero-initialised) nft_set_ext rather
 * than a per-element extension; on a miss it returns NULL.
 */
INDIRECT_CALLABLE_SCOPE
const struct nft_set_ext *
nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
		  const u32 *key)
{
	const struct nft_bitmap *priv = nft_set_priv(set);
	static const struct nft_set_ext found;
	u8 genmask = nft_genmask_cur(net);
	u32 idx, off;

	nft_bitmap_location(set, key, &idx, &off);

	if (nft_bitmap_active(priv->bitmap, idx, off, genmask))
		return &found;

	return NULL;
}

/* Search the element list for an entry whose key equals the key of @this
 * (set->klen bytes are compared) and which is active for @genmask.
 * Returns the matching entry, or NULL when there is none. The list walk is
 * annotated as valid while the per-net commit_mutex is held.
 */
static struct nft_bitmap_elem *
nft_bitmap_elem_find(const struct net *net,
		     const struct nft_set *set, struct nft_bitmap_elem *this,
		     u8 genmask)
{
	const struct nft_bitmap *priv = nft_set_priv(set);
	struct nft_bitmap_elem *be;

	list_for_each_entry_rcu(be, &priv->list, head,
				lockdep_is_held(&nft_pernet(net)->commit_mutex)) {
		if (memcmp(nft_set_ext_key(&be->ext),
			   nft_set_ext_key(&this->ext), set->klen) ||
		    !nft_set_elem_active(&be->ext, genmask))
			continue;

		return be;
	}
	return NULL;
}

/* Return the private part of the list element whose key matches
 * elem->key.val.data and which is active in the current generation, or
 * ERR_PTR(-ENOENT) if no such element exists. @flags is not used here.
 */
static struct nft_elem_priv *
nft_bitmap_get(const struct net *net, const struct nft_set *set,
	       const struct nft_set_elem *elem, unsigned int flags)
{
	const struct nft_bitmap *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_cur(net);
	struct nft_bitmap_elem *be;

	list_for_each_entry_rcu(be, &priv->list, head) {
		if (memcmp(nft_set_ext_key(&be->ext), elem->key.val.data, set->klen) ||
		    !nft_set_elem_active(&be->ext, genmask))
			continue;

		return &be->priv;
	}
	return ERR_PTR(-ENOENT);
}

/* Insert elem->priv into the set for the next generation.
 *
 * Returns 0 on success. If an element with the same key is already active in
 * the next generation, *elem_priv is pointed at it and -EEXIST is returned
 * without touching the bitmap or the list.
 */
static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
			     const struct nft_set_elem *elem,
			     struct nft_elem_priv **elem_priv)
{
	struct nft_bitmap_elem *new = nft_elem_priv_cast(elem->priv), *be;
	struct nft_bitmap *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	u32 idx, off;

	be = nft_bitmap_elem_find(net, set, new, genmask);
	if (be) {
		*elem_priv = &be->priv;
		return -EEXIST;
	}

	nft_bitmap_location(set, nft_set_ext_key(&new->ext), &idx, &off);
	/* Enter 01 state. */
	priv->bitmap[idx] |= (genmask << off);
	list_add_tail_rcu(&new->head, &priv->list);

	return 0;
}

/* Clear the element's next-generation bit and unlink it from the list. */
static void nft_bitmap_remove(const struct net *net,
			      const struct nft_set *set,
			      struct nft_elem_priv *elem_priv)
{
	struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
	struct nft_bitmap *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	u32 idx, off;

	nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
	/* Enter 00 state. */
	priv->bitmap[idx] &= ~(genmask << off);
	list_del_rcu(&be->head);
}

/* Set the element's next-generation bit and call nft_clear() on its
 * extension.
 */
static void nft_bitmap_activate(const struct net *net,
				const struct nft_set *set,
				struct nft_elem_priv *elem_priv)
{
	struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
	struct nft_bitmap *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	u32 idx, off;

	nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
	/* Enter 11 state. */
	priv->bitmap[idx] |= (genmask << off);
	nft_clear(net, &be->ext);
}

/* Clear the element's next-generation bit and flip its activity state via
 * nft_set_elem_change_active(); the element stays on the list.
 */
static void nft_bitmap_flush(const struct net *net,
			     const struct nft_set *set,
			     struct nft_elem_priv *elem_priv)
{
	struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
	struct nft_bitmap *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	u32 idx, off;

	nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
	/* Enter 10 state, similar to deactivation. */
	priv->bitmap[idx] &= ~(genmask << off);
	nft_set_elem_change_active(net, set, &be->ext);
}

/* Deactivate the element matching elem->priv's key for the next generation.
 *
 * Returns the private part of the element that was deactivated, or NULL if no
 * element with that key is active in the next generation (in which case
 * nothing is modified).
 */
static struct nft_elem_priv *
nft_bitmap_deactivate(const struct net *net, const struct nft_set *set,
		      const struct nft_set_elem *elem)
{
	struct nft_bitmap_elem *this = nft_elem_priv_cast(elem->priv), *be;
	struct nft_bitmap *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	u32 idx, off;

	nft_bitmap_location(set, elem->key.val.data, &idx, &off);

	be = nft_bitmap_elem_find(net, set, this, genmask);
	if (!be)
		return NULL;

	/* Enter 10 state. */
	priv->bitmap[idx] &= ~(genmask << off);
	nft_set_elem_change_active(net, set, &be->ext);

	return &be->priv;
}

/* Iterate over the element list and hand each element to iter->fn.
 *
 * Elements are counted in iter->count; the callback is skipped while
 * iter->count is below iter->skip. The walk stops as soon as the callback
 * stores a negative value in iter->err.
 */
static void nft_bitmap_walk(const struct nft_ctx *ctx,
			    struct nft_set *set,
			    struct nft_set_iter *iter)
{
	const struct nft_bitmap *priv = nft_set_priv(set);
	struct nft_bitmap_elem *be;

	list_for_each_entry_rcu(be, &priv->list, head,
				lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) {
		if (iter->count < iter->skip)
			goto cont;

		iter->err = iter->fn(ctx, set, iter, &be->priv);

		if (iter->err < 0)
			return;
cont:
		iter->count++;
	}
}

/* The bitmap size is pow(2, key length in bits) / bits per byte. This is
 * multiplied by two since each element takes two bits. For 8 bit keys, the
 * bitmap consumes 66 bytes. For 16 bit keys, 16388 bytes.
 *
 * NOTE(review): this function alone yields 64 and 16384 bytes for klen 1 and
 * 2; the totals quoted above also include the struct nft_bitmap header and
 * look stale now that the header holds a list_head as well - confirm.
 */
static inline u32 nft_bitmap_size(u32 klen)
{
	return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1;
}

/* Size of the private area: struct nft_bitmap header plus the bitmap. */
static inline u64 nft_bitmap_total_size(u32 klen)
{
	return sizeof(struct nft_bitmap) + nft_bitmap_size(klen);
}

/* Private area size for a set whose key length is taken from the
 * NFTA_SET_KEY_LEN attribute (a big-endian 32-bit value). @desc is not used.
 */
static u64 nft_bitmap_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));

	return nft_bitmap_total_size(klen);
}

/* Initialise the private area: empty element list and cached bitmap size.
 * Always returns 0. @desc and @nla are not used.
 */
static int nft_bitmap_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_bitmap *priv = nft_set_priv(set);

	/* priv must be the first member of struct nft_bitmap_elem. */
	BUILD_BUG_ON(offsetof(struct nft_bitmap_elem, priv) != 0);

	INIT_LIST_HEAD(&priv->list);
	priv->bitmap_size = nft_bitmap_size(set->klen);

	return 0;
}

/* Destroy every element still on the list. The _safe iterator is used, so
 * the current entry may go away while iterating.
 */
static void nft_bitmap_destroy(const struct nft_ctx *ctx,
			       const struct nft_set *set)
{
	struct nft_bitmap *priv = nft_set_priv(set);
	struct nft_bitmap_elem *be, *n;

	list_for_each_entry_safe(be, n, &priv->list, head)
		nf_tables_set_elem_destroy(ctx, set, &be->priv);
}

/* Report whether this backend can serve the described set and, if so, fill
 * in @est. Sets with keys longer than two bytes or with desc->expr set are
 * rejected. @features is not used.
 */
static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
				struct nft_set_estimate *est)
{
	/* Make sure we don't get bitmaps larger than 16 Kbytes. */
	if (desc->klen > 2)
		return false;
	else if (desc->expr)
		return false;

	est->size = nft_bitmap_total_size(desc->klen);
	est->lookup = NFT_SET_CLASS_O_1;
	est->space = NFT_SET_CLASS_O_1;

	return true;
}

/* Operations table for the bitmap set backend. */
const struct nft_set_type nft_set_bitmap_type = {
	.ops		= {
		.privsize	= nft_bitmap_privsize,
		/* only the part of the element that precedes ext */
		.elemsize	= offsetof(struct nft_bitmap_elem, ext),
		.estimate	= nft_bitmap_estimate,
		.init		= nft_bitmap_init,
		.destroy	= nft_bitmap_destroy,
		.insert		= nft_bitmap_insert,
		.remove		= nft_bitmap_remove,
		.deactivate	= nft_bitmap_deactivate,
		.flush		= nft_bitmap_flush,
		.activate	= nft_bitmap_activate,
		.lookup		= nft_bitmap_lookup,
		.walk		= nft_bitmap_walk,
		.get		= nft_bitmap_get,
	},
};
58 58 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 // SPDX-License-Identifier: GPL-2.0-or-later /* client.c: NFS client sharing and management code * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/xprtsock.h> #include <linux/sunrpc/xprtrdma.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> #include <linux/slab.h> #include <linux/idr.h> #include <net/ipv6.h> #include <linux/nfs_xdr.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include <linux/nfslocalio.h> #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "iostat.h" #include "internal.h" #include "fscache.h" #include "pnfs.h" #include "nfs.h" #include "netns.h" #include "sysfs.h" #include "nfs42.h" #define NFSDBG_FACILITY NFSDBG_CLIENT static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); static DEFINE_RWLOCK(nfs_version_lock); 
static struct nfs_subversion *nfs_version_mods[5] = { [2] = NULL, [3] = NULL, [4] = NULL, }; /* * RPC cruft for NFS */ static const struct rpc_version *nfs_version[5] = { [2] = NULL, [3] = NULL, [4] = NULL, }; const struct rpc_program nfs_program = { .name = "nfs", .number = NFS_PROGRAM, .nrvers = ARRAY_SIZE(nfs_version), .version = nfs_version, .pipe_dir_name = NFS_PIPE_DIRNAME, }; static struct nfs_subversion *__find_nfs_version(unsigned int version) { struct nfs_subversion *nfs; read_lock(&nfs_version_lock); nfs = nfs_version_mods[version]; read_unlock(&nfs_version_lock); return nfs; } struct nfs_subversion *find_nfs_version(unsigned int version) { struct nfs_subversion *nfs = __find_nfs_version(version); if (!nfs && request_module("nfsv%d", version) == 0) nfs = __find_nfs_version(version); if (!nfs) return ERR_PTR(-EPROTONOSUPPORT); if (!get_nfs_version(nfs)) return ERR_PTR(-EAGAIN); return nfs; } int get_nfs_version(struct nfs_subversion *nfs) { return try_module_get(nfs->owner); } EXPORT_SYMBOL_GPL(get_nfs_version); void put_nfs_version(struct nfs_subversion *nfs) { module_put(nfs->owner); } void register_nfs_version(struct nfs_subversion *nfs) { write_lock(&nfs_version_lock); nfs_version_mods[nfs->rpc_ops->version] = nfs; nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(register_nfs_version); void unregister_nfs_version(struct nfs_subversion *nfs) { write_lock(&nfs_version_lock); nfs_version[nfs->rpc_ops->version] = NULL; nfs_version_mods[nfs->rpc_ops->version] = NULL; write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(unregister_nfs_version); /* * Allocate a shared client record * * Since these are allocated/deallocated very rarely, we don't * bother putting them in a slab cache... 
*/ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp; int err = -ENOMEM; if ((clp = kzalloc_obj(*clp)) == NULL) goto error_0; clp->cl_minorversion = cl_init->minorversion; clp->cl_nfs_mod = cl_init->nfs_mod; if (!get_nfs_version(clp->cl_nfs_mod)) goto error_dealloc; clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; refcount_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); clp->cl_addrlen = cl_init->addrlen; if (cl_init->hostname) { err = -ENOMEM; clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); if (!clp->cl_hostname) goto error_cleanup; } INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_flags = cl_init->init_flags; clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; clp->cl_net = get_net_track(cl_init->net, &clp->cl_ns_tracker, GFP_KERNEL); #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); ktime_get_real_ts64(&clp->cl_nfssvc_boot); nfs_uuid_init(&clp->cl_uuid); INIT_WORK(&clp->cl_local_probe_work, nfs_local_probe_async_work); #endif /* CONFIG_NFS_LOCALIO */ clp->cl_principal = "*"; clp->cl_xprtsec = cl_init->xprtsec; return clp; error_cleanup: put_nfs_version(clp->cl_nfs_mod); error_dealloc: kfree(clp); error_0: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(nfs_alloc_client); #if IS_ENABLED(CONFIG_NFS_V4) static void nfs_cleanup_cb_ident_idr(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); idr_destroy(&nn->cb_ident_idr); } /* nfs_client_lock held */ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); if (clp->cl_cb_ident) idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident); } static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); } #else static void 
nfs_cleanup_cb_ident_idr(struct net *net) { } static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { } static void pnfs_init_server(struct nfs_server *server) { } #endif /* CONFIG_NFS_V4 */ /* * Destroy a shared client record */ void nfs_free_client(struct nfs_client *clp) { nfs_localio_disable_client(clp); /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); put_net_track(clp->cl_net, &clp->cl_ns_tracker); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); kfree_rcu(clp, rcu); } EXPORT_SYMBOL_GPL(nfs_free_client); /* * Release a reference to a shared client record */ void nfs_put_client(struct nfs_client *clp) { struct nfs_net *nn; if (!clp) return; nn = net_generic(clp->cl_net, nfs_net_id); if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { list_del(&clp->cl_share_link); nfs_cb_idr_remove_locked(clp); spin_unlock(&nn->nfs_client_lock); WARN_ON_ONCE(!list_empty(&clp->cl_superblocks)); clp->rpc_ops->free_client(clp); } } EXPORT_SYMBOL_GPL(nfs_put_client); /* * Find an nfs_client on the list that matches the initialisation data * that is supplied. 
*/
/*
 * The function drops and re-takes nn->nfs_client_lock while waiting for a
 * client that is still initialising, so it expects to be entered with that
 * lock held and returns with it held.
 *
 * Returns a matching client with an extra cl_count reference, NULL if no
 * client on the list matches, or an ERR_PTR if waiting for an initialising
 * client failed.
 */
static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
{
	struct nfs_client *clp;
	const struct sockaddr *sap = (struct sockaddr *)data->addr;
	struct nfs_net *nn = net_generic(data->net, nfs_net_id);
	int error;

again:
	list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
		const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
		/* Don't match clients that failed to initialise properly */
		if (clp->cl_cons_state < 0)
			continue;

		/* If a client is still initializing then we need to wait */
		if (clp->cl_cons_state > NFS_CS_READY) {
			/* pin clp so it survives while the lock is dropped */
			refcount_inc(&clp->cl_count);
			spin_unlock(&nn->nfs_client_lock);
			error = nfs_wait_client_init_complete(clp);
			nfs_put_client(clp);
			spin_lock(&nn->nfs_client_lock);
			if (error < 0)
				return ERR_PTR(error);
			/* the list may have changed while unlocked: rescan */
			goto again;
		}

		/* Different NFS versions cannot share the same nfs_client */
		if (clp->rpc_ops != data->nfs_mod->rpc_ops)
			continue;

		if (clp->cl_proto != data->proto)
			continue;
		/* Match nfsv4 minorversion */
		if (clp->cl_minorversion != data->minorversion)
			continue;

		/* Match request for a dedicated DS */
		if (test_bit(NFS_CS_DS, &data->init_flags) !=
		    test_bit(NFS_CS_DS, &clp->cl_flags))
			continue;

		/*
		 * Note the nesting: the inner test only runs when the primary
		 * address/port did not match, and only then can it skip clp.
		 */
		/* Match the full socket address */
		if (!rpc_cmp_addr_port(sap, clap))
			/* Match all xprt_switch full socket addresses */
			if (IS_ERR(clp->cl_rpcclient) ||
			    !rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient,
							   sap))
				continue;

		/* Match the xprt security policy */
		if (clp->cl_xprtsec.policy != data->xprtsec.policy)
			continue;
		/* the serials are only compared for the TLS x.509 policy */
		if (clp->cl_xprtsec.policy == RPC_XPRTSEC_TLS_X509) {
			if (clp->cl_xprtsec.cert_serial !=
			    data->xprtsec.cert_serial)
				continue;
			if (clp->cl_xprtsec.privkey_serial !=
			    data->xprtsec.privkey_serial)
				continue;
		}

		/* match: hand the caller its own reference */
		refcount_inc(&clp->cl_count);
		return clp;
	}
	return NULL;
}

/*
 * Return true if @clp is done initializing, false if still working on it.
 *
 * Use nfs_client_init_status to check if it was successful.
*/
bool nfs_client_init_is_complete(const struct nfs_client *clp)
{
	/* states above NFS_CS_READY mean "still initialising" */
	return clp->cl_cons_state <= NFS_CS_READY;
}
EXPORT_SYMBOL_GPL(nfs_client_init_is_complete);

/*
 * Return 0 if @clp was successfully initialized, -errno otherwise.
 *
 * This must be called *after* nfs_client_init_is_complete() returns true,
 * otherwise it will pop WARN_ON_ONCE and return -EINVAL
 */
int nfs_client_init_status(const struct nfs_client *clp)
{
	/* called without checking nfs_client_init_is_complete */
	if (clp->cl_cons_state > NFS_CS_READY) {
		WARN_ON_ONCE(1);
		return -EINVAL;
	}
	return clp->cl_cons_state;
}
EXPORT_SYMBOL_GPL(nfs_client_init_status);

/*
 * Sleep (killable) on nfs_client_active_wq until
 * nfs_client_init_is_complete(@clp) holds. Returns the result of
 * wait_event_killable().
 */
int nfs_wait_client_init_complete(const struct nfs_client *clp)
{
	return wait_event_killable(nfs_client_active_wq,
			nfs_client_init_is_complete(clp));
}
EXPORT_SYMBOL_GPL(nfs_wait_client_init_complete);

/*
 * Found an existing client. Make sure it's ready before returning.
 *
 * On either failure path the caller's reference on @clp is dropped and an
 * ERR_PTR is returned instead. @cl_init is not used.
 */
static struct nfs_client *
nfs_found_client(const struct nfs_client_initdata *cl_init,
		 struct nfs_client *clp)
{
	int error;

	error = nfs_wait_client_init_complete(clp);
	if (error < 0) {
		nfs_put_client(clp);
		/* any wait failure is reported as -ERESTARTSYS */
		return ERR_PTR(-ERESTARTSYS);
	}

	/* initialisation finished but failed: the state holds the -errno */
	if (clp->cl_cons_state < NFS_CS_READY) {
		error = clp->cl_cons_state;
		nfs_put_client(clp);
		return ERR_PTR(error);
	}

	/* presumably pairs with the smp_wmb() in nfs_mark_client_ready() */
	smp_rmb();
	return clp;
}

/*
 * Look up a client by IP address and protocol version
 * - creates a new record if one doesn't yet exist
 *
 * Returns the client, or an ERR_PTR: -EINVAL if cl_init->hostname is NULL,
 * otherwise whatever the match, allocation or init step reported.
 */
struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
{
	struct nfs_client *clp, *new = NULL;
	struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
	const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;

	if (cl_init->hostname == NULL) {
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	/* see if the client already exists */
	do {
		spin_lock(&nn->nfs_client_lock);

		clp = nfs_match_client(cl_init);
		if (clp) {
			spin_unlock(&nn->nfs_client_lock);
			/* a record allocated on an earlier pass is not needed */
			if (new)
				new->rpc_ops->free_client(new);
			if (IS_ERR(clp))
				return clp;
			return nfs_found_client(cl_init, clp);
		}
		if (new) {
			/* still no match: publish the new record, then init it */
			list_add_tail(&new->cl_share_link,
					&nn->nfs_client_list);
			spin_unlock(&nn->nfs_client_lock);
			new = rpc_ops->init_client(new, cl_init);
			if (!IS_ERR(new))
				 nfs_local_probe_async(new);
			return new;
		}

		spin_unlock(&nn->nfs_client_lock);

		/* allocate outside the lock, then search again */
		new = rpc_ops->alloc_client(cl_init);
	} while (!IS_ERR(new));

	return new;
}
EXPORT_SYMBOL_GPL(nfs_get_client);

/*
 * Mark a server as ready or failed
 *
 * Stores @state in cl_cons_state and wakes every waiter on
 * nfs_client_active_wq.
 */
void nfs_mark_client_ready(struct nfs_client *clp, int state)
{
	/* order earlier stores to *clp before the state becomes visible */
	smp_wmb();
	clp->cl_cons_state = state;
	wake_up_all(&nfs_client_active_wq);
}
EXPORT_SYMBOL_GPL(nfs_mark_client_ready);

/*
 * Initialise the timeout values for a connection
 *
 * @timeo is scaled by HZ / 10, i.e. it is treated as tenths of a second.
 * NFS_UNSPEC_TIMEO / NFS_UNSPEC_RETRANS (or a zero initial value) select the
 * per-transport defaults. Any transport not listed below hits BUG().
 */
void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
				    int timeo, int retrans)
{
	to->to_initval = timeo * HZ / 10;
	to->to_retries = retrans;

	switch (proto) {
	case XPRT_TRANSPORT_TCP:
	case XPRT_TRANSPORT_TCP_TLS:
	case XPRT_TRANSPORT_RDMA:
		if (retrans == NFS_UNSPEC_RETRANS)
			to->to_retries = NFS_DEF_TCP_RETRANS;
		if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
			to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
			to->to_initval = NFS_MAX_TCP_TIMEOUT;
		/* linear backoff: grow by the initial value per retry */
		to->to_increment = to->to_initval;
		to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
		if (to->to_maxval > NFS_MAX_TCP_TIMEOUT)
			to->to_maxval = NFS_MAX_TCP_TIMEOUT;
		if (to->to_maxval < to->to_initval)
			to->to_maxval = to->to_initval;
		to->to_exponential = 0;
		break;
	case XPRT_TRANSPORT_UDP:
		if (retrans == NFS_UNSPEC_RETRANS)
			to->to_retries = NFS_DEF_UDP_RETRANS;
		if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
			to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
		if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
			to->to_initval = NFS_MAX_UDP_TIMEOUT;
		to->to_maxval = NFS_MAX_UDP_TIMEOUT;
		to->to_exponential = 1;
		break;
	default:
		BUG();
	}
}
EXPORT_SYMBOL_GPL(nfs_init_timeout_values);

/*
 * Create an RPC client handle
 *
 * Returns 0 if @clp already has a valid RPC client or one was created and
 * attached, otherwise the error from rpc_create().
 */
int nfs_create_rpc_client(struct nfs_client *clp,
			  const struct nfs_client_initdata *cl_init,
			  rpc_authflavor_t flavor)
{
	struct nfs_net		*nn = net_generic(clp->cl_net, nfs_net_id);
	struct rpc_clnt		*clnt = NULL;
	struct rpc_create_args args = {
		.net		= clp->cl_net,
		.protocol	= clp->cl_proto,
		.nconnect	= clp->cl_nconnect,
		.address	= (struct sockaddr *)&clp->cl_addr,
		.addrsize	= clp->cl_addrlen,
		.timeout	= cl_init->timeparms,
		.servername	= clp->cl_hostname,
		.nodename	= cl_init->nodename,
		.program	= &nfs_program,
		.stats		= &nn->rpcstats,
		.version	= clp->rpc_ops->version,
		.authflavor	= flavor,
		.cred		= cl_init->cred,
		.xprtsec	= cl_init->xprtsec,
		.connect_timeout = cl_init->connect_timeout,
		.reconnect_timeout = cl_init->reconnect_timeout,
	};

	/* translate the client's NFS_CS_* flags into RPC create flags */
	if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))
		args.flags |= RPC_CLNT_CREATE_DISCRTRY;
	if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags))
		args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT;
	if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))
		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
	if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags))
		args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS;
	if (test_bit(NFS_CS_NOPING, &clp->cl_flags))
		args.flags |= RPC_CLNT_CREATE_NOPING;
	if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags))
		args.flags |= RPC_CLNT_CREATE_REUSEPORT;
	if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags))
		args.flags |= RPC_CLNT_CREATE_NETUNREACH_FATAL;

	/* an RPC client is already attached: nothing to create */
	if (!IS_ERR(clp->cl_rpcclient))
		return 0;

	clnt = rpc_create(&args);
	if (IS_ERR(clnt)) {
		dprintk("%s: cannot create RPC client. Error = %ld\n",
			__func__, PTR_ERR(clnt));
		return PTR_ERR(clnt);
	}

	clnt->cl_principal = clp->cl_principal;
	clp->cl_rpcclient = clnt;
	clnt->cl_max_connect = clp->cl_max_connect;
	return 0;
}
EXPORT_SYMBOL_GPL(nfs_create_rpc_client);

/*
 * Version 2 or 3 client destruction
 */
static void nfs_destroy_server(struct nfs_server *server)
{
	if (server->nlm_host)
		nlmclnt_done(server->nlm_host);
}

/*
 * Version 2 or 3 lockd setup
 *
 * Returns 0 on success or when there is nothing to set up, otherwise the
 * error from nlmclnt_init().
 */
static int nfs_start_lockd(struct nfs_server *server)
{
	struct nlm_host *host;
	struct nfs_client *clp = server->nfs_client;
	struct nlmclnt_initdata nlm_init = {
		.hostname	= clp->cl_hostname,
		.address	= (struct sockaddr *)&clp->cl_addr,
		.addrlen	= clp->cl_addrlen,
		.nfs_version	= clp->rpc_ops->version,
		.noresvport	= server->flags & NFS_MOUNT_NORESVPORT ?
					1 : 0,
		.net		= clp->cl_net,
		.nlmclnt_ops	= clp->cl_nfs_mod->rpc_ops->nlmclnt_ops,
		.cred		= server->cred,
	};

	/* nothing to do for versions above 3 */
	if (nlm_init.nfs_version > 3)
		return 0;
	/* nothing to do when both flock and fcntl locks are handled locally */
	if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
			(server->flags & NFS_MOUNT_LOCAL_FCNTL))
		return 0;

	/*
	 * TCP unless the client transport is UDP. The UDP case is the last
	 * label in the switch, so it needs no break, and it is compiled out
	 * when CONFIG_NFS_DISABLE_UDP_SUPPORT is defined.
	 */
	switch (clp->cl_proto) {
		default:
			nlm_init.protocol = IPPROTO_TCP;
			break;
#ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT
		case XPRT_TRANSPORT_UDP:
			nlm_init.protocol = IPPROTO_UDP;
#endif
	}

	host = nlmclnt_init(&nlm_init);
	if (IS_ERR(host))
		return PTR_ERR(host);

	server->nlm_host = host;
	/* nfs_destroy_server() releases the nlm_host again */
	server->destroy = nfs_destroy_server;
	nfs_sysfs_link_rpc_client(server, nlmclnt_rpc_clnt(host), NULL);
	return 0;
}

/*
 * Create a general RPC client
 *
 * Clones the nfs_client's RPC client with @pseudoflavour into
 * server->client and installs a private copy of @timeo as its timeout.
 * Returns 0 on success or the error from the clone.
 */
int nfs_init_server_rpcclient(struct nfs_server *server,
		const struct rpc_timeout *timeo,
		rpc_authflavor_t pseudoflavour)
{
	struct nfs_client *clp = server->nfs_client;

	server->client = rpc_clone_client_set_auth(clp->cl_rpcclient,
							pseudoflavour);
	if (IS_ERR(server->client)) {
		dprintk("%s: couldn't create rpc_client!\n", __func__);
		return PTR_ERR(server->client);
	}

	memcpy(&server->client->cl_timeout_default,
			timeo,
			sizeof(server->client->cl_timeout_default));
	server->client->cl_timeout = &server->client->cl_timeout_default;
	server->client->cl_softrtry = 0;
	if (server->flags & NFS_MOUNT_SOFTERR)
		server->client->cl_softerr = 1;
	if (server->flags & NFS_MOUNT_SOFT)
		server->client->cl_softrtry = 1;

	nfs_sysfs_link_rpc_client(server, server->client, NULL);
	return 0;
}
EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);

/**
 * nfs_init_client - Initialise an NFS2 or NFS3 client
 *
 * @clp: nfs_client to initialise
 * @cl_init: Initialisation parameters
 *
 * Returns pointer to an NFS client, or an ERR_PTR value.
 */
struct nfs_client *nfs_init_client(struct nfs_client *clp,
				   const struct nfs_client_initdata *cl_init)
{
	int error;

	/* the client is already initialised */
	if (clp->cl_cons_state == NFS_CS_READY)
		return clp;

	/*
	 * Create a client RPC handle for doing FSSTAT with UNIX auth only
	 * - RFC 2623, sec 2.3.2
	 */
	error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
	/* publish the outcome (ready or the -errno) and wake any waiters */
	nfs_mark_client_ready(clp, error == 0 ? NFS_CS_READY : error);
	if (error < 0) {
		nfs_put_client(clp);
		clp = ERR_PTR(error);
	}
	return clp;
}
EXPORT_SYMBOL_GPL(nfs_init_client);

/*
 * Set the initial capabilities for versions other than 2 and 3.
 * The body is compiled out (leaving server->caps untouched) unless
 * CONFIG_NFS_V4 is enabled.
 */
static void nfs4_server_set_init_caps(struct nfs_server *server)
{
#if IS_ENABLED(CONFIG_NFS_V4)
	/* Set the basic capabilities */
	server->caps = server->nfs_client->cl_mvops->init_caps;
	if (server->flags & NFS_MOUNT_NORDIRPLUS)
		server->caps &= ~NFS_CAP_READDIRPLUS;
	if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA)
		server->caps &= ~NFS_CAP_READ_PLUS;

	/*
	 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
	 * authentication.
	 */
	if (nfs4_disable_idmapping &&
	    server->client->cl_auth->au_flavor == RPC_AUTH_UNIX)
		server->caps |= NFS_CAP_UIDGID_NOMAP;
#endif
}

/*
 * Set server->caps to the initial capability set for the client's NFS
 * version.
 */
void nfs_server_set_init_caps(struct nfs_server *server)
{
	switch (server->nfs_client->rpc_ops->version) {
	case 2:
		server->caps = NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS;
		break;
	case 3:
		server->caps = NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS;
		if (!(server->flags & NFS_MOUNT_NORDIRPLUS))
			server->caps |= NFS_CAP_READDIRPLUS;
		break;
	default:
		nfs4_server_set_init_caps(server);
		break;
	}
}
EXPORT_SYMBOL_GPL(nfs_server_set_init_caps);

/*
 * Create a version 2 or 3 client
 *
 * Returns 0 on success. On failure after the client was obtained, the
 * client reference is dropped and server->nfs_client is reset to NULL.
 */
static int nfs_init_server(struct nfs_server *server,
			   const struct fs_context *fc)
{
	const struct nfs_fs_context *ctx = nfs_fc2context(fc);
	struct rpc_timeout timeparms;
	struct nfs_client_initdata cl_init = {
		.hostname = ctx->nfs_server.hostname,
		.addr = &ctx->nfs_server._address,
		.addrlen = ctx->nfs_server.addrlen,
		.nfs_mod = ctx->nfs_mod,
		.proto = ctx->nfs_server.protocol,
		.net = fc->net_ns,
		.timeparms = &timeparms,
		.cred = server->cred,
		.nconnect = ctx->nfs_server.nconnect,
		.init_flags = (1UL << NFS_CS_REUSEPORT),
		.xprtsec = ctx->xprtsec,
	};
	struct nfs_client *clp;
	int error;

	/* cl_init.timeparms points at timeparms, so fill it in first */
	nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol,
				ctx->timeo, ctx->retrans);
	if (ctx->flags & NFS_MOUNT_NORESVPORT)
		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);

	if (ctx->flags & NFS_MOUNT_NETUNREACH_FATAL)
		__set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags);

	/* Allocate or find a client reference we can use */
	clp = nfs_get_client(&cl_init);
	if (IS_ERR(clp))
		return PTR_ERR(clp);

	server->nfs_client = clp;
	nfs_sysfs_add_server(server);
	nfs_sysfs_link_rpc_client(server, clp->cl_rpcclient, "_state");

	/* Initialise the client representation from the mount data */
	server->flags = ctx->flags;
	server->options = ctx->options;

	switch (clp->rpc_ops->version) {
	case 2:
		server->fattr_valid = NFS_ATTR_FATTR_V2;
		break;
	case 3:
		server->fattr_valid = NFS_ATTR_FATTR_V3;
		break;
	default:
		server->fattr_valid = NFS_ATTR_FATTR_V4;
	}

	/* sizes given explicitly are also flagged for automount inheritance */
	if (ctx->bsize) {
		server->bsize = ctx->bsize;
		server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_BSIZE;
	}
	if (ctx->rsize) {
		server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto);
		server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_RSIZE;
	}
	if (ctx->wsize) {
		server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto);
		server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_WSIZE;
	}

	/* the context values are multiplied by HZ before being stored */
	server->acregmin = ctx->acregmin * HZ;
	server->acregmax = ctx->acregmax * HZ;
	server->acdirmin = ctx->acdirmin * HZ;
	server->acdirmax = ctx->acdirmax * HZ;

	/* Start lockd here, before we might error out */
	error = nfs_start_lockd(server);
	if (error < 0)
		goto error;

	server->port = ctx->nfs_server.port;
	server->auth_info = ctx->auth_info;

	error = nfs_init_server_rpcclient(server, &timeparms,
					  ctx->selected_flavor);
	if (error < 0)
		goto error;

	nfs_server_set_init_caps(server);

	/* Preserve the values of mount_server-related mount options */
	if (ctx->mount_server.addrlen) {
		memcpy(&server->mountd_address, &ctx->mount_server.address,
			ctx->mount_server.addrlen);
		server->mountd_addrlen = ctx->mount_server.addrlen;
	}
	server->mountd_version = ctx->mount_server.version;
	server->mountd_port = ctx->mount_server.port;
	server->mountd_protocol = ctx->mount_server.protocol;

	server->namelen = ctx->namlen;
	return 0;

error:
	server->nfs_client = NULL;
	nfs_put_client(clp);
	return error;
}

/*
 * Load up the server record from information gained in an fsinfo record
 */
static void nfs_server_set_fsinfo(struct nfs_server *server,
				  struct nfs_fsinfo *fsinfo)
{
	struct nfs_client *clp = server->nfs_client;
	unsigned long max_rpc_payload, raw_max_rpc_payload;

	/* Work out a lot of parameters */
	if (server->rsize == 0)
		server->rsize = nfs_io_size(fsinfo->rtpref, clp->cl_proto);
	if (server->wsize == 0)
		server->wsize = nfs_io_size(fsinfo->wtpref, clp->cl_proto);

	if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax)
		server->rsize = nfs_io_size(fsinfo->rtmax, clp->cl_proto);
	if
(fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) server->wsize = nfs_io_size(fsinfo->wtmax, clp->cl_proto); raw_max_rpc_payload = rpc_max_payload(server->client); max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL); if (server->rsize > max_rpc_payload) server->rsize = max_rpc_payload; if (server->rsize > NFS_MAX_FILE_IO_SIZE) server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); if (server->dtsize > NFS_MAX_FILE_IO_SIZE) server->dtsize = NFS_MAX_FILE_IO_SIZE; if (server->dtsize > server->rsize) server->dtsize = server->rsize; if (server->flags & NFS_MOUNT_NOAC) { server->acregmin = server->acregmax = 0; server->acdirmin = server->acdirmax = 0; } server->maxfilesize = fsinfo->maxfilesize; server->change_attr_type = fsinfo->change_attr_type; server->clone_blksize = fsinfo->clone_blksize; /* We're airborne Set socket buffersize */ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); #ifdef CONFIG_NFS_V4_2 /* * Defaults until limited by the session parameters. 
*/ server->gxasize = min_t(unsigned int, raw_max_rpc_payload, XATTR_SIZE_MAX); server->sxasize = min_t(unsigned int, raw_max_rpc_payload, XATTR_SIZE_MAX); server->lxasize = min_t(unsigned int, raw_max_rpc_payload, nfs42_listxattr_xdrsize(XATTR_LIST_MAX)); if (fsinfo->xattr_support) server->caps |= NFS_CAP_XATTR; else server->caps &= ~NFS_CAP_XATTR; #endif } /* * Probe filesystem information, including the FSID on v2/v3 */ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) { struct nfs_fsinfo fsinfo; struct nfs_client *clp = server->nfs_client; int error; if (clp->rpc_ops->set_capabilities != NULL) { error = clp->rpc_ops->set_capabilities(server, mntfh); if (error < 0) return error; } fsinfo.fattr = fattr; fsinfo.nlayouttypes = 0; memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) return error; nfs_server_set_fsinfo(server, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { struct nfs_pathconf pathinfo; pathinfo.fattr = fattr; nfs_fattr_init(fattr); if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0) server->namelen = pathinfo.max_namelen; } if (clp->rpc_ops->discover_trunking != NULL && (server->caps & NFS_CAP_FS_LOCATIONS && (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) { error = clp->rpc_ops->discover_trunking(server, mntfh); if (error < 0) return error; } return 0; } /* * Grab the destination's particulars, including lease expiry time. * * Returns zero if probe succeeded and retrieved FSID matches the FSID * we have cached. */ int nfs_probe_server(struct nfs_server *server, struct nfs_fh *mntfh) { struct nfs_fattr *fattr; int error; fattr = nfs_alloc_fattr(); if (fattr == NULL) return -ENOMEM; /* Sanity: the probe won't work if the destination server * does not recognize the migrated FH. 
*/ error = nfs_probe_fsinfo(server, mntfh, fattr); nfs_free_fattr(fattr); return error; } EXPORT_SYMBOL_GPL(nfs_probe_server); /* * Copy useful information when duplicating a server record */ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) { target->flags = source->flags; target->automount_inherit = source->automount_inherit; if (source->automount_inherit & NFS_AUTOMOUNT_INHERIT_BSIZE) target->bsize = source->bsize; if (source->automount_inherit & NFS_AUTOMOUNT_INHERIT_RSIZE) target->rsize = source->rsize; if (source->automount_inherit & NFS_AUTOMOUNT_INHERIT_WSIZE) target->wsize = source->wsize; target->acregmin = source->acregmin; target->acregmax = source->acregmax; target->acdirmin = source->acdirmin; target->acdirmax = source->acdirmax; target->options = source->options; target->auth_info = source->auth_info; target->port = source->port; } EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); void nfs_server_insert_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); list_add_tail(&server->master_link, &nn->nfs_volume_list); clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); spin_unlock(&nn->nfs_client_lock); } EXPORT_SYMBOL_GPL(nfs_server_insert_lists); void nfs_server_remove_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn; if (clp == NULL) return; nn = net_generic(clp->cl_net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_del_rcu(&server->client_link); if (list_empty(&clp->cl_superblocks)) set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); list_del(&server->master_link); spin_unlock(&nn->nfs_client_lock); synchronize_rcu(); } EXPORT_SYMBOL_GPL(nfs_server_remove_lists); static DEFINE_IDA(s_sysfs_ids); /* * Allocate and initialise a server record */ struct nfs_server *nfs_alloc_server(void) { struct 
nfs_server *server; server = kzalloc_obj(struct nfs_server); if (!server) return NULL; server->s_sysfs_id = ida_alloc(&s_sysfs_ids, GFP_KERNEL); if (server->s_sysfs_id < 0) { kfree(server); return NULL; } server->client = server->client_acl = ERR_PTR(-EINVAL); /* Zero out the NFS state stuff */ INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); INIT_LIST_HEAD(&server->delegations); spin_lock_init(&server->delegations_lock); INIT_LIST_HEAD(&server->delegations_return); INIT_LIST_HEAD(&server->delegations_lru); INIT_LIST_HEAD(&server->delegations_delayed); INIT_LIST_HEAD(&server->layouts); INIT_LIST_HEAD(&server->state_owners_lru); INIT_LIST_HEAD(&server->ss_copies); INIT_LIST_HEAD(&server->ss_src_copies); atomic_set(&server->active, 0); atomic_long_set(&server->nr_active_delegations, 0); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { kfree(server); return NULL; } server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; init_waitqueue_head(&server->write_congestion_wait); atomic_long_set(&server->writeback, 0); atomic64_set(&server->owner_ctr, 0); pnfs_init_server(server); rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); return server; } EXPORT_SYMBOL_GPL(nfs_alloc_server); static void delayed_free(struct rcu_head *p) { struct nfs_server *server = container_of(p, struct nfs_server, rcu); nfs_free_iostats(server->io_stats); kfree(server); } /* * Free up a server record */ void nfs_free_server(struct nfs_server *server) { nfs_server_remove_lists(server); if (server->destroy != NULL) server->destroy(server); if (!IS_ERR(server->client_acl)) rpc_shutdown_client(server->client_acl); if (!IS_ERR(server->client)) rpc_shutdown_client(server->client); nfs_put_client(server->nfs_client); if (server->kobj.state_initialized) { nfs_sysfs_remove_server(server); kobject_put(&server->kobj); } ida_free(&s_sysfs_ids, server->s_sysfs_id); put_cred(server->cred); nfs_release_automount_timer(); call_rcu(&server->rcu, delayed_free); } 
EXPORT_SYMBOL_GPL(nfs_free_server); /* * Create a version 2 or 3 volume record * - keyed on server and FSID */ struct nfs_server *nfs_create_server(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_server *server; struct nfs_fattr *fattr; int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); server->cred = get_cred(fc->cred); error = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) goto error; /* Get a client representation */ error = nfs_init_server(server, fc); if (error < 0) goto error; /* Probe the root fh to retrieve its FSID */ error = nfs_probe_fsinfo(server, ctx->mntfh, fattr); if (error < 0) goto error; if (server->nfs_client->rpc_ops->version == 3) { if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) server->namelen = NFS3_MAXNAMLEN; if (!(ctx->flags & NFS_MOUNT_NORDIRPLUS)) server->caps |= NFS_CAP_READDIRPLUS; } else { if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } /* Linux 'subtree_check' borkenness mandates this setting */ server->fh_expire_type = NFS_FH_VOL_RENAME; if (!(fattr->valid & NFS_ATTR_FATTR)) { error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, fattr, NULL); if (error < 0) { dprintk("nfs_create_server: getattr error = %d\n", -error); goto error; } } memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); nfs_server_insert_lists(server); server->mount_time = jiffies; nfs_free_fattr(fattr); return server; error: nfs_free_fattr(fattr); nfs_free_server(server); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_create_server); /* * Clone an NFS2, NFS3 or NFS4 server record */ struct nfs_server *nfs_clone_server(struct nfs_server *source, struct nfs_fh *fh, struct nfs_fattr *fattr, rpc_authflavor_t flavor) { struct nfs_server *server; int error; server = nfs_alloc_server(); if (!server) return 
ERR_PTR(-ENOMEM); server->cred = get_cred(source->cred); /* Copy data from the source */ server->nfs_client = source->nfs_client; server->destroy = source->destroy; refcount_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); server->fsid = fattr->fsid; nfs_sysfs_add_server(server); nfs_sysfs_link_rpc_client(server, server->nfs_client->cl_rpcclient, "_state"); error = nfs_init_server_rpcclient(server, source->client->cl_timeout, flavor); if (error < 0) goto out_free_server; nfs_server_set_init_caps(server); /* probe the filesystem info for this server filesystem */ error = nfs_probe_server(server, fh); if (error < 0) goto out_free_server; if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; error = nfs_start_lockd(server); if (error < 0) goto out_free_server; nfs_server_insert_lists(server); server->mount_time = jiffies; return server; out_free_server: nfs_free_server(server); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_clone_server); void nfs_clients_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); INIT_LIST_HEAD(&nn->nfs_client_list); INIT_LIST_HEAD(&nn->nfs_volume_list); #if IS_ENABLED(CONFIG_NFS_V4) idr_init(&nn->cb_ident_idr); INIT_LIST_HEAD(&nn->nfs4_data_server_cache); spin_lock_init(&nn->nfs4_data_server_lock); #endif /* CONFIG_NFS_V4 */ spin_lock_init(&nn->nfs_client_lock); nn->boot_time = ktime_get_real(); memset(&nn->rpcstats, 0, sizeof(nn->rpcstats)); nn->rpcstats.program = &nfs_program; nfs_netns_sysfs_setup(nn, net); } void nfs_clients_exit(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); nfs_netns_sysfs_destroy(nn); nfs_cleanup_cb_ident_idr(net); WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); #if IS_ENABLED(CONFIG_NFS_V4) WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_cache)); #endif /* CONFIG_NFS_V4 */ } #ifdef CONFIG_PROC_FS static void *nfs_server_list_start(struct seq_file *p, 
loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_server_list_stop(struct seq_file *p, void *v); static int nfs_server_list_show(struct seq_file *m, void *v); static const struct seq_operations nfs_server_list_ops = { .start = nfs_server_list_start, .next = nfs_server_list_next, .stop = nfs_server_list_stop, .show = nfs_server_list_show, }; static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_volume_list_stop(struct seq_file *p, void *v); static int nfs_volume_list_show(struct seq_file *m, void *v); static const struct seq_operations nfs_volume_list_ops = { .start = nfs_volume_list_start, .next = nfs_volume_list_next, .stop = nfs_volume_list_stop, .show = nfs_volume_list_show, }; /* * set up the iterator to start reading from the server list and return the first item */ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); return seq_list_start_head(&nn->nfs_client_list, *_pos); } /* * move to next server */ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_client_list, pos); } /* * clean up after reading from the transports list */ static void nfs_server_list_stop(struct seq_file *p, void *v) __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } /* * display a header line followed by a load of call lines */ static int nfs_server_list_show(struct seq_file *m, void *v) { struct nfs_client *clp; struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_client_list) { 
seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); return 0; } /* display one transport per line on subsequent lines */ clp = list_entry(v, struct nfs_client, cl_share_link); /* Check if the client is initialized */ if (clp->cl_cons_state != NFS_CS_READY) return 0; rcu_read_lock(); seq_printf(m, "v%u %s %s %3d %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), refcount_read(&clp->cl_count), clp->cl_hostname); rcu_read_unlock(); return 0; } /* * set up the iterator to start reading from the volume list and return the first item */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); return seq_list_start_head(&nn->nfs_volume_list, *_pos); } /* * move to next volume */ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_volume_list, pos); } /* * clean up after reading from the transports list */ static void nfs_volume_list_stop(struct seq_file *p, void *v) __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } /* * display a header line followed by a load of call lines */ static int nfs_volume_list_show(struct seq_file *m, void *v) { struct nfs_server *server; struct nfs_client *clp; char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0' char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0' struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_volume_list) { seq_puts(m, "NV SERVER PORT DEV FSID" " FSC\n"); return 0; } /* display one transport per line on subsequent lines */ server = list_entry(v, struct nfs_server, 
master_link); clp = server->nfs_client; snprintf(dev, sizeof(dev), "%u:%u", MAJOR(server->s_dev), MINOR(server->s_dev)); snprintf(fsid, sizeof(fsid), "%llx:%llx", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); rcu_read_lock(); seq_printf(m, "v%u %s %s %-12s %-33s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), dev, fsid, nfs_server_fscache_state(server)); rcu_read_unlock(); return 0; } int nfs_fs_proc_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); struct proc_dir_entry *p; nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net); if (!nn->proc_nfsfs) goto error_0; /* a file of servers with which we're dealing */ p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs, &nfs_server_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; /* a file of volumes that we have mounted */ p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs, &nfs_volume_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; return 0; error_1: remove_proc_subtree("nfsfs", net->proc_net); error_0: return -ENOMEM; } void nfs_fs_proc_net_exit(struct net *net) { remove_proc_subtree("nfsfs", net->proc_net); } /* * initialise the /proc/fs/nfsfs/ directory */ int __init nfs_fs_proc_init(void) { if (!proc_mkdir("fs/nfsfs", NULL)) goto error_0; /* a file of servers with which we're dealing */ if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers")) goto error_1; /* a file of volumes that we have mounted */ if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes")) goto error_1; return 0; error_1: remove_proc_subtree("fs/nfsfs", NULL); error_0: return -ENOMEM; } /* * clean up the /proc/fs/nfsfs/ directory */ void nfs_fs_proc_exit(void) { remove_proc_subtree("fs/nfsfs", NULL); ida_destroy(&s_sysfs_ids); } #endif /* CONFIG_PROC_FS */
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 // SPDX-License-Identifier: GPL-2.0-or-later /* * DSA tagging protocol handling * * Copyright (c) 2008-2009 Marvell Semiconductor * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org> * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch> */ #include <linux/netdevice.h> #include <linux/ptp_classify.h> #include <linux/skbuff.h> #include <net/dsa.h> #include <net/dst_metadata.h> #include "tag.h" #include "user.h" static LIST_HEAD(dsa_tag_drivers_list); static DEFINE_MUTEX(dsa_tag_drivers_lock); /* Determine if we should defer delivery of skb until we have a rx timestamp. * * Called from dsa_switch_rcv. For now, this will only work if tagging is * enabled on the switch. Normally the MAC driver would retrieve the hardware * timestamp when it reads the packet out of the hardware. However in a DSA * switch, the DSA driver owning the interface to which the packet is * delivered is never notified unless we do so here. 
*/ static bool dsa_skb_defer_rx_timestamp(struct dsa_user_priv *p, struct sk_buff *skb) { struct dsa_switch *ds = p->dp->ds; unsigned int type; if (!ds->ops->port_rxtstamp) return false; if (skb_headroom(skb) < ETH_HLEN) return false; __skb_push(skb, ETH_HLEN); type = ptp_classify_raw(skb); __skb_pull(skb, ETH_HLEN); if (type == PTP_CLASS_NONE) return false; return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type); } static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *unused) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dsa_port *cpu_dp = dev->dsa_ptr; struct sk_buff *nskb = NULL; struct dsa_user_priv *p; if (unlikely(!cpu_dp)) { kfree_skb(skb); return 0; } skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) return 0; if (md_dst && md_dst->type == METADATA_HW_PORT_MUX) { unsigned int port = md_dst->u.port_info.port_id; skb_dst_drop(skb); if (!skb_has_extensions(skb)) skb->slow_gro = 0; skb->dev = dsa_conduit_find_user(dev, 0, port); if (likely(skb->dev)) { dsa_default_offload_fwd_mark(skb); nskb = skb; } } else { nskb = cpu_dp->rcv(skb, dev); } if (!nskb) { kfree_skb(skb); return 0; } skb = nskb; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); if (unlikely(!dsa_user_dev_check(skb->dev))) { /* Packet is to be injected directly on an upper * device, e.g. a team/bond, so skip all DSA-port * specific actions. 
*/ netif_rx(skb); return 0; } p = netdev_priv(skb->dev); if (unlikely(cpu_dp->ds->untag_bridge_pvid || cpu_dp->ds->untag_vlan_aware_bridge_pvid)) { nskb = dsa_software_vlan_untag(skb); if (!nskb) { kfree_skb(skb); return 0; } skb = nskb; } dev_sw_netstats_rx_add(skb->dev, skb->len + ETH_HLEN); if (dsa_skb_defer_rx_timestamp(p, skb)) return 0; gro_cells_receive(&p->gcells, skb); return 0; } struct packet_type dsa_pack_type __read_mostly = { .type = cpu_to_be16(ETH_P_XDSA), .func = dsa_switch_rcv, }; static void dsa_tag_driver_register(struct dsa_tag_driver *dsa_tag_driver, struct module *owner) { dsa_tag_driver->owner = owner; mutex_lock(&dsa_tag_drivers_lock); list_add_tail(&dsa_tag_driver->list, &dsa_tag_drivers_list); mutex_unlock(&dsa_tag_drivers_lock); } void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], unsigned int count, struct module *owner) { unsigned int i; for (i = 0; i < count; i++) dsa_tag_driver_register(dsa_tag_driver_array[i], owner); } static void dsa_tag_driver_unregister(struct dsa_tag_driver *dsa_tag_driver) { mutex_lock(&dsa_tag_drivers_lock); list_del(&dsa_tag_driver->list); mutex_unlock(&dsa_tag_drivers_lock); } EXPORT_SYMBOL_GPL(dsa_tag_drivers_register); void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], unsigned int count) { unsigned int i; for (i = 0; i < count; i++) dsa_tag_driver_unregister(dsa_tag_driver_array[i]); } EXPORT_SYMBOL_GPL(dsa_tag_drivers_unregister); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) { return ops->name; }; /* Function takes a reference on the module owning the tagger, * so dsa_tag_driver_put must be called afterwards. 
*/ const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name) { const struct dsa_device_ops *ops = ERR_PTR(-ENOPROTOOPT); struct dsa_tag_driver *dsa_tag_driver; request_module("%s%s", DSA_TAG_DRIVER_ALIAS, name); mutex_lock(&dsa_tag_drivers_lock); list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { const struct dsa_device_ops *tmp = dsa_tag_driver->ops; if (strcmp(name, tmp->name)) continue; if (!try_module_get(dsa_tag_driver->owner)) break; ops = tmp; break; } mutex_unlock(&dsa_tag_drivers_lock); return ops; } const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol) { struct dsa_tag_driver *dsa_tag_driver; const struct dsa_device_ops *ops; bool found = false; request_module("%sid-%d", DSA_TAG_DRIVER_ALIAS, tag_protocol); mutex_lock(&dsa_tag_drivers_lock); list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { ops = dsa_tag_driver->ops; if (ops->proto == tag_protocol) { found = true; break; } } if (found) { if (!try_module_get(dsa_tag_driver->owner)) ops = ERR_PTR(-ENOPROTOOPT); } else { ops = ERR_PTR(-ENOPROTOOPT); } mutex_unlock(&dsa_tag_drivers_lock); return ops; } void dsa_tag_driver_put(const struct dsa_device_ops *ops) { struct dsa_tag_driver *dsa_tag_driver; mutex_lock(&dsa_tag_drivers_lock); list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { if (dsa_tag_driver->ops == ops) { module_put(dsa_tag_driver->owner); break; } } mutex_unlock(&dsa_tag_drivers_lock); }
377 30 231 528 190 621 165 167 316 206 166 167 167 167 393 417 416 415 316 206 364 363 76 415 416 111 167 167 167 164 167 63 166 199 190 22 167 166 165 167 167 167 167 167 167 167 64 166 166 165 82 166 24 24 63 63 62 63 63 23 23 23 23 23 22 158 158 156 158 158 354 356 354 54 352 527 457 513 157 511 514 527 527 527 525 528 528 166 166 526 165 514 23 23 514 158 512 205 184 42 12 207 66 451 452 364 451 452 452 219 39 176 188 1 2 4 4 12 12 4 12 12 12 386 207 174 146 147 207 206 207 224 207 6 134 115 134 2 134 10 135 20 347 349 146 205 2 2 121 3 2 135 4 12 3 12 9 5 130 105 8 129 186 119 190 15 1539 1 1546 1539 2066 2068 289 2064 2064 1201 2041 2054 913 938 706 952 1149 694 1246 68 68 206 1715 1682 16 1676 1678 329 1 4 1544 1539 1538 237 237 107 199 228 228 62 21 141 317 256 234 183 256 221 5 256 5 251 28 227 18 236 27 235 234 2 106 218 65 65 65 65 62 58 61 61 61 61 61 61 36 33 20 20 16 16 20 20 4 20 20 19 20 20 20 20 20 20 16 16 19 20 316 316 297 315 316 315 315 316 316 316 316 316 195 188 315 1 316 316 82 316 12 7 8 12 12 12 12 12 12 12 20 15 14 28 7 14 14 16 13 11 2 9 14 29 33 29 30 57 57 58 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 
750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 
1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 
1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 
2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 
2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet * & Swedish University of Agricultural Sciences. * * Jens Laas <jens.laas@data.slu.se> Swedish University of * Agricultural Sciences. * * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet * * This work is based on the LPC-trie which is originally described in: * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. * https://www.csc.kth.se/~snilsson/software/dyntrie2/ * * IP-address lookup using LC-tries. 
Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 * * Code from fib_hash has been reused which includes the following header: * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 FIB: lookup engine and maintenance routines. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Substantial contributions to this work comes from: * * David S. Miller, <davem@davemloft.net> * Stephen Hemminger <shemminger@osdl.org> * Paul E. McKenney <paulmck@us.ibm.com> * Patrick McHardy <kaber@trash.net> */ #include <linux/cache.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/rcupdate_wait.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/notifier.h> #include <net/net_namespace.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/fib_notifier.h> #include <trace/events/fib.h> #include "fib_lookup.h" static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa, struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, .dscp = fa->fa_dscp, .type = fa->fa_type, 
.tb_id = fa->tb_id, }; return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa, struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, .dscp = fa->fa_dscp, .type = fa->fa_type, .tb_id = fa->tb_id, }; return call_fib4_notifiers(net, event_type, &info.info); } #define MAX_STAT_DEPTH 32 #define KEYLENGTH (8*sizeof(t_key)) #define KEY_MAX ((t_key)~0) typedef unsigned int t_key; #define IS_TRIE(n) ((n)->pos >= KEYLENGTH) #define IS_TNODE(n) ((n)->bits) #define IS_LEAF(n) (!(n)->bits) struct key_vector { t_key key; unsigned char pos; /* 2log(KEYLENGTH) bits needed */ unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char slen; union { /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ struct hlist_head leaf; /* This array is valid if (pos | bits) > 0 (TNODE) */ DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode); }; }; struct tnode { struct rcu_head rcu; t_key empty_children; /* KEYLENGTH bits needed */ t_key full_children; /* KEYLENGTH bits needed */ struct key_vector __rcu *parent; struct key_vector kv[1]; #define tn_bits kv[0].bits }; #define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n]) #define LEAF_SIZE TNODE_SIZE(1) #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats { unsigned int gets; unsigned int backtrack; unsigned int semantic_match_passed; unsigned int semantic_match_miss; unsigned int null_node_hit; unsigned int resize_node_skipped; }; #endif struct trie_stat { unsigned int totdepth; unsigned int maxdepth; unsigned int tnodes; unsigned int leaves; unsigned int nullpointers; unsigned int prefixes; unsigned int nodesizes[MAX_STAT_DEPTH]; }; struct trie { struct key_vector kv[1]; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats; #endif }; static struct key_vector *resize(struct trie *t, 
struct key_vector *tn); static unsigned int tnode_free_size; /* * synchronize_rcu after call_rcu for outstanding dirty memory; it should be * especially useful before resizing the root node with PREEMPT_NONE configs; * the value was obtained experimentally, aiming to avoid visible slowdown. */ unsigned int sysctl_fib_sync_mem = 512 * 1024; unsigned int sysctl_fib_sync_mem_min = 64 * 1024; unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024; static struct kmem_cache *fn_alias_kmem __ro_after_init; static struct kmem_cache *trie_leaf_kmem __ro_after_init; static inline struct tnode *tn_info(struct key_vector *kv) { return container_of(kv, struct tnode, kv[0]); } /* caller must hold RTNL */ #define node_parent(tn) rtnl_dereference(tn_info(tn)->parent) #define get_child(tn, i) rtnl_dereference((tn)->tnode[i]) /* caller must hold RCU read lock or RTNL */ #define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent) #define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i]) /* wrapper for rcu_assign_pointer */ static inline void node_set_parent(struct key_vector *n, struct key_vector *tp) { if (n) rcu_assign_pointer(tn_info(n)->parent, tp); } #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p) /* This provides us with the number of children in this node, in the case of a * leaf this will return 0 meaning none of the children are accessible. */ static inline unsigned long child_length(const struct key_vector *tn) { return (1ul << tn->bits) & ~(1ul); } #define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos) static inline unsigned long get_index(t_key key, struct key_vector *kv) { unsigned long index = key ^ kv->key; if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos)) return 0; return index >> kv->pos; } /* To understand this stuff, an understanding of keys and all their bits is * necessary. Every node in the trie has a key associated with it, but not * all of the bits in that key are significant. 
* * Consider a node 'n' and its parent 'tp'. * * If n is a leaf, every bit in its key is significant. Its presence is * necessitated by path compression, since during a tree traversal (when * searching for a leaf - unless we are doing an insertion) we will completely * ignore all skipped bits we encounter. Thus we need to verify, at the end of * a potentially successful search, that we have indeed been walking the * correct key path. * * Note that we can never "miss" the correct key in the tree if present by * following the wrong path. Path compression ensures that segments of the key * that are the same for all keys with a given prefix are skipped, but the * skipped part *is* identical for each node in the subtrie below the skipped * bit! trie_insert() in this implementation takes care of that. * * if n is an internal node - a 'tnode' here, the various parts of its key * have many different meanings. * * Example: * _________________________________________________________________ * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | * ----------------------------------------------------------------- * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 * * _________________________________________________________________ * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | * ----------------------------------------------------------------- * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 * * tp->pos = 22 * tp->bits = 3 * n->pos = 13 * n->bits = 4 * * First, let's just ignore the bits that come before the parent tp, that is * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this * point we do not use them for anything. * * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the * index into the parent's child array. That is, they will be used to find * 'n' among tp's children. * * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits * for the node n. 
* * All the bits we have seen so far are significant to the node n. The rest * of the bits are really not needed or indeed known in n->key. * * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into * n's child array, and will of course be different for each child. * * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown * at this point. */ static const int halve_threshold = 25; static const int inflate_threshold = 50; static const int halve_threshold_root = 15; static const int inflate_threshold_root = 30; static inline void alias_free_mem_rcu(struct fib_alias *fa) { kfree_rcu(fa, rcu); } #define TNODE_VMALLOC_MAX \ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *)) static void __node_free_rcu(struct rcu_head *head) { struct tnode *n = container_of(head, struct tnode, rcu); if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); else kvfree(n); } #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) static struct tnode *tnode_alloc(int bits) { size_t size; /* verify bits is within bounds */ if (bits > TNODE_VMALLOC_MAX) return NULL; /* determine size and verify it is non-zero and didn't overflow */ size = TNODE_SIZE(1ul << bits); if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else return vzalloc(size); } static inline void empty_child_inc(struct key_vector *n) { tn_info(n)->empty_children++; if (!tn_info(n)->empty_children) tn_info(n)->full_children++; } static inline void empty_child_dec(struct key_vector *n) { if (!tn_info(n)->empty_children) tn_info(n)->full_children--; tn_info(n)->empty_children--; } static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { struct key_vector *l; struct tnode *kv; kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (!kv) return NULL; /* initialize key vector */ l = kv->kv; l->key = key; l->pos = 0; l->bits = 0; l->slen = fa->fa_slen; /* link leaf to fib alias */ INIT_HLIST_HEAD(&l->leaf); hlist_add_head(&fa->fa_list, &l->leaf); return l; } 
static struct key_vector *tnode_new(t_key key, int pos, int bits) { unsigned int shift = pos + bits; struct key_vector *tn; struct tnode *tnode; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); tnode = tnode_alloc(bits); if (!tnode) return NULL; pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), sizeof(struct key_vector *) << bits); if (bits == KEYLENGTH) tnode->full_children = 1; else tnode->empty_children = 1ul << bits; tn = tnode->kv; tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; tn->pos = pos; tn->bits = bits; tn->slen = pos; return tn; } /* Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p. 6 */ static inline int tnode_full(struct key_vector *tn, struct key_vector *n) { return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); } /* Add a child at position i overwriting the old value. * Update the value of full_children and empty_children. */ static void put_child(struct key_vector *tn, unsigned long i, struct key_vector *n) { struct key_vector *chi = get_child(tn, i); int isfull, wasfull; BUG_ON(i >= child_length(tn)); /* update emptyChildren, overflow into fullChildren */ if (!n && chi) empty_child_inc(tn); if (n && !chi) empty_child_dec(tn); /* update fullChildren */ wasfull = tnode_full(tn, chi); isfull = tnode_full(tn, n); if (wasfull && !isfull) tn_info(tn)->full_children--; else if (!wasfull && isfull) tn_info(tn)->full_children++; if (n && (tn->slen < n->slen)) tn->slen = n->slen; rcu_assign_pointer(tn->tnode[i], n); } static void update_children(struct key_vector *tn) { unsigned long i; /* update all of the child parent pointers */ for (i = child_length(tn); i;) { struct key_vector *inode = get_child(tn, --i); if (!inode) continue; /* Either update the children of a tnode that * already belongs to us or update the child * to point to ourselves. 
*/ if (node_parent(inode) == tn) update_children(inode); else node_set_parent(inode, tn); } } static inline void put_child_root(struct key_vector *tp, t_key key, struct key_vector *n) { if (IS_TRIE(tp)) rcu_assign_pointer(tp->tnode[0], n); else put_child(tp, get_index(key, tp), n); } static inline void tnode_free_init(struct key_vector *tn) { tn_info(tn)->rcu.next = NULL; } static inline void tnode_free_append(struct key_vector *tn, struct key_vector *n) { tn_info(n)->rcu.next = tn_info(tn)->rcu.next; tn_info(tn)->rcu.next = &tn_info(n)->rcu; } static void tnode_free(struct key_vector *tn) { struct callback_head *head = &tn_info(tn)->rcu; while (head) { head = head->next; tnode_free_size += TNODE_SIZE(1ul << tn->bits); node_free(tn); tn = container_of(head, struct tnode, rcu)->kv; } if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) { tnode_free_size = 0; synchronize_net(); } } static struct key_vector *replace(struct trie *t, struct key_vector *oldtnode, struct key_vector *tn) { struct key_vector *tp = node_parent(oldtnode); unsigned long i; /* setup the parent pointer out of and back into this node */ NODE_INIT_PARENT(tn, tp); put_child_root(tp, tn->key, tn); /* update all of the child parent pointers */ update_children(tn); /* all pointers should be clean so we are done */ tnode_free(oldtnode); /* resize children now that oldtnode is freed */ for (i = child_length(tn); i;) { struct key_vector *inode = get_child(tn, --i); /* resize child node */ if (tnode_full(tn, inode)) tn = resize(t, inode); } return tp; } static struct key_vector *inflate(struct trie *t, struct key_vector *oldtnode) { struct key_vector *tn; unsigned long i; t_key m; pr_debug("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1); if (!tn) goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); /* Assemble all of the pointers in our cluster, in this case that * represents all of the pointers out of our allocated nodes that * point to 
existing tnodes and the links between our allocated * nodes. */ for (i = child_length(oldtnode), m = 1u << tn->pos; i;) { struct key_vector *inode = get_child(oldtnode, --i); struct key_vector *node0, *node1; unsigned long j, k; /* An empty child */ if (!inode) continue; /* A leaf or an internal node with skipped bits */ if (!tnode_full(oldtnode, inode)) { put_child(tn, get_index(inode->key, tn), inode); continue; } /* drop the node in the old tnode free list */ tnode_free_append(oldtnode, inode); /* An internal node with two children */ if (inode->bits == 1) { put_child(tn, 2 * i + 1, get_child(inode, 1)); put_child(tn, 2 * i, get_child(inode, 0)); continue; } /* We will replace this node 'inode' with two new * ones, 'node0' and 'node1', each with half of the * original children. The two new nodes will have * a position one bit further down the key and this * means that the "significant" part of their keys * (see the discussion near the top of this file) * will differ by one bit, which will be "0" in * node0's key and "1" in node1's key. Since we are * moving the key position by one step, the bit that * we are moving away from - the bit at position * (tn->pos) - is the one that will differ between * node0 and node1. So... we synthesize that bit in the * two new keys. 
*/ node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1); if (!node1) goto nomem; node0 = tnode_new(inode->key, inode->pos, inode->bits - 1); tnode_free_append(tn, node1); if (!node0) goto nomem; tnode_free_append(tn, node0); /* populate child pointers in new nodes */ for (k = child_length(inode), j = k / 2; j;) { put_child(node1, --j, get_child(inode, --k)); put_child(node0, j, get_child(inode, j)); put_child(node1, --j, get_child(inode, --k)); put_child(node0, j, get_child(inode, j)); } /* link new nodes to parent */ NODE_INIT_PARENT(node1, tn); NODE_INIT_PARENT(node0, tn); /* link parent to nodes */ put_child(tn, 2 * i + 1, node1); put_child(tn, 2 * i, node0); } /* setup the parent pointers into and out of this node */ return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); notnode: return NULL; } static struct key_vector *halve(struct trie *t, struct key_vector *oldtnode) { struct key_vector *tn; unsigned long i; pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1); if (!tn) goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); /* Assemble all of the pointers in our cluster, in this case that * represents all of the pointers out of our allocated nodes that * point to existing tnodes and the links between our allocated * nodes. */ for (i = child_length(oldtnode); i;) { struct key_vector *node1 = get_child(oldtnode, --i); struct key_vector *node0 = get_child(oldtnode, --i); struct key_vector *inode; /* At least one of the children is empty */ if (!node1 || !node0) { put_child(tn, i / 2, node1 ? 
: node0); continue; } /* Two nonempty children */ inode = tnode_new(node0->key, oldtnode->pos, 1); if (!inode) goto nomem; tnode_free_append(tn, inode); /* initialize pointers out of node */ put_child(inode, 1, node1); put_child(inode, 0, node0); NODE_INIT_PARENT(inode, tn); /* link parent to node */ put_child(tn, i / 2, inode); } /* setup the parent pointers into and out of this node */ return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); notnode: return NULL; } static struct key_vector *collapse(struct trie *t, struct key_vector *oldtnode) { struct key_vector *n, *tp; unsigned long i; /* scan the tnode looking for that one child that might still exist */ for (n = NULL, i = child_length(oldtnode); !n && i;) n = get_child(oldtnode, --i); /* compress one level */ tp = node_parent(oldtnode); put_child_root(tp, oldtnode->key, n); node_set_parent(n, tp); /* drop dead node */ node_free(oldtnode); return tp; } static unsigned char update_suffix(struct key_vector *tn) { unsigned char slen = tn->pos; unsigned long stride, i; unsigned char slen_max; /* only vector 0 can have a suffix length greater than or equal to * tn->pos + tn->bits, the second highest node will have a suffix * length at most of tn->pos + tn->bits - 1 */ slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen); /* search though the list of children looking for nodes that might * have a suffix greater than the one we currently have. 
This is * why we start with a stride of 2 since a stride of 1 would * represent the nodes with suffix length equal to tn->pos */ for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) { struct key_vector *n = get_child(tn, i); if (!n || (n->slen <= slen)) continue; /* update stride and slen based on new value */ stride <<= (n->slen - slen); slen = n->slen; i &= ~(stride - 1); /* stop searching if we have hit the maximum possible value */ if (slen >= slen_max) break; } tn->slen = slen; return slen; } /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of * the Helsinki University of Technology and Matti Tikkanen of Nokia * Telecommunications, page 6: * "A node is doubled if the ratio of non-empty children to all * children in the *doubled* node is at least 'high'." * * 'high' in this instance is the variable 'inflate_threshold'. It * is expressed as a percentage, so we multiply it with * child_length() and instead of multiplying by 2 (since the * child array will be doubled by inflate()) and multiplying * the left-hand side by 100 (to handle the percentage thing) we * multiply the left-hand side by 50. * * The left-hand side may look a bit weird: child_length(tn) * - tn->empty_children is of course the number of non-null children * in the current node. tn->full_children is the number of "full" * children, that is non-null tnodes with a skip value of 0. * All of those will be doubled in the resulting inflated tnode, so * we just count them one extra time here. * * A clearer way to write this would be: * * to_be_doubled = tn->full_children; * not_to_be_doubled = child_length(tn) - tn->empty_children - * tn->full_children; * * new_child_length = child_length(tn) * 2; * * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / * new_child_length; * if (new_fill_factor >= inflate_threshold) * * ...and so on, tho it would mess up the while () loop. 
* * anyway, * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= * inflate_threshold * * avoid a division: * 100 * (not_to_be_doubled + 2*to_be_doubled) >= * inflate_threshold * new_child_length * * expand not_to_be_doubled and to_be_doubled, and shorten: * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= * inflate_threshold * child_length(tn) * 2 * * shorten again: * 50 * (tn->full_children + child_length(tn) - * tn->empty_children) >= inflate_threshold * * child_length(tn) * */ static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn) { unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold; used -= tn_info(tn)->empty_children; used += tn_info(tn)->full_children; /* if bits == KEYLENGTH then pos = 0, and will fail below */ return (used > 1) && tn->pos && ((50 * used) >= threshold); } static inline bool should_halve(struct key_vector *tp, struct key_vector *tn) { unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ threshold *= IS_TRIE(tp) ? 
halve_threshold_root : halve_threshold; used -= tn_info(tn)->empty_children; /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold); } static inline bool should_collapse(struct key_vector *tn) { unsigned long used = child_length(tn); used -= tn_info(tn)->empty_children; /* account for bits == KEYLENGTH case */ if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children) used -= KEY_MAX; /* One child or none, time to drop us from the trie */ return used < 2; } #define MAX_WORK 10 static struct key_vector *resize(struct trie *t, struct key_vector *tn) { #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif struct key_vector *tp = node_parent(tn); unsigned long cindex = get_index(tn->key, tp); int max_work = MAX_WORK; pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", tn, inflate_threshold, halve_threshold); /* track the tnode via the pointer from the parent instead of * doing it ourselves. This way we can let RCU fully do its * thing without us interfering */ BUG_ON(tn != get_child(tp, cindex)); /* Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ while (should_inflate(tp, tn) && max_work) { tp = inflate(t, tn); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; tn = get_child(tp, cindex); } /* update parent in case inflate failed */ tp = node_parent(tn); /* Return if at least one inflate is run */ if (max_work != MAX_WORK) return tp; /* Halve as long as the number of empty children in this * node is above threshold. 
*/ while (should_halve(tp, tn) && max_work) { tp = halve(t, tn); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; tn = get_child(tp, cindex); } /* Only one child remains */ if (should_collapse(tn)) return collapse(t, tn); /* update parent in case halve failed */ return node_parent(tn); } static void node_pull_suffix(struct key_vector *tn, unsigned char slen) { unsigned char node_slen = tn->slen; while ((node_slen > tn->pos) && (node_slen > slen)) { slen = update_suffix(tn); if (node_slen == slen) break; tn = node_parent(tn); node_slen = tn->slen; } } static void node_push_suffix(struct key_vector *tn, unsigned char slen) { while (tn->slen < slen) { tn->slen = slen; tn = node_parent(tn); } } /* rcu_read_lock needs to be hold by caller from readside */ static struct key_vector *fib_find_node(struct trie *t, struct key_vector **tp, u32 key) { struct key_vector *pn, *n = t->kv; unsigned long index = 0; do { pn = n; n = get_child_rcu(n, index); if (!n) break; index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the bits in the cindex. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex * * This check is safe even if bits == KEYLENGTH due to the * fact that we can only allocate a node with 32 bits if a * long is greater than 32 bits. */ if (index >= (1ul << n->bits)) { n = NULL; break; } /* keep searching until we find a perfect match leaf or NULL */ } while (IS_TNODE(n)); *tp = pn; return n; } /* Return the first fib alias matching DSCP with * priority less than or equal to PRIO. * If 'find_first' is set, return the first matching * fib alias, regardless of DSCP and priority. 
*/ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, dscp_t dscp, u32 prio, u32 tb_id, bool find_first) { struct fib_alias *fa; if (!fah) return NULL; hlist_for_each_entry(fa, fah, fa_list) { /* Avoid Sparse warning when using dscp_t in inequalities */ u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp); u8 __dscp = inet_dscp_to_dsfield(dscp); if (fa->fa_slen < slen) continue; if (fa->fa_slen != slen) break; if (fa->tb_id > tb_id) continue; if (fa->tb_id != tb_id) break; if (find_first) return fa; if (__fa_dscp > __dscp) continue; if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp) return fa; } return NULL; } static struct fib_alias * fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) { u8 slen = KEYLENGTH - fri->dst_len; struct key_vector *l, *tp; struct fib_table *tb; struct fib_alias *fa; struct trie *t; tb = fib_get_table(net, fri->tb_id); if (!tb) return NULL; t = (struct trie *)tb->tb_data; l = fib_find_node(t, &tp, be32_to_cpu(fri->dst)); if (!l) return NULL; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { if (fa->fa_slen == slen && fa->tb_id == fri->tb_id && fa->fa_dscp == fri->dscp && fa->fa_info == fri->fi && fa->fa_type == fri->type) return fa; } return NULL; } void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) { u8 fib_notify_on_flag_change; struct fib_alias *fa_match; struct sk_buff *skb; int err; rcu_read_lock(); fa_match = fib_find_matching_alias(net, fri); if (!fa_match) goto out; /* These are paired with the WRITE_ONCE() happening in this function. * The reason is that we are only protected by RCU at this point. 
*/ if (READ_ONCE(fa_match->offload) == fri->offload && READ_ONCE(fa_match->trap) == fri->trap && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload, fri->offload); WRITE_ONCE(fa_match->trap, fri->trap); fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change); /* 2 means send notifications only if offload_failed was changed. */ if (fib_notify_on_flag_change == 2 && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload_failed, fri->offload_failed); if (!fib_notify_on_flag_change) goto out; skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC); if (!skb) { err = -ENOBUFS; goto errout; } err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC); goto out; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err); out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set); static void trie_rebalance(struct trie *t, struct key_vector *tn) { while (!IS_TRIE(tn)) tn = resize(t, tn); } static int fib_insert_node(struct trie *t, struct key_vector *tp, struct fib_alias *new, t_key key) { struct key_vector *n, *l; l = leaf_new(key, new); if (!l) goto noleaf; /* retrieve child from parent node */ n = get_child(tp, get_index(key, tp)); /* Case 2: n is a LEAF or a TNODE and the key doesn't match. 
* * Add a new tnode here * first tnode need some special handling * leaves us in position for handling as case 3 */ if (n) { struct key_vector *tn; tn = tnode_new(key, __fls(key ^ n->key), 1); if (!tn) goto notnode; /* initialize routes out of node */ NODE_INIT_PARENT(tn, tp); put_child(tn, get_index(key, tn) ^ 1, n); /* start adding routes into the node */ put_child_root(tp, key, tn); node_set_parent(n, tn); /* parent now has a NULL spot where the leaf can go */ tp = tn; } /* Case 3: n is NULL, and will just insert a new leaf */ node_push_suffix(tp, new->fa_slen); NODE_INIT_PARENT(l, tp); put_child_root(tp, key, l); trie_rebalance(t, tp); return 0; notnode: node_free(l); noleaf: return -ENOMEM; } static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) { if (!l) return fib_insert_node(t, tp, new, key); if (fa) { hlist_add_before_rcu(&new->fa_list, &fa->fa_list); } else { struct fib_alias *last; hlist_for_each_entry(last, &l->leaf, fa_list) { if (new->fa_slen < last->fa_slen) break; if ((new->fa_slen == last->fa_slen) && (new->tb_id > last->tb_id)) break; fa = last; } if (fa) hlist_add_behind_rcu(&new->fa_list, &fa->fa_list); else hlist_add_head_rcu(&new->fa_list, &l->leaf); } /* if we added to the tail node then we need to update slen */ if (l->slen < new->fa_slen) { l->slen = new->fa_slen; node_push_suffix(tp, new->fa_slen); } return 0; } static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); /* Caller must hold RTNL. 
*/
/* Insert the route described by cfg into table tb.
 * Returns 0 on success, otherwise a negative errno: the error from
 * fib_create_info(), -EEXIST, -ENOENT, -ENOBUFS, or an error reported
 * by the fib entry notifiers.
 */
int fib_table_insert(struct net *net, struct fib_table *tb,
		     struct fib_config *cfg, struct netlink_ext_ack *extack)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct key_vector *l, *tp;
	u16 nlflags = NLM_F_EXCL;
	struct fib_info *fi;
	u8 plen = cfg->fc_dst_len;
	u8 slen = KEYLENGTH - plen;
	dscp_t dscp;
	u32 key;
	int err;

	key = ntohl(cfg->fc_dst);

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	fi = fib_create_info(cfg, extack);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	dscp = cfg->fc_dscp;
	/* l is NULL when no leaf exists yet for this key */
	l = fib_find_node(t, &tp, key);
	fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority,
				tb->tb_id, false) : NULL;

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,dscp,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the tail of the section matching the suffix length
	 * of the new alias.
	 */

	if (fa && fa->fa_dscp == dscp &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;

		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		nlflags &= ~NLM_F_EXCL;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 *    duplicate routes
		 * 2. 
Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		hlist_for_each_entry_from(fa, fa_list) {
			if ((fa->fa_slen != slen) ||
			    (fa->tb_id != tb->tb_id) ||
			    (fa->fa_dscp != dscp))
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			struct fib_info *fi_drop;
			u8 state;

			nlflags |= NLM_F_REPLACE;
			fa = fa_first;
			if (fa_match) {
				/* an identical route exists: succeed only if
				 * it is the very entry that would be replaced,
				 * otherwise err is still -EEXIST
				 */
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (!new_fa)
				goto out;

			/* build the replacement from the old alias and the
			 * new fib_info; hardware flags start out cleared
			 */
			fi_drop = fa->fa_info;
			new_fa->fa_dscp = fa->fa_dscp;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			state = READ_ONCE(fa->fa_state);
			new_fa->fa_state = state & ~FA_S_ACCESSED;
			new_fa->fa_slen = fa->fa_slen;
			new_fa->tb_id = tb->tb_id;
			new_fa->fa_default = -1;
			new_fa->offload = 0;
			new_fa->trap = 0;
			new_fa->offload_failed = 0;

			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);

			/* notify only when new_fa is the first alias for this
			 * suffix length and table
			 */
			if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
					   tb->tb_id, true) == new_fa) {
				enum fib_event_type fib_event;

				fib_event = FIB_EVENT_ENTRY_REPLACE;
				err = call_fib_entry_notifiers(net, fib_event,
							       key, plen,
							       new_fa, extack);
				if (err) {
					/* put the old alias back in the list */
					hlist_replace_rcu(&new_fa->fa_list,
							  &fa->fa_list);
					goto out_free_new_fa;
				}
			}

			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				  tb->tb_id, &cfg->fc_nlinfo, nlflags);

			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
*/ if (fa_match) goto out; if (cfg->fc_nlflags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; else fa = fa_first; } err = -ENOENT; if (!(cfg->fc_nlflags & NLM_F_CREATE)) goto out; nlflags |= NLM_F_CREATE; err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; new_fa->fa_info = fi; new_fa->fa_dscp = dscp; new_fa->fa_type = cfg->fc_type; new_fa->fa_state = 0; new_fa->fa_slen = slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; new_fa->offload_failed = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) goto out_free_new_fa; /* The alias was already inserted, so the node must exist. */ l = l ? l : fib_find_node(t, &tp, key); if (WARN_ON_ONCE(!l)) { err = -ENOENT; goto out_free_new_fa; } if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) goto out_remove_new_fa; } if (!plen) tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: return 0; out_remove_new_fa: fib_remove_alias(t, tp, l, new_fa); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: fib_release_info(fi); err: return err; } static inline t_key prefix_mismatch(t_key key, struct key_vector *n) { t_key prefix = n->key; return (key ^ prefix) & (prefix | -prefix); } bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, const struct flowi4 *flp) { if (nhc->nhc_flags & RTNH_F_DEAD) return false; if (ip_ignore_linkdown(nhc->nhc_dev) && nhc->nhc_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) return false; if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif) return false; return true; } /* should be called with rcu_read_lock */ int fib_table_lookup(struct 
fib_table *tb, const struct flowi4 *flp,
		     struct fib_result *res, int fib_flags)
{
	struct trie *t = (struct trie *) tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
	/* destination address in host byte order is the lookup key */
	const t_key key = ntohl(flp->daddr);
	struct key_vector *n, *pn;
	struct fib_alias *fa;
	unsigned long index;
	t_key cindex;

	pn = t->kv;
	cindex = 0;

	/* an empty first slot means there is nothing to look up */
	n = get_child_rcu(pn, cindex);
	if (!n) {
		trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
		return -EAGAIN;
	}

#ifdef CONFIG_IP_FIB_TRIE_STATS
	this_cpu_inc(stats->gets);
#endif

	/* Step 1: Travel to the longest prefix match in the trie */
	for (;;) {
		index = get_cindex(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if (index >= (1ul << bits))
		 *     we have a mismatch in skip bits and failed
		 *   else
		 *     we know the value is cindex
		 *
		 * This check is safe even if bits == KEYLENGTH due to the
		 * fact that we can only allocate a node with 32 bits if a
		 * long is greater than 32 bits.
		 */
		if (index >= (1ul << n->bits))
			break;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
			goto found;

		/* only record pn and cindex if we are going to be chopping
		 * bits later.  Otherwise we are just wasting cycles.
		 */
		if (n->slen > n->pos) {
			pn = n;
			cindex = index;
		}

		n = get_child_rcu(n, index);
		if (unlikely(!n))
			goto backtrace;
	}

	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
	for (;;) {
		/* record the pointer where our next node pointer is stored */
		struct key_vector __rcu **cptr = n->tnode;

		/* This test verifies that none of the bits that differ
		 * between the key and the prefix exist in the region of
		 * the lsb and higher in the prefix.
*/
		if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
			goto backtrace;

		/* exit out and process leaf */
		if (unlikely(IS_LEAF(n)))
			break;

		/* Don't bother recording parent info.  Since we are in
		 * prefix match mode we will have to come back to wherever
		 * we started this traversal anyway
		 */

		while ((n = rcu_dereference(*cptr)) == NULL) {
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
			/* n may be non-NULL here when entered via goto, so
			 * only count genuine NULL child hits
			 */
			if (!n)
				this_cpu_inc(stats->null_node_hit);
#endif
			/* If we are at cindex 0 there are no more bits for
			 * us to strip at this level so we must ascend back
			 * up one level to see if there are any more bits to
			 * be stripped there.
			 */
			while (!cindex) {
				t_key pkey = pn->key;

				/* If we don't have a parent then there is
				 * nothing for us to do as we do not have any
				 * further nodes to parse.
				 */
				if (IS_TRIE(pn)) {
					trace_fib_table_lookup(tb->tb_id, flp,
							       NULL, -EAGAIN);
					return -EAGAIN;
				}
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->backtrack);
#endif
				/* Get Child's index */
				pn = node_parent_rcu(pn);
				cindex = get_index(pkey, pn);
			}

			/* strip the least significant bit from the cindex */
			cindex &= cindex - 1;

			/* grab pointer for next child node */
			cptr = &pn->tnode[cindex];
		}
	}

found:
	/* this line carries forward the xor from earlier in the function */
	index = key ^ n->key;

	/* Step 3: Process the leaf, if that fails fall back to backtracing */
	hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
		struct fib_info *fi = fa->fa_info;
		struct fib_nh_common *nhc;
		int nhsel, err;

		/* skip aliases whose suffix length does not cover the bits
		 * in which key differs from the leaf key
		 */
		if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
			if (index >= (1ul << fa->fa_slen))
				continue;
		}
		if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp))
			continue;
		/* Paired with WRITE_ONCE() in fib_release_info() */
		if (READ_ONCE(fi->fib_dead))
			continue;
		if (fa->fa_info->fib_scope < flp->flowi4_scope)
			continue;
		fib_alias_accessed(fa);
		/* a negative per-type error ends the lookup with that error */
		err = fib_props[fa->fa_type].error;
		if (unlikely(err < 0)) {
out_reject:
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->semantic_match_passed);
#endif
trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
			return err;
		}
		if (fi->fib_flags & RTNH_F_DEAD)
			continue;

		/* routes that carry a nexthop object (fi->nh) */
		if (unlikely(fi->nh)) {
			if (nexthop_is_blackhole(fi->nh)) {
				err = fib_props[RTN_BLACKHOLE].error;
				goto out_reject;
			}

			nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp,
						     &nhsel);
			if (nhc)
				goto set_result;
			goto miss;
		}

		/* otherwise take the first path accepted by
		 * fib_lookup_good_nhc()
		 */
		for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
			nhc = fib_info_nhc(fi, nhsel);

			if (!fib_lookup_good_nhc(nhc, fib_flags, flp))
				continue;
set_result:
			/* take a reference unless the caller opted out */
			if (!(fib_flags & FIB_LOOKUP_NOREF))
				refcount_inc(&fi->fib_clntref);

			res->prefix = htonl(n->key);
			res->prefixlen = KEYLENGTH - fa->fa_slen;
			res->nh_sel = nhsel;
			res->nhc = nhc;
			res->type = fa->fa_type;
			res->scope = fi->fib_scope;
			res->dscp = fa->fa_dscp;
			res->fi = fi;
			res->table = tb;
			res->fa_head = &n->leaf;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->semantic_match_passed);
#endif
			trace_fib_table_lookup(tb->tb_id, flp, nhc, err);

			return err;
		}
	}
miss:
#ifdef CONFIG_IP_FIB_TRIE_STATS
	this_cpu_inc(stats->semantic_match_miss);
#endif
	/* no usable alias on this leaf: resume the search from backtrace */
	goto backtrace;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);

/* Unlink alias old from leaf l.  If that empties the leaf, the leaf is
 * detached from tp, freed, and the trie rebalanced.  Otherwise, if old
 * was the last entry in the list, the leaf and its parents are updated
 * to the suffix length of the entry that preceded it.
 */
static void fib_remove_alias(struct trie *t, struct key_vector *tp,
			     struct key_vector *l, struct fib_alias *old)
{
	/* record the location of the previous list_info entry */
	struct hlist_node **pprev = old->fa_list.pprev;
	struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);

	/* remove the fib_alias from the list */
	hlist_del_rcu(&old->fa_list);

	/* if we emptied the list this leaf will be freed and we can sort
	 * out parent suffix lengths as a part of trie_rebalance
	 */
	if (hlist_empty(&l->leaf)) {
		if (tp->slen == l->slen)
			node_pull_suffix(tp, tp->pos);
		put_child_root(tp, l->key, NULL);
		node_free(l);
		trie_rebalance(t, tp);
		return;
	}

	/* only access fa if it is pointing at the last valid hlist_node */
	if (*pprev)
		return;

	/* update the trie with the latest suffix length */
	l->slen = fa->fa_slen;
	node_pull_suffix(tp, fa->fa_slen);
}

static void fib_notify_alias_delete(struct net *net, u32
key, struct hlist_head *fah,
				    struct fib_alias *fa_to_delete,
				    struct netlink_ext_ack *extack)
{
	struct fib_alias *fa_next, *fa_to_notify;
	u32 tb_id = fa_to_delete->tb_id;
	u8 slen = fa_to_delete->fa_slen;
	enum fib_event_type fib_event;

	/* Do not notify if we do not care about the route. */
	if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
		return;

	/* Determine if the route should be replaced by the next route in the
	 * list.
	 */
	fa_next = hlist_entry_safe(fa_to_delete->fa_list.next,
				   struct fib_alias, fa_list);
	if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) {
		fib_event = FIB_EVENT_ENTRY_REPLACE;
		fa_to_notify = fa_next;
	} else {
		fib_event = FIB_EVENT_ENTRY_DEL;
		fa_to_notify = fa_to_delete;
	}
	/* the notifier return value is not checked here */
	call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen,
				 fa_to_notify, extack);
}

/* Caller must hold RTNL. */
int fib_table_delete(struct net *net, struct fib_table *tb,
		     struct fib_config *cfg, struct netlink_ext_ack *extack)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *fa_to_delete;
	struct key_vector *l, *tp;
	u8 plen = cfg->fc_dst_len;
	u8 slen = KEYLENGTH - plen;
	dscp_t dscp;
	u32 key;

	key = ntohl(cfg->fc_dst);

	l = fib_find_node(t, &tp, key);
	if (!l)
		return -ESRCH;

	dscp = cfg->fc_dscp;
	fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false);
	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen,
		 inet_dscp_to_dsfield(dscp), t);

	/* scan the run of aliases sharing slen, table and dscp for the first
	 * one matching every attribute that cfg specifies
	 */
	fa_to_delete = NULL;
	hlist_for_each_entry_from(fa, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if ((fa->fa_slen != slen) ||
		    (fa->tb_id != tb->tb_id) ||
		    (fa->fa_dscp != dscp))
			break;

		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(net, cfg, fi, extack) == 0 &&
		    fib_metrics_match(cfg, fi)) {
			fa_to_delete = fa;
			break;
		}
	}

	if
(!fa_to_delete)
		return -ESRCH;

	/* notify and announce the deletion before unlinking the alias */
	fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

	if (!plen)
		tb->tb_num_default--;

	fib_remove_alias(t, tp, l, fa_to_delete);

	if (READ_ONCE(fa_to_delete->fa_state) & FA_S_ACCESSED)
		rt_cache_flush(cfg->fc_nlinfo.nl_net);

	fib_release_info(fa_to_delete->fa_info);
	alias_free_mem_rcu(fa_to_delete);
	return 0;
}

/* Scan for the next leaf starting at the provided key value
 *
 * Returns that leaf, or NULL once the walk has climbed back to the trie
 * root without finding one.  In both cases *tn is updated to the node
 * at which the walk stopped.
 */
static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
{
	struct key_vector *pn, *n = *tn;
	unsigned long cindex;

	/* this loop is meant to try and find the key in the trie */
	do {
		/* record parent and next child index */
		pn = n;
		cindex = (key > pn->key) ? get_index(key, pn) : 0;

		/* index does not fit in this node */
		if (cindex >> pn->bits)
			break;

		/* descend into the next child */
		n = get_child_rcu(pn, cindex++);
		if (!n)
			break;

		/* guarantee forward progress on the keys */
		if (IS_LEAF(n) && (n->key >= key))
			goto found;
	} while (IS_TNODE(n));

	/* this loop will search for the next leaf with a greater key */
	while (!IS_TRIE(pn)) {
		/* if we exhausted the parent node we will need to climb */
		if (cindex >= (1ul << pn->bits)) {
			t_key pkey = pn->key;

			pn = node_parent_rcu(pn);
			cindex = get_index(pkey, pn) + 1;
			continue;
		}

		/* grab the next available node */
		n = get_child_rcu(pn, cindex++);
		if (!n)
			continue;

		/* no need to compare keys since we bumped the index */
		if (IS_LEAF(n))
			goto found;

		/* Rescan start scanning in new node */
		pn = n;
		cindex = 0;
	}

	*tn = pn;
	return NULL; /* Root of trie */
found:
	/* hand back the parent of the leaf so a later call can resume there */
	*tn = pn;
	return n;
}

/* Free every alias, leaf and tnode reachable from tb's trie, then the
 * table itself.  The fib_info behind each alias is not released here.
 */
static void fib_trie_free(struct fib_table *tb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *pn = t->kv;
	unsigned long cindex = 1;
	struct hlist_node *tmp;
	struct fib_alias *fa;

	/* walk trie in reverse order and free everything */
	for (;;) {
		struct key_vector *n;

		/* all children of pn visited: free pn and climb */
		if (!(cindex--)) {
			t_key pkey =
pn->key; if (IS_TRIE(pn)) break; n = pn; pn = node_parent(pn); /* drop emptied tnode */ put_child_root(pn, n->key, NULL); node_free(n); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { hlist_del_rcu(&fa->fa_list); alias_free_mem_rcu(fa); } put_child_root(pn, n->key, NULL); node_free(n); } #ifdef CONFIG_IP_FIB_TRIE_STATS free_percpu(t->stats); #endif kfree(tb); } struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) { struct trie *ot = (struct trie *)oldtb->tb_data; struct key_vector *l, *tp = ot->kv; struct fib_table *local_tb; struct fib_alias *fa; struct trie *lt; t_key key = 0; if (oldtb->tb_data == oldtb->__data) return oldtb; local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL); if (!local_tb) return NULL; lt = (struct trie *)local_tb->tb_data; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { struct key_vector *local_l = NULL, *local_tp; hlist_for_each_entry(fa, &l->leaf, fa_list) { struct fib_alias *new_fa; if (local_tb->tb_id != fa->tb_id) continue; /* clone fa for new local table */ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; memcpy(new_fa, fa, sizeof(*fa)); /* insert clone into table */ if (!local_l) local_l = fib_find_node(lt, &local_tp, l->key); if (fib_insert_alias(lt, local_tp, local_l, new_fa, NULL, l->key)) { kmem_cache_free(fn_alias_kmem, new_fa); goto out; } } /* stop loop if key wrapped back to 0 */ key = l->key + 1; if (key < l->key) break; } return local_tb; out: fib_trie_free(local_tb); return NULL; } /* Caller must hold RTNL */ void fib_table_flush_external(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; /* walk trie in reverse order */ for (;;) { unsigned 
char slen = 0;
		struct key_vector *n;

		/* all children of pn visited: fix up pn and climb */
		if (!(cindex--)) {
			t_key pkey = pn->key;

			/* cannot resize the trie vector */
			if (IS_TRIE(pn))
				break;

			/* update the suffix to address pulled leaves */
			if (pn->slen > pn->pos)
				update_suffix(pn);

			/* resize completed node */
			pn = resize(t, pn);
			cindex = get_index(pkey, pn);

			continue;
		}

		/* grab the next available node */
		n = get_child(pn, cindex);
		if (!n)
			continue;

		if (IS_TNODE(n)) {
			/* record pn and cindex for leaf walking */
			pn = n;
			cindex = 1ul << n->bits;

			continue;
		}

		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
			/* if alias was cloned to local then we just
			 * need to remove the local copy from main
			 */
			if (tb->tb_id != fa->tb_id) {
				hlist_del_rcu(&fa->fa_list);
				alias_free_mem_rcu(fa);
				continue;
			}

			/* record local slen */
			slen = fa->fa_slen;
		}

		/* update leaf slen */
		n->slen = slen;

		/* drop the leaf if nothing is left on it */
		if (hlist_empty(&n->leaf)) {
			put_child_root(pn, n->key, NULL);
			node_free(n);
		}
	}
}

/* Caller must hold RTNL.
 *
 * Walk tb's trie and remove aliases of this table that are flagged
 * RTNH_F_DEAD or whose type has a fib_props error.  When flush_all is
 * false, error-type aliases that are not flagged dead are kept.
 * Returns the number of aliases removed.
 */
int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct nl_info info = { .nl_net = net };
	struct key_vector *pn = t->kv;
	unsigned long cindex = 1;
	struct hlist_node *tmp;
	struct fib_alias *fa;
	int found = 0;

	/* walk trie in reverse order */
	for (;;) {
		unsigned char slen = 0;
		struct key_vector *n;

		if (!(cindex--)) {
			t_key pkey = pn->key;

			/* cannot resize the trie vector */
			if (IS_TRIE(pn))
				break;

			/* update the suffix to address pulled leaves */
			if (pn->slen > pn->pos)
				update_suffix(pn);

			/* resize completed node */
			pn = resize(t, pn);
			cindex = get_index(pkey, pn);

			continue;
		}

		/* grab the next available node */
		n = get_child(pn, cindex);
		if (!n)
			continue;

		if (IS_TNODE(n)) {
			/* record pn and cindex for leaf walking */
			pn = n;
			cindex = 1ul << n->bits;

			continue;
		}

		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
			struct fib_info *fi = fa->fa_info;

			/* keep aliases with no fib_info, aliases of another
			 * table, and live non-error routes
			 */
			if (!fi || tb->tb_id != fa->tb_id ||
			    (!(fi->fib_flags & RTNH_F_DEAD) &&
			     !fib_props[fa->fa_type].error)) {
				slen =
fa->fa_slen; continue; } /* When not flushing the entire table, skip error * routes that are not marked for deletion. */ if (!flush_all && fib_props[fa->fa_type].error && !(fi->fib_flags & RTNH_F_DEAD)) { slen = fa->fa_slen; continue; } fib_notify_alias_delete(net, n->key, &n->leaf, fa, NULL); if (fi->pfsrc_removed) rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); found++; } /* update leaf slen */ n->slen = slen; if (hlist_empty(&n->leaf)) { put_child_root(pn, n->key, NULL); node_free(n); } } pr_debug("trie_flush found=%d\n", found); return found; } /* derived from fib_trie_free */ static void __fib_info_notify_update(struct net *net, struct fib_table *tb, struct nl_info *info) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct fib_alias *fa; for (;;) { struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; if (IS_TRIE(pn)) break; pn = node_parent(pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id) continue; rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, info, NLM_F_REPLACE); } } } void fib_info_notify_update(struct net *net, struct nl_info *info) { unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) __fib_info_notify_update(net, tb, info); } } static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct 
fib_alias *fa; int last_slen = -1; int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi) continue; /* local and main table can share the same trie, * so don't notify twice for the same entry. */ if (tb->tb_id != fa->tb_id) continue; if (fa->fa_slen == last_slen) continue; last_slen = fa->fa_slen; err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE, l->key, KEYLENGTH - fa->fa_slen, fa, extack); if (err) return err; } return 0; } static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { err = fib_leaf_notify(l, tb, nb, extack); if (err) return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } return 0; } int fib_notify(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { unsigned int h; int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { err = fib_table_notify(tb, nb, extack); if (err) return err; } } return 0; } static void __trie_free_rcu(struct rcu_head *head) { struct fib_table *tb = container_of(head, struct fib_table, rcu); #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie *t = (struct trie *)tb->tb_data; if (tb->tb_data == tb->__data) free_percpu(t->stats); #endif /* CONFIG_IP_FIB_TRIE_STATS */ kfree(tb); } void fib_free_table(struct fib_table *tb) { call_rcu(&tb->rcu, __trie_free_rcu); } static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter) { unsigned int flags = NLM_F_MULTI; __be32 xkey = htonl(l->key); int i, s_i, i_fa, s_fa, err; struct fib_alias *fa; if (filter->filter_set || !filter->dump_exceptions || !filter->dump_routes) 
flags |= NLM_F_DUMP_FILTERED;

	/* resume position saved by a previous call: alias index within the
	 * leaf (args[4]) and entry index within that alias (args[5])
	 */
	s_i = cb->args[4];
	s_fa = cb->args[5];
	i = 0;

	/* rcu_read_lock is held by caller */
	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
		struct fib_info *fi = fa->fa_info;

		/* skip aliases already dumped by an earlier call */
		if (i < s_i)
			goto next;

		i_fa = 0;

		if (tb->tb_id != fa->tb_id)
			goto next;

		if (filter->filter_set) {
			if (filter->rt_type && fa->fa_type != filter->rt_type)
				goto next;

			if ((filter->protocol &&
			     fi->fib_protocol != filter->protocol))
				goto next;

			if (filter->dev &&
			    !fib_info_nh_uses_dev(fi, filter->dev))
				goto next;
		}

		if (filter->dump_routes) {
			/* the route itself is entry 0 of this alias */
			if (!s_fa) {
				struct fib_rt_info fri;

				fri.fi = fi;
				fri.tb_id = tb->tb_id;
				fri.dst = xkey;
				fri.dst_len = KEYLENGTH - fa->fa_slen;
				fri.dscp = fa->fa_dscp;
				fri.type = fa->fa_type;
				fri.offload = READ_ONCE(fa->offload);
				fri.trap = READ_ONCE(fa->trap);
				fri.offload_failed = READ_ONCE(fa->offload_failed);
				err = fib_dump_info(skb,
						    NETLINK_CB(cb->skb).portid,
						    cb->nlh->nlmsg_seq,
						    RTM_NEWROUTE, &fri, flags);
				if (err < 0)
					goto stop;
			}

			i_fa++;
		}

		if (filter->dump_exceptions) {
			err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
						 &i_fa, s_fa, flags);
			if (err < 0)
				goto stop;
		}

next:
		i++;
	}

	cb->args[4] = i;
	return skb->len;

stop:
	/* save where we stopped so the dump can resume from here */
	cb->args[4] = i;
	cb->args[5] = i_fa;
	return err;
}

/* rcu_read_lock needs to be held by caller from readside */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb, struct fib_dump_filter *filter)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *l, *tp = t->kv;
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
	int count = cb->args[2];
	t_key key = cb->args[3];

	/* First time here, count and key are both always 0. Count > 0
	 * and key == 0 means the dump has wrapped around and we are done.
*/ if (count && !key) return 0; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { int err; err = fn_trie_dump_leaf(l, tb, skb, cb, filter); if (err < 0) { cb->args[3] = key; cb->args[2] = count; return err; } ++count; key = l->key + 1; memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); /* stop loop if key wrapped back to 0 */ if (key < l->key) break; } cb->args[3] = key; cb->args[2] = count; return 0; } void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", LEAF_SIZE, 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); } struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) { struct fib_table *tb; struct trie *t; size_t sz = sizeof(*tb); if (!alias) sz += sizeof(struct trie); tb = kzalloc(sz, GFP_KERNEL); if (!tb) return NULL; tb->tb_id = id; tb->tb_num_default = 0; tb->tb_data = (alias ? alias->__data : tb->__data); if (alias) return tb; t = (struct trie *) tb->tb_data; t->kv[0].pos = KEYLENGTH; t->kv[0].slen = KEYLENGTH; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats = alloc_percpu(struct trie_use_stats); if (!t->stats) { kfree(tb); tb = NULL; } #endif return tb; } #ifdef CONFIG_PROC_FS /* Depth first Trie walk iterator */ struct fib_trie_iter { struct seq_net_private p; struct fib_table *tb; struct key_vector *tnode; unsigned int index; unsigned int depth; }; static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) { unsigned long cindex = iter->index; struct key_vector *pn = iter->tnode; t_key pkey; pr_debug("get_next iter={node=%p index=%d depth=%d}\n", iter->tnode, iter->index, iter->depth); while (!IS_TRIE(pn)) { while (cindex < child_length(pn)) { struct key_vector *n = get_child_rcu(pn, cindex++); if (!n) continue; if (IS_LEAF(n)) { iter->tnode = pn; iter->index = cindex; } else { /* push down one level */ iter->tnode = n; iter->index = 0; ++iter->depth; } return n; } /* Current node exhausted, 
pop back up */ pkey = pn->key; pn = node_parent_rcu(pn); cindex = get_index(pkey, pn) + 1; --iter->depth; } /* record root node so further searches know we are done */ iter->tnode = pn; iter->index = 0; return NULL; } static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { struct key_vector *n, *pn; if (!t) return NULL; pn = t->kv; n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; if (IS_TNODE(n)) { iter->tnode = n; iter->index = 0; iter->depth = 1; } else { iter->tnode = pn; iter->index = 0; iter->depth = 0; } return n; } static void trie_collect_stats(struct trie *t, struct trie_stat *s) { struct key_vector *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); rcu_read_lock(); for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { struct fib_alias *fa; s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) ++s->prefixes; } else { s->tnodes++; if (n->bits < MAX_STAT_DEPTH) s->nodesizes[n->bits]++; s->nullpointers += tn_info(n)->empty_children; } } rcu_read_unlock(); } /* * This outputs /proc/net/fib_triestats */ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) { unsigned int i, max, pointers, bytes, avdepth; if (stat->leaves) avdepth = stat->totdepth*100 / stat->leaves; else avdepth = 0; seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100); seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); bytes = LEAF_SIZE * stat->leaves; seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); bytes += sizeof(struct fib_alias) * stat->prefixes; seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); bytes += TNODE_SIZE(0) * stat->tnodes; max = MAX_STAT_DEPTH; while (max > 0 && stat->nodesizes[max-1] == 0) max--; pointers = 0; for (i = 1; i < max; i++) if (stat->nodesizes[i] != 0) { seq_printf(seq, " %u: %u", i, 
stat->nodesizes[i]); pointers += (1<<i) * stat->nodesizes[i]; } seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); bytes += sizeof(struct key_vector *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } #ifdef CONFIG_IP_FIB_TRIE_STATS static void trie_show_usage(struct seq_file *seq, const struct trie_use_stats __percpu *stats) { struct trie_use_stats s = { 0 }; int cpu; /* loop through all of the CPUs and gather up the stats */ for_each_possible_cpu(cpu) { const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu); s.gets += pcpu->gets; s.backtrack += pcpu->backtrack; s.semantic_match_passed += pcpu->semantic_match_passed; s.semantic_match_miss += pcpu->semantic_match_miss; s.null_node_hit += pcpu->null_node_hit; s.resize_node_skipped += pcpu->resize_node_skipped; } seq_printf(seq, "\nCounters:\n---------\n"); seq_printf(seq, "gets = %u\n", s.gets); seq_printf(seq, "backtracks = %u\n", s.backtrack); seq_printf(seq, "semantic match passed = %u\n", s.semantic_match_passed); seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss); seq_printf(seq, "null node hit= %u\n", s.null_node_hit); seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped); } #endif /* CONFIG_IP_FIB_TRIE_STATS */ static void fib_table_print(struct seq_file *seq, struct fib_table *tb) { if (tb->tb_id == RT_TABLE_LOCAL) seq_puts(seq, "Local:\n"); else if (tb->tb_id == RT_TABLE_MAIN) seq_puts(seq, "Main:\n"); else seq_printf(seq, "Id %d:\n", tb->tb_id); } static int fib_triestat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; unsigned int h; seq_printf(seq, "Basic info: size of leaf:" " %zd bytes, size of tnode: %zd bytes.\n", LEAF_SIZE, TNODE_SIZE(0)); rcu_read_lock(); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct trie *t = 
(struct trie *) tb->tb_data; struct trie_stat stat; if (!t) continue; fib_table_print(seq, tb); trie_collect_stats(t, &stat); trie_show_stats(seq, &stat); #ifdef CONFIG_IP_FIB_TRIE_STATS trie_show_usage(seq, t->stats); #endif } cond_resched_rcu(); } rcu_read_unlock(); return 0; } static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); loff_t idx = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct key_vector *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); n; n = fib_trie_get_next(iter)) if (pos == idx++) { iter->tb = tb; return n; } } } return NULL; } static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return fib_trie_get_idx(seq, *pos); } static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; struct key_vector *n; ++*pos; /* next node in same table */ n = fib_trie_get_next(iter); if (n) return n; /* walk rest of this hash chain */ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) { tb = hlist_entry(tb_node, struct fib_table, tb_hlist); n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; } /* new hash chain */ while (++h < FIB_TABLE_HASHSZ) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; } } return NULL; found: iter->tb = tb; return n; } static void fib_trie_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void seq_indent(struct 
/*
 * Return a printable name for route scope @s.  The well-known scopes map
 * to constant strings; any other value is formatted as "scope=<n>" into
 * the caller supplied buffer @buf of size @len, which is then returned.
 */
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
{
	const char *name = NULL;

	switch (s) {
	case RT_SCOPE_UNIVERSE:
		name = "universe";
		break;
	case RT_SCOPE_SITE:
		name = "site";
		break;
	case RT_SCOPE_LINK:
		name = "link";
		break;
	case RT_SCOPE_HOST:
		name = "host";
		break;
	case RT_SCOPE_NOWHERE:
		name = "nowhere";
		break;
	default:
		break;
	}

	if (name)
		return name;

	snprintf(buf, len, "scope=%d", s);
	return buf;
}
/*
 * seq_file ->next for the route listing: advance *pos and return the leaf
 * following the one remembered in the iterator, or NULL when the walk is
 * over.  The iterator's cached key/pos are updated on success and its pos
 * is reset to 0 on failure.
 */
static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	t_key next_key = iter->key + 1;
	struct key_vector *leaf;

	(*pos)++;

	/* A key of 0 is only acceptable right after the start token;
	 * anywhere else it means the key wrapped and the walk is done.
	 */
	if (v != SEQ_START_TOKEN && !next_key) {
		iter->pos = 0;
		return NULL;
	}

	leaf = leaf_walk_rcu(&iter->tnode, next_key);
	if (!leaf) {
		iter->pos = 0;
		return NULL;
	}

	iter->key = leaf->key;
	iter->pos++;
	return leaf;
}
|= RTF_GATEWAY; } if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; flags |= RTF_UP; return flags; } /* * This outputs /proc/net/route. * The format of the file is not supposed to be changed * and needs to be same as fib_hash output to avoid breaking * legacy utilities */ static int fib_route_seq_show(struct seq_file *seq, void *v) { struct fib_route_iter *iter = seq->private; struct fib_table *tb = iter->main_tb; struct fib_alias *fa; struct key_vector *l = v; __be32 prefix; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" "\tWindow\tIRTT"); return 0; } prefix = htonl(l->key); hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); if ((fa->fa_type == RTN_BROADCAST) || (fa->fa_type == RTN_MULTICAST)) continue; if (fa->tb_id != tb->tb_id) continue; seq_setwidth(seq, 127); if (fi) { struct fib_nh_common *nhc = fib_info_nhc(fi, 0); __be32 gw = 0; if (nhc->nhc_gw_family == AF_INET) gw = nhc->nhc_gw.ipv4; seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" "%u\t%08X\t%d\t%u\t%u", nhc->nhc_dev ? nhc->nhc_dev->name : "*", prefix, gw, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? 
fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" "%u\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); } seq_pad(seq, '\n'); } return 0; } static const struct seq_operations fib_route_seq_ops = { .start = fib_route_seq_start, .next = fib_route_seq_next, .stop = fib_route_seq_stop, .show = fib_route_seq_show, }; int __net_init fib_proc_init(struct net *net) { if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops, sizeof(struct fib_trie_iter))) goto out1; if (!proc_create_net_single("fib_triestat", 0444, net->proc_net, fib_triestat_seq_show, NULL)) goto out2; if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops, sizeof(struct fib_route_iter))) goto out3; return 0; out3: remove_proc_entry("fib_triestat", net->proc_net); out2: remove_proc_entry("fib_trie", net->proc_net); out1: return -ENOMEM; } void __net_exit fib_proc_exit(struct net *net) { remove_proc_entry("fib_trie", net->proc_net); remove_proc_entry("fib_triestat", net->proc_net); remove_proc_entry("route", net->proc_net); } #endif /* CONFIG_PROC_FS */
2 2 5 1 1 5 5 5 5 5 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 
1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2020 Google Corporation */ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/mgmt.h> #include "mgmt_util.h" #include "msft.h" #define MSFT_RSSI_THRESHOLD_VALUE_MIN -127 #define MSFT_RSSI_THRESHOLD_VALUE_MAX 20 #define MSFT_RSSI_LOW_TIMEOUT_MAX 0x3C #define MSFT_OP_READ_SUPPORTED_FEATURES 0x00 struct msft_cp_read_supported_features { __u8 sub_opcode; } __packed; struct msft_rp_read_supported_features { __u8 status; __u8 sub_opcode; __le64 features; __u8 evt_prefix_len; __u8 evt_prefix[]; } __packed; #define MSFT_OP_LE_MONITOR_ADVERTISEMENT 0x03 #define MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN 0x01 struct msft_le_monitor_advertisement_pattern { __u8 length; __u8 data_type; __u8 start_byte; __u8 pattern[]; }; struct msft_le_monitor_advertisement_pattern_data { __u8 count; __u8 data[]; }; struct msft_cp_le_monitor_advertisement { __u8 sub_opcode; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 cond_type; __u8 data[]; } 
__packed; struct msft_rp_le_monitor_advertisement { __u8 status; __u8 sub_opcode; __u8 handle; } __packed; #define MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT 0x04 struct msft_cp_le_cancel_monitor_advertisement { __u8 sub_opcode; __u8 handle; } __packed; struct msft_rp_le_cancel_monitor_advertisement { __u8 status; __u8 sub_opcode; } __packed; #define MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE 0x05 struct msft_cp_le_set_advertisement_filter_enable { __u8 sub_opcode; __u8 enable; } __packed; struct msft_rp_le_set_advertisement_filter_enable { __u8 status; __u8 sub_opcode; } __packed; #define MSFT_EV_LE_MONITOR_DEVICE 0x02 struct msft_ev_le_monitor_device { __u8 addr_type; bdaddr_t bdaddr; __u8 monitor_handle; __u8 monitor_state; } __packed; struct msft_monitor_advertisement_handle_data { __u8 msft_handle; __u16 mgmt_handle; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 cond_type; struct list_head list; }; enum monitor_addr_filter_state { AF_STATE_IDLE, AF_STATE_ADDING, AF_STATE_ADDED, AF_STATE_REMOVING, }; #define MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR 0x04 struct msft_monitor_addr_filter_data { __u8 msft_handle; __u8 pattern_handle; /* address filters pertain to */ __u16 mgmt_handle; int state; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 addr_type; bdaddr_t bdaddr; struct list_head list; }; struct msft_data { __u64 features; __u8 evt_prefix_len; __u8 *evt_prefix; struct list_head handle_map; struct list_head address_filters; __u8 resuming; __u8 suspending; __u8 filter_enabled; /* To synchronize add/remove address filter and monitor device event.*/ struct mutex filter_lock; }; bool msft_monitor_supported(struct hci_dev *hdev) { return !!(msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR); } static bool read_supported_features(struct hci_dev *hdev, struct msft_data *msft) { struct msft_cp_read_supported_features cp; struct msft_rp_read_supported_features *rp; struct sk_buff *skb; 
cp.sub_opcode = MSFT_OP_READ_SUPPORTED_FEATURES; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read MSFT supported features (%ld)", PTR_ERR(skb)); return false; } if (skb->len < sizeof(*rp)) { bt_dev_err(hdev, "MSFT supported features length mismatch"); goto failed; } rp = (struct msft_rp_read_supported_features *)skb->data; if (rp->sub_opcode != MSFT_OP_READ_SUPPORTED_FEATURES) goto failed; if (rp->evt_prefix_len > 0) { msft->evt_prefix = kmemdup(rp->evt_prefix, rp->evt_prefix_len, GFP_KERNEL); if (!msft->evt_prefix) goto failed; } msft->evt_prefix_len = rp->evt_prefix_len; msft->features = __le64_to_cpu(rp->features); if (msft->features & MSFT_FEATURE_MASK_CURVE_VALIDITY) hdev->msft_curve_validity = true; kfree_skb(skb); return true; failed: kfree_skb(skb); return false; } /* is_mgmt = true matches the handle exposed to userspace via mgmt. * is_mgmt = false matches the handle used by the msft controller. 
* This function requires the caller holds hdev->lock */ static struct msft_monitor_advertisement_handle_data *msft_find_handle_data (struct hci_dev *hdev, u16 handle, bool is_mgmt) { struct msft_monitor_advertisement_handle_data *entry; struct msft_data *msft = hdev->msft_data; list_for_each_entry(entry, &msft->handle_map, list) { if (is_mgmt && entry->mgmt_handle == handle) return entry; if (!is_mgmt && entry->msft_handle == handle) return entry; } return NULL; } /* This function requires the caller holds msft->filter_lock */ static struct msft_monitor_addr_filter_data *msft_find_address_data (struct hci_dev *hdev, u8 addr_type, bdaddr_t *addr, u8 pattern_handle) { struct msft_monitor_addr_filter_data *entry; struct msft_data *msft = hdev->msft_data; list_for_each_entry(entry, &msft->address_filters, list) { if (entry->pattern_handle == pattern_handle && addr_type == entry->addr_type && !bacmp(addr, &entry->bdaddr)) return entry; } return NULL; } /* This function requires the caller holds hdev->lock */ static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle, bdaddr_t *bdaddr, __u8 addr_type, bool notify) { struct monitored_device *dev, *tmp; int count = 0; list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) { /* mgmt_handle == 0 indicates remove all devices, whereas, * bdaddr == NULL indicates remove all devices matching the * mgmt_handle. 
*/ if ((!mgmt_handle || dev->handle == mgmt_handle) && (!bdaddr || (!bacmp(bdaddr, &dev->bdaddr) && addr_type == dev->addr_type))) { if (notify && dev->notified) { mgmt_adv_monitor_device_lost(hdev, dev->handle, &dev->bdaddr, dev->addr_type); } list_del(&dev->list); kfree(dev); count++; } } return count; } static int msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, struct adv_monitor *monitor, struct sk_buff *skb) { struct msft_rp_le_monitor_advertisement *rp; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; int status = 0; hci_dev_lock(hdev); rp = (struct msft_rp_le_monitor_advertisement *)skb->data; if (skb->len < sizeof(*rp)) { status = HCI_ERROR_UNSPECIFIED; goto unlock; } status = rp->status; if (status) goto unlock; handle_data = kmalloc_obj(*handle_data); if (!handle_data) { status = HCI_ERROR_UNSPECIFIED; goto unlock; } handle_data->mgmt_handle = monitor->handle; handle_data->msft_handle = rp->handle; handle_data->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; INIT_LIST_HEAD(&handle_data->list); list_add(&handle_data->list, &msft->handle_map); monitor->state = ADV_MONITOR_STATE_OFFLOADED; unlock: if (status) hci_free_adv_monitor(hdev, monitor); hci_dev_unlock(hdev); return status; } /* This function requires the caller holds hci_req_sync_lock */ static void msft_remove_addr_filters_sync(struct hci_dev *hdev, u8 handle) { struct msft_monitor_addr_filter_data *address_filter, *n; struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_data *msft = hdev->msft_data; struct list_head head; struct sk_buff *skb; INIT_LIST_HEAD(&head); /* Cancel all corresponding address monitors */ mutex_lock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &msft->address_filters, list) { if (address_filter->pattern_handle != handle) continue; list_del(&address_filter->list); /* Keep the address filter and let * msft_add_address_filter_sync() remove and free the address * filter. 
*/ if (address_filter->state == AF_STATE_ADDING) { address_filter->state = AF_STATE_REMOVING; continue; } /* Keep the address filter and let * msft_cancel_address_filter_sync() remove and free the address * filter */ if (address_filter->state == AF_STATE_REMOVING) continue; list_add_tail(&address_filter->list, &head); } mutex_unlock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &head, list) { list_del(&address_filter->list); cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = address_filter->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { kfree(address_filter); continue; } kfree_skb(skb); bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", &address_filter->bdaddr); kfree(address_filter); } } static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, struct adv_monitor *monitor, struct sk_buff *skb) { struct msft_rp_le_cancel_monitor_advertisement *rp; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; int status = 0; u8 msft_handle; rp = (struct msft_rp_le_cancel_monitor_advertisement *)skb->data; if (skb->len < sizeof(*rp)) { status = HCI_ERROR_UNSPECIFIED; goto done; } status = rp->status; if (status) goto done; hci_dev_lock(hdev); handle_data = msft_find_handle_data(hdev, monitor->handle, true); if (handle_data) { if (monitor->state == ADV_MONITOR_STATE_OFFLOADED) monitor->state = ADV_MONITOR_STATE_REGISTERED; /* Do not free the monitor if it is being removed due to * suspend. It will be re-monitored on resume. 
*/ if (!msft->suspending) { hci_free_adv_monitor(hdev, monitor); /* Clear any monitored devices by this Adv Monitor */ msft_monitor_device_del(hdev, handle_data->mgmt_handle, NULL, 0, false); } msft_handle = handle_data->msft_handle; list_del(&handle_data->list); kfree(handle_data); hci_dev_unlock(hdev); msft_remove_addr_filters_sync(hdev, msft_handle); } else { hci_dev_unlock(hdev); } done: return status; } /* This function requires the caller holds hci_req_sync_lock */ static int msft_remove_monitor_sync(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_monitor_advertisement_handle_data *handle_data; struct sk_buff *skb; handle_data = msft_find_handle_data(hdev, monitor->handle, true); /* If no matched handle, just remove without telling controller */ if (!handle_data) return -ENOENT; cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = handle_data->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) return PTR_ERR(skb); return msft_le_cancel_monitor_advertisement_cb(hdev, hdev->msft_opcode, monitor, skb); } /* This function requires the caller holds hci_req_sync_lock */ int msft_suspend_sync(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; struct adv_monitor *monitor; int handle = 0; if (!msft || !msft_monitor_supported(hdev)) return 0; msft->suspending = true; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); if (!monitor) break; msft_remove_monitor_sync(hdev, monitor); handle++; } /* All monitors have been removed */ msft->suspending = false; return 0; } static bool msft_monitor_rssi_valid(struct adv_monitor *monitor) { struct adv_rssi_thresholds *r = &monitor->rssi; if (r->high_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || r->high_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX || r->low_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || r->low_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX) return 
false; /* High_threshold_timeout is not supported, * once high_threshold is reached, events are immediately reported. */ if (r->high_threshold_timeout != 0) return false; if (r->low_threshold_timeout > MSFT_RSSI_LOW_TIMEOUT_MAX) return false; /* Sampling period from 0x00 to 0xFF are all allowed */ return true; } static bool msft_monitor_pattern_valid(struct adv_monitor *monitor) { return msft_monitor_rssi_valid(monitor); /* No additional check needed for pattern-based monitor */ } static int msft_add_monitor_sync(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_cp_le_monitor_advertisement *cp; struct msft_le_monitor_advertisement_pattern_data *pattern_data; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_le_monitor_advertisement_pattern *pattern; struct adv_pattern *entry; size_t total_size = sizeof(*cp) + sizeof(*pattern_data); ptrdiff_t offset = 0; u8 pattern_count = 0; struct sk_buff *skb; int err; if (!msft_monitor_pattern_valid(monitor)) return -EINVAL; list_for_each_entry(entry, &monitor->patterns, list) { pattern_count++; total_size += sizeof(*pattern) + entry->length; } cp = kmalloc(total_size, GFP_KERNEL); if (!cp) return -ENOMEM; cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; cp->rssi_high = monitor->rssi.high_threshold; cp->rssi_low = monitor->rssi.low_threshold; cp->rssi_low_interval = (u8)monitor->rssi.low_threshold_timeout; cp->rssi_sampling_period = monitor->rssi.sampling_period; cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; pattern_data = (void *)cp->data; pattern_data->count = pattern_count; list_for_each_entry(entry, &monitor->patterns, list) { pattern = (void *)(pattern_data->data + offset); /* the length also includes data_type and offset */ pattern->length = entry->length + 2; pattern->data_type = entry->ad_type; pattern->start_byte = entry->offset; memcpy(pattern->pattern, entry->value, entry->length); offset += sizeof(*pattern) + entry->length; } skb = __hci_cmd_sync(hdev, 
hdev->msft_opcode, total_size, cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); goto out_free; } err = msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode, monitor, skb); if (err) goto out_free; handle_data = msft_find_handle_data(hdev, monitor->handle, true); if (!handle_data) { err = -ENODATA; goto out_free; } handle_data->rssi_high = cp->rssi_high; handle_data->rssi_low = cp->rssi_low; handle_data->rssi_low_interval = cp->rssi_low_interval; handle_data->rssi_sampling_period = cp->rssi_sampling_period; out_free: kfree(cp); return err; } /* This function requires the caller holds hci_req_sync_lock */ static void reregister_monitor(struct hci_dev *hdev) { struct adv_monitor *monitor; struct msft_data *msft = hdev->msft_data; int handle = 0; if (!msft) return; msft->resuming = true; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); if (!monitor) break; msft_add_monitor_sync(hdev, monitor); handle++; } /* All monitors have been reregistered */ msft->resuming = false; } /* This function requires the caller holds hci_req_sync_lock */ int msft_resume_sync(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (!msft || !msft_monitor_supported(hdev)) return 0; hci_dev_lock(hdev); /* Clear already tracked devices on resume. Once the monitors are * reregistered, devices in range will be found again after resume. 
*/
	hdev->advmon_pend_notify = false;
	msft_monitor_device_del(hdev, 0, NULL, 0, true);

	hci_dev_unlock(hdev);

	reregister_monitor(hdev);

	return 0;
}

/* msft_do_open() - (re)initialize the MSFT extension when the device opens
 *
 * Does nothing when no MSFT opcode is configured (HCI_OP_NOP), and only logs
 * an error when msft_register() has not attached any msft_data.
 *
 * Otherwise the cached event prefix and feature mask are dropped and read
 * again from the controller. If that read fails, the msft_data is detached
 * from @hdev and freed. If monitoring is supported, the advertisement filter
 * is enabled and the known monitors are registered again.
 *
 * This function requires the caller holds hci_req_sync_lock
 */
void msft_do_open(struct hci_dev *hdev)
{
	struct msft_data *msft = hdev->msft_data;

	if (hdev->msft_opcode == HCI_OP_NOP)
		return;

	if (!msft) {
		bt_dev_err(hdev, "MSFT extension not registered");
		return;
	}

	bt_dev_dbg(hdev, "Initialize MSFT extension");

	/* Reset existing MSFT data before re-reading */
	kfree(msft->evt_prefix);
	msft->evt_prefix = NULL;
	msft->evt_prefix_len = 0;
	msft->features = 0;

	if (!read_supported_features(hdev, msft)) {
		/* NOTE(review): this path frees msft without the
		 * mutex_destroy() that msft_release() performs - confirm
		 * whether that is intentional.
		 */
		hdev->msft_data = NULL;
		kfree(msft);
		return;
	}

	if (msft_monitor_supported(hdev)) {
		msft->resuming = true;
		msft_set_filter_enable(hdev, true);
		/* Monitors get removed on power off, so we need to explicitly
		 * tell the controller to re-monitor.
		 */
		reregister_monitor(hdev);
	}
}

/* msft_do_close() - drop all controller-side monitor state on device close
 *
 * Frees every entry of the handle map (moving the matching adv_monitor from
 * OFFLOADED back to REGISTERED), frees every address filter under
 * filter_lock, and finally, under the hdev lock, clears the pending-notify
 * flag and deletes all monitored devices.
 */
void msft_do_close(struct hci_dev *hdev)
{
	struct msft_data *msft = hdev->msft_data;
	struct msft_monitor_advertisement_handle_data *handle_data, *tmp;
	struct msft_monitor_addr_filter_data *address_filter, *n;
	struct adv_monitor *monitor;

	if (!msft)
		return;

	bt_dev_dbg(hdev, "Cleanup of MSFT extension");

	/* The controller will silently remove all monitors on power off.
	 * Therefore, remove handle_data mapping and reset monitor state.
	 */
	list_for_each_entry_safe(handle_data, tmp, &msft->handle_map, list) {
		monitor = idr_find(&hdev->adv_monitors_idr,
				   handle_data->mgmt_handle);

		if (monitor && monitor->state == ADV_MONITOR_STATE_OFFLOADED)
			monitor->state = ADV_MONITOR_STATE_REGISTERED;

		list_del(&handle_data->list);
		kfree(handle_data);
	}

	mutex_lock(&msft->filter_lock);
	list_for_each_entry_safe(address_filter, n, &msft->address_filters,
				 list) {
		list_del(&address_filter->list);
		kfree(address_filter);
	}
	mutex_unlock(&msft->filter_lock);

	hci_dev_lock(hdev);

	/* Clear any devices that are being monitored and notify device lost */
	hdev->advmon_pend_notify = false;
	msft_monitor_device_del(hdev, 0, NULL, 0, true);

	hci_dev_unlock(hdev);
}

/* msft_cancel_address_filter_sync() - cancel one address filter
 * @hdev: controller the filter belongs to
 * @data: the struct msft_monitor_addr_filter_data to cancel
 *
 * Queued through hci_cmd_sync_queue() by msft_monitor_device_evt().
 * Unlinks the filter from msft->address_filters, sends
 * MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT for its handle and frees it,
 * whether or not the command succeeded.
 *
 * Return: 0 on success or when the device is no longer up, -EINVAL when the
 * msft_data is gone, otherwise the error carried by the command's skb.
 */
static int msft_cancel_address_filter_sync(struct hci_dev *hdev, void *data)
{
	struct msft_monitor_addr_filter_data *address_filter = data;
	struct msft_cp_le_cancel_monitor_advertisement cp;
	struct msft_data *msft = hdev->msft_data;
	struct sk_buff *skb;
	int err = 0;

	if (!msft) {
		bt_dev_err(hdev, "MSFT: msft data is freed");
		return -EINVAL;
	}

	/* The address filter has been removed by hci dev close */
	if (!test_bit(HCI_UP, &hdev->flags))
		return 0;

	mutex_lock(&msft->filter_lock);
	list_del(&address_filter->list);
	mutex_unlock(&msft->filter_lock);

	cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT;
	cp.handle = address_filter->msft_handle;

	skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp,
			     HCI_CMD_TIMEOUT);
	if (IS_ERR(skb)) {
		bt_dev_err(hdev, "MSFT: Failed to cancel address (%pMR) filter",
			   &address_filter->bdaddr);
		err = PTR_ERR(skb);
		goto done;
	}
	kfree_skb(skb);

	bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter",
		   &address_filter->bdaddr);

done:
	/* The filter is already unlinked, so it is freed on every path */
	kfree(address_filter);

	return err;
}

/* msft_register() - allocate the per-controller MSFT state
 *
 * Allocates a zeroed msft_data, initializes its two lists and its filter
 * lock, and attaches it to @hdev. On allocation failure an error is logged
 * and hdev->msft_data is left untouched.
 */
void msft_register(struct hci_dev *hdev)
{
	struct msft_data *msft = NULL;

	bt_dev_dbg(hdev, "Register MSFT extension");

	msft = kzalloc_obj(*msft);
	if (!msft) {
		bt_dev_err(hdev, "Failed to register MSFT extension");
		return;
	}

	INIT_LIST_HEAD(&msft->handle_map);
	INIT_LIST_HEAD(&msft->address_filters);
	hdev->msft_data = msft;
	mutex_init(&msft->filter_lock);
}

/* msft_release() - detach and free the per-controller MSFT state
 *
 * Counterpart of msft_register(); a no-op when nothing is attached.
 */
void msft_release(struct hci_dev *hdev)
{
	struct msft_data *msft = hdev->msft_data;

	if (!msft)
		return;

	bt_dev_dbg(hdev, "Unregister MSFT extension");

	hdev->msft_data = NULL;

	kfree(msft->evt_prefix);
	mutex_destroy(&msft->filter_lock);
	kfree(msft);
}

/* msft_device_found() - record a newly found monitored device
 *
 * Allocates a monitored_device entry (not yet notified), adds it to
 * hdev->monitored_devices and sets hdev->advmon_pend_notify. On allocation
 * failure the event is only logged.
 *
 * This function requires the caller holds hdev->lock
 */
static void msft_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr,
			      __u8 addr_type, __u16 mgmt_handle)
{
	struct monitored_device *dev;

	dev = kmalloc_obj(*dev);
	if (!dev) {
		bt_dev_err(hdev, "MSFT vendor event %u: no memory",
			   MSFT_EV_LE_MONITOR_DEVICE);
		return;
	}

	bacpy(&dev->bdaddr, bdaddr);
	dev->addr_type = addr_type;
	dev->handle = mgmt_handle;
	dev->notified = false;

	INIT_LIST_HEAD(&dev->list);
	list_add(&dev->list, &hdev->monitored_devices);
	hdev->advmon_pend_notify = true;
}

/* msft_device_lost() - delete a monitored device entry
 *
 * Logs an error when msft_monitor_device_del() reports failure for the
 * given handle and address.
 *
 * This function requires the caller holds hdev->lock
 */
static void msft_device_lost(struct hci_dev *hdev, bdaddr_t *bdaddr,
			     __u8 addr_type, __u16 mgmt_handle)
{
	if (!msft_monitor_device_del(hdev, mgmt_handle, bdaddr, addr_type,
				     true)) {
		bt_dev_err(hdev, "MSFT vendor event %u: dev %pMR not in list",
			   MSFT_EV_LE_MONITOR_DEVICE, bdaddr);
	}
}

/* msft_skb_pull() - pull @len bytes from a vendor event
 * @ev: event code, used only in the error message
 *
 * Return: pointer to the pulled data, or NULL (after logging) when
 * skb_pull_data() returns NULL.
 */
static void *msft_skb_pull(struct hci_dev *hdev, struct sk_buff *skb,
			   u8 ev, size_t len)
{
	void *data;

	data = skb_pull_data(skb, len);
	if (!data)
		bt_dev_err(hdev, "Malformed MSFT vendor event: 0x%02x", ev);

	return data;
}

/* msft_add_address_filter_sync() - install one address filter
 * @hdev: controller the filter belongs to
 * @data: the struct msft_monitor_addr_filter_data queued by
 *	  msft_add_address_filter()
 *
 * Sends MSFT_OP_LE_MONITOR_ADVERTISEMENT with an address condition built
 * from the filter. On success the filter becomes AF_STATE_ADDED and records
 * the handle returned by the controller; on any failure it is unlinked and
 * freed.
 *
 * Return: -EINVAL when the msft_data is gone, -ENODEV when the device is
 * not up, 0 otherwise (including when the filter had to be removed).
 */
static int msft_add_address_filter_sync(struct hci_dev *hdev, void *data)
{
	struct msft_monitor_addr_filter_data *address_filter = data;
	struct msft_rp_le_monitor_advertisement *rp;
	struct msft_cp_le_monitor_advertisement *cp;
	struct msft_data *msft = hdev->msft_data;
	struct sk_buff *skb = NULL;
	bool remove = false;
	size_t size;

	if (!msft) {
		bt_dev_err(hdev, "MSFT: msft data is freed");
		return -EINVAL;
	}

	/* The address filter has been removed by hci dev close */
	if (!test_bit(HCI_UP, &hdev->flags))
		return -ENODEV;

	/* We are safe to use the address filter from now on.
	 * msft_monitor_device_evt() wouldn't delete this filter because it's
	 * not been added by now.
	 * And all other functions that require hci_req_sync_lock wouldn't
	 * touch this filter before this func completes because it's protected
	 * by hci_req_sync_lock.
	 */

	if (address_filter->state == AF_STATE_REMOVING) {
		mutex_lock(&msft->filter_lock);
		list_del(&address_filter->list);
		mutex_unlock(&msft->filter_lock);
		kfree(address_filter);
		return 0;
	}

	/* Command payload: fixed header + address type + address */
	size = sizeof(*cp) +
	       sizeof(address_filter->addr_type) +
	       sizeof(address_filter->bdaddr);
	cp = kzalloc(size, GFP_KERNEL);
	if (!cp) {
		bt_dev_err(hdev, "MSFT: Alloc cmd param err");
		remove = true;
		goto done;
	}

	cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT;
	cp->rssi_high = address_filter->rssi_high;
	cp->rssi_low = address_filter->rssi_low;
	cp->rssi_low_interval = address_filter->rssi_low_interval;
	cp->rssi_sampling_period = address_filter->rssi_sampling_period;
	cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR;
	cp->data[0] = address_filter->addr_type;
	memcpy(&cp->data[1], &address_filter->bdaddr,
	       sizeof(address_filter->bdaddr));

	skb = __hci_cmd_sync(hdev, hdev->msft_opcode, size, cp,
			     HCI_CMD_TIMEOUT);
	kfree(cp);

	if (IS_ERR(skb)) {
		bt_dev_err(hdev, "Failed to enable address %pMR filter",
			   &address_filter->bdaddr);
		/* Reset so that the kfree_skb() below is a no-op */
		skb = NULL;
		remove = true;
		goto done;
	}

	rp = skb_pull_data(skb, sizeof(*rp));
	if (!rp || rp->sub_opcode != MSFT_OP_LE_MONITOR_ADVERTISEMENT ||
	    rp->status)
		remove = true;

done:
	mutex_lock(&msft->filter_lock);

	if (remove) {
		bt_dev_warn(hdev, "MSFT: Remove address (%pMR) filter",
			    &address_filter->bdaddr);
		list_del(&address_filter->list);
		kfree(address_filter);
	} else {
		/* rp is valid here: every path that leaves it unset or
		 * rejects the reply also sets remove.
		 */
		address_filter->state = AF_STATE_ADDED;
		address_filter->msft_handle = rp->handle;
		bt_dev_dbg(hdev, "MSFT: Address %pMR filter enabled",
			   &address_filter->bdaddr);
	}
	mutex_unlock(&msft->filter_lock);
	kfree_skb(skb);

	return 0;
}

/* This function requires the caller holds msft->filter_lock */
static struct
msft_monitor_addr_filter_data *msft_add_address_filter
		(struct hci_dev *hdev, u8 addr_type, bdaddr_t *bdaddr,
		 struct msft_monitor_advertisement_handle_data *handle_data)
{
	/* Creates an address filter in AF_STATE_ADDING that inherits the
	 * handles and RSSI parameters of @handle_data, links it into
	 * msft->address_filters and queues msft_add_address_filter_sync()
	 * to install it. Returns the new filter, or NULL when allocation
	 * or queueing fails (in which case nothing stays linked).
	 */
	struct msft_monitor_addr_filter_data *address_filter = NULL;
	struct msft_data *msft = hdev->msft_data;
	int err;

	address_filter = kzalloc_obj(*address_filter);
	if (!address_filter)
		return NULL;

	address_filter->state = AF_STATE_ADDING;
	/* Placeholder; the real handle is stored by
	 * msft_add_address_filter_sync() from the controller's reply.
	 */
	address_filter->msft_handle = 0xff;
	address_filter->pattern_handle = handle_data->msft_handle;
	address_filter->mgmt_handle = handle_data->mgmt_handle;
	address_filter->rssi_high = handle_data->rssi_high;
	address_filter->rssi_low = handle_data->rssi_low;
	address_filter->rssi_low_interval = handle_data->rssi_low_interval;
	address_filter->rssi_sampling_period = handle_data->rssi_sampling_period;
	address_filter->addr_type = addr_type;
	bacpy(&address_filter->bdaddr, bdaddr);

	/* With the above AF_STATE_ADDING, duplicated address filter can be
	 * avoided when receiving monitor device event (found/lost) frequently
	 * for the same device.
	 */
	list_add_tail(&address_filter->list, &msft->address_filters);

	err = hci_cmd_sync_queue(hdev, msft_add_address_filter_sync,
				 address_filter, NULL);
	if (err < 0) {
		bt_dev_err(hdev, "MSFT: Add address %pMR filter err", bdaddr);
		list_del(&address_filter->list);
		kfree(address_filter);
		return NULL;
	}

	bt_dev_dbg(hdev, "MSFT: Add device %pMR address filter",
		   &address_filter->bdaddr);

	return address_filter;
}

/* msft_monitor_device_evt() - handle MSFT_EV_LE_MONITOR_DEVICE
 *
 * Without HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER the event is reported
 * straight against the pattern monitor found in the handle map.
 *
 * With the quirk, events from pattern monitors are never reported; a
 * "found" event from a pattern monitor instead creates an address filter
 * for the device, and only events carrying the handle of an
 * AF_STATE_ADDED address filter are reported as device found/lost. A
 * "lost" event additionally queues cancellation of that address filter.
 *
 * This function requires the caller holds hdev->lock. Its only caller in
 * view, msft_vendor_evt(), also holds msft->filter_lock.
 */
static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
	struct msft_monitor_addr_filter_data *n, *address_filter = NULL;
	struct msft_ev_le_monitor_device *ev;
	struct msft_monitor_advertisement_handle_data *handle_data;
	struct msft_data *msft = hdev->msft_data;
	u16 mgmt_handle = 0xffff;
	u8 addr_type;

	ev = msft_skb_pull(hdev, skb, MSFT_EV_LE_MONITOR_DEVICE, sizeof(*ev));
	if (!ev)
		return;

	bt_dev_dbg(hdev,
		   "MSFT vendor event 0x%02x: handle 0x%04x state %d addr %pMR",
		   MSFT_EV_LE_MONITOR_DEVICE, ev->monitor_handle,
		   ev->monitor_state, &ev->bdaddr);

	handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false);

	if (!hci_test_quirk(hdev, HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER)) {
		if (!handle_data)
			return;
		mgmt_handle = handle_data->mgmt_handle;
		goto report_state;
	}

	if (handle_data) {
		/* Don't report any device found/lost event from pattern
		 * monitors. Pattern monitor always has its address filters for
		 * tracking devices.
		 */

		address_filter = msft_find_address_data(hdev, ev->addr_type,
							&ev->bdaddr,
							handle_data->msft_handle);
		if (address_filter)
			return;

		if (ev->monitor_state && handle_data->cond_type ==
				MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN)
			msft_add_address_filter(hdev, ev->addr_type,
						&ev->bdaddr, handle_data);

		return;
	}

	/* This device event is not from pattern monitor.
	 * Report it if there is a corresponding address_filter for it.
	 */
	list_for_each_entry(n, &msft->address_filters, list) {
		if (n->state == AF_STATE_ADDED &&
		    n->msft_handle == ev->monitor_handle) {
			mgmt_handle = n->mgmt_handle;
			address_filter = n;
			break;
		}
	}

	if (!address_filter) {
		bt_dev_warn(hdev, "MSFT: Unexpected device event %pMR, %u, %u",
			    &ev->bdaddr, ev->monitor_handle,
			    ev->monitor_state);
		return;
	}

report_state:
	/* Translate the HCI address type into the mgmt address type */
	switch (ev->addr_type) {
	case ADDR_LE_DEV_PUBLIC:
		addr_type = BDADDR_LE_PUBLIC;
		break;

	case ADDR_LE_DEV_RANDOM:
		addr_type = BDADDR_LE_RANDOM;
		break;

	default:
		bt_dev_err(hdev,
			   "MSFT vendor event 0x%02x: unknown addr type 0x%02x",
			   MSFT_EV_LE_MONITOR_DEVICE, ev->addr_type);
		return;
	}

	if (ev->monitor_state) {
		msft_device_found(hdev, &ev->bdaddr, addr_type, mgmt_handle);
	} else {
		if (address_filter && address_filter->state == AF_STATE_ADDED) {
			address_filter->state = AF_STATE_REMOVING;
			/* NOTE(review): the return value is ignored; if
			 * queueing fails the filter stays linked in
			 * AF_STATE_REMOVING - confirm this is acceptable.
			 */
			hci_cmd_sync_queue(hdev,
					   msft_cancel_address_filter_sync,
					   address_filter,
					   NULL);
		}
		msft_device_lost(hdev, &ev->bdaddr, addr_type, mgmt_handle);
	}
}

/* msft_vendor_evt() - entry point for vendor events of the MSFT extension
 *
 * Silently ignores the event when no msft_data is attached, when the
 * configured event prefix does not match, or when no event code follows.
 * MSFT_EV_LE_MONITOR_DEVICE is dispatched with both the hdev lock and
 * msft->filter_lock held; every other code is only logged at debug level.
 */
void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb)
{
	struct msft_data *msft = hdev->msft_data;
	u8 *evt_prefix;
	u8 *evt;

	if (!msft)
		return;

	/* When the extension has defined an event prefix, check that it
	 * matches, and otherwise just return.
	 */
	if (msft->evt_prefix_len > 0) {
		evt_prefix = msft_skb_pull(hdev, skb, 0, msft->evt_prefix_len);
		if (!evt_prefix)
			return;

		if (memcmp(evt_prefix, msft->evt_prefix, msft->evt_prefix_len))
			return;
	}

	/* Every event starts at least with an event code and the rest of
	 * the data is variable and depends on the event code.
	 */
	if (skb->len < 1)
		return;

	evt = msft_skb_pull(hdev, skb, 0, sizeof(*evt));
	if (!evt)
		return;

	hci_dev_lock(hdev);

	switch (*evt) {
	case MSFT_EV_LE_MONITOR_DEVICE:
		mutex_lock(&msft->filter_lock);
		msft_monitor_device_evt(hdev, skb);
		mutex_unlock(&msft->filter_lock);
		break;

	default:
		bt_dev_dbg(hdev, "MSFT vendor event 0x%02x", *evt);
		break;
	}

	hci_dev_unlock(hdev);
}

/* msft_get_features() - cached feature mask, or 0 when no msft_data exists */
__u64 msft_get_features(struct hci_dev *hdev)
{
	struct msft_data *msft = hdev->msft_data;

	return msft ? msft->features : 0;
}

/* msft_le_set_advertisement_filter_enable_cb() - record the filter state
 * @user_data: the struct msft_cp_le_set_advertisement_filter_enable that
 *	       was sent
 * @status: result of the command
 *
 * Stores cp->enable in msft->filter_enabled when @status is 0x00 or 0x0C;
 * any other status leaves the cached state unchanged. msft is dereferenced
 * without a check; the only caller in view, msft_set_filter_enable(),
 * verifies it first.
 */
static void msft_le_set_advertisement_filter_enable_cb(struct hci_dev *hdev,
						       void *user_data,
						       u8 status)
{
	struct msft_cp_le_set_advertisement_filter_enable *cp = user_data;
	struct msft_data *msft = hdev->msft_data;

	/* Error 0x0C would be returned if the filter enabled status is
	 * already set to whatever we were trying to set.
	 * Although the default state should be disabled, some controller set
	 * the initial value to enabled. Because there is no way to know the
	 * actual initial value before sending this command, here we also treat
	 * error 0x0C as success.
	 */
	if (status != 0x00 && status != 0x0C)
		return;

	hci_dev_lock(hdev);

	msft->filter_enabled = cp->enable;

	if (status == 0x0C)
		bt_dev_warn(hdev, "MSFT filter_enable is already %s",
			    cp->enable ? "on" : "off");

	hci_dev_unlock(hdev);
}

/* msft_add_monitor_pattern() - offload @monitor to the controller
 *
 * Return: -EOPNOTSUPP without msft_data, -EBUSY while a suspend or resume
 * is in progress, otherwise the result of msft_add_monitor_sync().
 *
 * This function requires the caller holds hci_req_sync_lock
 */
int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor)
{
	struct msft_data *msft = hdev->msft_data;

	if (!msft)
		return -EOPNOTSUPP;

	if (msft->resuming || msft->suspending)
		return -EBUSY;

	return msft_add_monitor_sync(hdev, monitor);
}

/* msft_remove_monitor() - remove @monitor from the controller
 *
 * Return: -EOPNOTSUPP without msft_data, -EBUSY while a suspend or resume
 * is in progress, otherwise the result of msft_remove_monitor_sync().
 *
 * This function requires the caller holds hci_req_sync_lock
 */
int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
{
	struct msft_data *msft = hdev->msft_data;

	if (!msft)
		return -EOPNOTSUPP;

	if (msft->resuming || msft->suspending)
		return -EBUSY;

	return msft_remove_monitor_sync(hdev, monitor);
}

/* msft_set_filter_enable() - enable or disable the advertisement filter
 *
 * Sends MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE and hands the result to
 * msft_le_set_advertisement_filter_enable_cb().
 *
 * Return: -EOPNOTSUPP without msft_data, otherwise always 0; a failure of
 * the command itself is not reported to the caller.
 */
int msft_set_filter_enable(struct hci_dev *hdev, bool enable)
{
	struct msft_cp_le_set_advertisement_filter_enable cp;
	struct msft_data *msft = hdev->msft_data;
	int err;

	if (!msft)
		return -EOPNOTSUPP;

	cp.sub_opcode = MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE;
	cp.enable = enable;
	err = __hci_cmd_sync_status(hdev, hdev->msft_opcode, sizeof(cp), &cp,
				    HCI_CMD_TIMEOUT);

	/* NOTE(review): err is an int but the callback takes a u8 status,
	 * so the value is converted to u8 on the call - confirm that
	 * negative error codes cannot alias 0x00 or 0x0C after conversion.
	 */
	msft_le_set_advertisement_filter_enable_cb(hdev, &cp, err);

	return 0;
}

/* msft_curve_validity() - value of the hdev->msft_curve_validity flag */
bool msft_curve_validity(struct hci_dev *hdev)
{
	return hdev->msft_curve_validity;
}
11 7 66 66 65 7 7 7 7 5 5 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 
511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 /* * net/tipc/monitor.c * * Copyright (c) 2016, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/

#include <net/genetlink.h>
#include "core.h"
#include "addr.h"
#include "monitor.h"
#include "bearer.h"

/* Capacity of the members[] array of a domain record; dom_size() never
 * returns more than this.
 */
#define MAX_MON_DOMAIN 64
/* Base period of the monitor timer; passed to msecs_to_jiffies() */
#define MON_TIMEOUT 120000
/* down_cnt value from which tipc_mon_get_state() requests a reset */
#define MAX_PEER_DOWN_EVENTS 4

/* struct tipc_mon_domain: domain record to be transferred between peers
 * @len: actual size of domain record
 * @gen: current generation of sender's domain
 * @ack_gen: most recent generation of self's domain acked by peer
 * @member_cnt: number of domain member nodes described in this record
 * @up_map: bit map indicating which of the members the sender considers up
 * @members: identity of the domain members
 */
struct tipc_mon_domain {
	u16 len;
	u16 gen;
	u16 ack_gen;
	u16 member_cnt;
	u64 up_map;
	u32 members[MAX_MON_DOMAIN];
};

/* struct tipc_peer: state of a peer node and its domain
 * @addr: tipc node identity of peer
 * @domain: most recent domain record from peer
 * @hash: position in hashed lookup list
 * @list: position in linked list, in circular ascending order by 'addr'
 * @applied: number of reported domain members applied on this monitor list
 * @down_cnt: number of other peers which have reported this one as lost
 * @is_up: peer is up as seen from this node
 * @is_head: peer is assigned domain head as seen from this node
 * @is_local: peer is in local domain and should be continuously monitored
 */
struct tipc_peer {
	u32 addr;
	struct tipc_mon_domain *domain;
	struct hlist_node hash;
	struct list_head list;
	u8 applied;
	u8 down_cnt;
	bool is_up;
	bool is_head;
	bool is_local;
};

/* struct tipc_monitor: per-bearer monitoring state
 * @peers: hashed lookup lists of peers, keyed by tipc_hashfn(addr)
 * @peer_cnt: number of peers on the monitor list, own node included
 * @self: peer entry representing the own node
 * @lock: protects the peer lists and the domain records
 * @cache: own domain record, pre-converted for transmission
 * @list_gen: incremented each time peer roles are reassigned
 * @dom_gen: generation of the own domain record
 * @net: network namespace this monitor belongs to
 * @timer: periodic timer running mon_timeout()
 * @timer_intv: period of @timer, in jiffies
 */
struct tipc_monitor {
	struct hlist_head peers[NODE_HTABLE_SIZE];
	int peer_cnt;
	struct tipc_peer *self;
	rwlock_t lock;
	struct tipc_mon_domain cache;
	u16 list_gen;
	u16 dom_gen;
	struct net *net;
	struct timer_list timer;
	unsigned long timer_intv;
};

/* tipc_monitor(): look up the monitor of a bearer; the slot may be NULL */
static struct tipc_monitor *tipc_monitor(struct net *net, int bearer_id)
{
	return tipc_net(net)->monitors[bearer_id];
}

/* Size of a domain record carrying the maximum number of members */
const int tipc_max_domain_size = sizeof(struct tipc_mon_domain);

static inline u16
mon_cpu_to_le16(u16 val) { return (__force __u16)htons(val); } static inline u32 mon_cpu_to_le32(u32 val) { return (__force __u32)htonl(val); } static inline u64 mon_cpu_to_le64(u64 val) { return (__force __u64)cpu_to_be64(val); } static inline u16 mon_le16_to_cpu(u16 val) { return ntohs((__force __be16)val); } static inline u32 mon_le32_to_cpu(u32 val) { return ntohl((__force __be32)val); } static inline u64 mon_le64_to_cpu(u64 val) { return be64_to_cpu((__force __be64)val); } /* dom_rec_len(): actual length of domain record for transport */ static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt) { return (offsetof(struct tipc_mon_domain, members)) + (mcnt * sizeof(u32)); } /* dom_size() : calculate size of own domain based on number of peers */ static int dom_size(int peers) { int i = 0; while ((i * i) < peers) i++; return min(i, MAX_MON_DOMAIN); } static void map_set(u64 *up_map, int i, unsigned int v) { *up_map &= ~(1ULL << i); *up_map |= ((u64)v << i); } static int map_get(u64 up_map, int i) { return (up_map & (1ULL << i)) >> i; } static struct tipc_peer *peer_prev(struct tipc_peer *peer) { return list_last_entry(&peer->list, struct tipc_peer, list); } static struct tipc_peer *peer_nxt(struct tipc_peer *peer) { return list_first_entry(&peer->list, struct tipc_peer, list); } static struct tipc_peer *peer_head(struct tipc_peer *peer) { while (!peer->is_head) peer = peer_prev(peer); return peer; } static struct tipc_peer *get_peer(struct tipc_monitor *mon, u32 addr) { struct tipc_peer *peer; unsigned int thash = tipc_hashfn(addr); hlist_for_each_entry(peer, &mon->peers[thash], hash) { if (peer->addr == addr) return peer; } return NULL; } static struct tipc_peer *get_self(struct net *net, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); return mon->self; } static inline bool tipc_mon_is_active(struct net *net, struct tipc_monitor *mon) { struct tipc_net *tn = tipc_net(net); return mon->peer_cnt > tn->mon_threshold; } /* 
mon_identify_lost_members() : - identify amd mark potentially lost members */ static void mon_identify_lost_members(struct tipc_peer *peer, struct tipc_mon_domain *dom_bef, int applied_bef) { struct tipc_peer *member = peer; struct tipc_mon_domain *dom_aft = peer->domain; int applied_aft = peer->applied; int i; for (i = 0; i < applied_bef; i++) { member = peer_nxt(member); /* Do nothing if self or peer already see member as down */ if (!member->is_up || !map_get(dom_bef->up_map, i)) continue; /* Loss of local node must be detected by active probing */ if (member->is_local) continue; /* Start probing if member was removed from applied domain */ if (!applied_aft || (applied_aft < i)) { member->down_cnt = 1; continue; } /* Member loss is confirmed if it is still in applied domain */ if (!map_get(dom_aft->up_map, i)) member->down_cnt++; } } /* mon_apply_domain() : match a peer's domain record against monitor list */ static void mon_apply_domain(struct tipc_monitor *mon, struct tipc_peer *peer) { struct tipc_mon_domain *dom = peer->domain; struct tipc_peer *member; u32 addr; int i; if (!dom || !peer->is_up) return; /* Scan across domain members and match against monitor list */ peer->applied = 0; member = peer_nxt(peer); for (i = 0; i < dom->member_cnt; i++) { addr = dom->members[i]; if (addr != member->addr) return; peer->applied++; member = peer_nxt(member); } } /* mon_update_local_domain() : update after peer addition/removal/up/down */ static void mon_update_local_domain(struct tipc_monitor *mon) { struct tipc_peer *self = mon->self; struct tipc_mon_domain *cache = &mon->cache; struct tipc_mon_domain *dom = self->domain; struct tipc_peer *peer = self; u64 prev_up_map = dom->up_map; u16 member_cnt, i; bool diff; /* Update local domain size based on current size of cluster */ member_cnt = dom_size(mon->peer_cnt) - 1; self->applied = member_cnt; /* Update native and cached outgoing local domain records */ dom->len = dom_rec_len(dom, member_cnt); diff = dom->member_cnt 
!= member_cnt; dom->member_cnt = member_cnt; for (i = 0; i < member_cnt; i++) { peer = peer_nxt(peer); diff |= dom->members[i] != peer->addr; dom->members[i] = peer->addr; map_set(&dom->up_map, i, peer->is_up); cache->members[i] = mon_cpu_to_le32(peer->addr); } diff |= dom->up_map != prev_up_map; if (!diff) return; dom->gen = ++mon->dom_gen; cache->len = mon_cpu_to_le16(dom->len); cache->gen = mon_cpu_to_le16(dom->gen); cache->member_cnt = mon_cpu_to_le16(member_cnt); cache->up_map = mon_cpu_to_le64(dom->up_map); mon_apply_domain(mon, self); } /* mon_update_neighbors() : update preceding neighbors of added/removed peer */ static void mon_update_neighbors(struct tipc_monitor *mon, struct tipc_peer *peer) { int dz, i; dz = dom_size(mon->peer_cnt); for (i = 0; i < dz; i++) { mon_apply_domain(mon, peer); peer = peer_prev(peer); } } /* mon_assign_roles() : reassign peer roles after a network change * The monitor list is consistent at this stage; i.e., each peer is monitoring * a set of domain members as matched between domain record and the monitor list */ static void mon_assign_roles(struct tipc_monitor *mon, struct tipc_peer *head) { struct tipc_peer *peer = peer_nxt(head); struct tipc_peer *self = mon->self; int i = 0; for (; peer != self; peer = peer_nxt(peer)) { peer->is_local = false; /* Update domain member */ if (i++ < head->applied) { peer->is_head = false; if (head == self) peer->is_local = true; continue; } /* Assign next domain head */ if (!peer->is_up) continue; if (peer->is_head) break; head = peer; head->is_head = true; i = 0; } mon->list_gen++; } void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *prev, *head; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer) goto exit; prev = peer_prev(peer); list_del(&peer->list); hlist_del(&peer->hash); kfree(peer->domain); 
kfree(peer); mon->peer_cnt--; head = peer_head(prev); if (head == self) mon_update_local_domain(mon); mon_update_neighbors(mon, prev); /* Revert to full-mesh monitoring if we reach threshold */ if (!tipc_mon_is_active(net, mon)) { list_for_each_entry(peer, &self->list, list) { kfree(peer->domain); peer->domain = NULL; peer->applied = 0; } } mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } static bool tipc_mon_add_peer(struct tipc_monitor *mon, u32 addr, struct tipc_peer **peer) { struct tipc_peer *self = mon->self; struct tipc_peer *cur, *prev, *p; p = kzalloc_obj(*p, GFP_ATOMIC); *peer = p; if (!p) return false; p->addr = addr; /* Add new peer to lookup list */ INIT_LIST_HEAD(&p->list); hlist_add_head(&p->hash, &mon->peers[tipc_hashfn(addr)]); /* Sort new peer into iterator list, in ascending circular order */ prev = self; list_for_each_entry(cur, &self->list, list) { if ((addr > prev->addr) && (addr < cur->addr)) break; if (((addr < cur->addr) || (addr > prev->addr)) && (prev->addr > cur->addr)) break; prev = cur; } list_add_tail(&p->list, &cur->list); mon->peer_cnt++; mon_update_neighbors(mon, p); return true; } void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self = get_self(net, bearer_id); struct tipc_peer *peer, *head; write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer && !tipc_mon_add_peer(mon, addr, &peer)) goto exit; peer->is_up = true; head = peer_head(peer); if (head == self) mon_update_local_domain(mon); mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *head; struct tipc_mon_domain *dom; int applied; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer) { pr_warn("Mon: unknown link 
%x/%u DOWN\n", addr, bearer_id); goto exit; } applied = peer->applied; peer->applied = 0; dom = peer->domain; peer->domain = NULL; if (peer->is_head) mon_identify_lost_members(peer, dom, applied); kfree(dom); peer->is_up = false; peer->is_head = false; peer->is_local = false; peer->down_cnt = 0; head = peer_head(peer); if (head == self) mon_update_local_domain(mon); mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } /* tipc_mon_rcv - process monitor domain event message */ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_mon_domain *arrv_dom = data; struct tipc_mon_domain dom_bef; struct tipc_mon_domain *dom; struct tipc_peer *peer; u16 new_member_cnt = mon_le16_to_cpu(arrv_dom->member_cnt); int new_dlen = dom_rec_len(arrv_dom, new_member_cnt); u16 new_gen = mon_le16_to_cpu(arrv_dom->gen); u16 acked_gen = mon_le16_to_cpu(arrv_dom->ack_gen); u16 arrv_dlen = mon_le16_to_cpu(arrv_dom->len); bool probing = state->probing; int i, applied_bef; state->probing = false; /* Sanity check received domain record */ if (new_member_cnt > MAX_MON_DOMAIN) return; if (dlen < dom_rec_len(arrv_dom, 0)) return; if (dlen != dom_rec_len(arrv_dom, new_member_cnt)) return; if (dlen < new_dlen || arrv_dlen != new_dlen) return; /* Synch generation numbers with peer if link just came up */ if (!state->synched) { state->peer_gen = new_gen - 1; state->acked_gen = acked_gen; state->synched = true; } if (more(acked_gen, state->acked_gen)) state->acked_gen = acked_gen; /* Drop duplicate unless we are waiting for a probe response */ if (!more(new_gen, state->peer_gen) && !probing) return; write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer || !peer->is_up) goto exit; /* Peer is confirmed, stop any ongoing probing */ peer->down_cnt = 0; /* Task is done for duplicate record */ if (!more(new_gen, state->peer_gen)) goto exit; state->peer_gen = 
new_gen;

	/* Cache current domain record for later use */
	dom_bef.member_cnt = 0;
	dom = peer->domain;
	if (dom)
		memcpy(&dom_bef, dom, dom->len);

	/* Transform and store received domain record */
	if (!dom || (dom->len < new_dlen)) {
		kfree(dom);
		dom = kmalloc(new_dlen, GFP_ATOMIC);
		peer->domain = dom;
		if (!dom)
			goto exit;
	}
	dom->len = new_dlen;
	dom->gen = new_gen;
	dom->member_cnt = new_member_cnt;
	dom->up_map = mon_le64_to_cpu(arrv_dom->up_map);
	for (i = 0; i < new_member_cnt; i++)
		dom->members[i] = mon_le32_to_cpu(arrv_dom->members[i]);

	/* Update peers affected by this domain record */
	applied_bef = peer->applied;
	mon_apply_domain(mon, peer);
	mon_identify_lost_members(peer, &dom_bef, applied_bef);
	mon_assign_roles(mon, peer_head(peer));
exit:
	write_unlock_bh(&mon->lock);
}

/*
 * tipc_mon_prep - fill @data with the domain record to be sent
 * @net: namespace used to look up the monitor
 * @data: output buffer, accessed as a struct tipc_mon_domain
 * @dlen: set to the record length, except on the inactive path
 * @state: supplies acked_gen and peer_gen for this peer
 * @bearer_id: index of the monitor to use
 *
 * Three outcomes:
 *  - monitor not active: only dom->len is written (as 0); *dlen is
 *    left untouched;
 *  - state->acked_gen equals the monitor's dom_gen: a record with no
 *    members is built in place;
 *  - otherwise mon->cache is copied out under the read lock.
 */
void tipc_mon_prep(struct net *net, void *data, int *dlen,
		   struct tipc_mon_state *state, int bearer_id)
{
	struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
	struct tipc_mon_domain *dom = data;
	/* Sampled before any lock is taken */
	u16 gen = mon->dom_gen;
	u16 len;

	/* Send invalid record if not active */
	if (!tipc_mon_is_active(net, mon)) {
		dom->len = 0;
		return;
	}

	/* Send only a dummy record with ack if peer has acked our last sent */
	if (likely(state->acked_gen == gen)) {
		len = dom_rec_len(dom, 0);
		*dlen = len;
		dom->len = mon_cpu_to_le16(len);
		dom->gen = mon_cpu_to_le16(gen);
		dom->ack_gen = mon_cpu_to_le16(state->peer_gen);
		dom->member_cnt = 0;
		return;
	}
	/* Send the full record */
	read_lock_bh(&mon->lock);
	/* The cached length is converted with mon_le16_to_cpu() before use */
	len = mon_le16_to_cpu(mon->cache.len);
	*dlen = len;
	memcpy(data, &mon->cache, len);
	read_unlock_bh(&mon->lock);
	/* ack_gen is per peer, so it is patched into the copy afterwards */
	dom->ack_gen = mon_cpu_to_le16(state->peer_gen);
}

/*
 * tipc_mon_get_state - refresh the monitoring state kept for one peer
 * @net: namespace used to look up the monitor
 * @addr: address of the peer to look up
 * @state: state block to update in place
 * @bearer_id: index of the monitor to use
 *
 * When the monitor is not active, probing is cleared and monitoring is
 * forced on. Otherwise @state is left alone if it is not probing and
 * both of its generations match the monitor's; if not, it is
 * recomputed from the peer entry, provided one is found.
 */
void tipc_mon_get_state(struct net *net, u32 addr,
			struct tipc_mon_state *state,
			int bearer_id)
{
	struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
	struct tipc_peer *peer;

	if (!tipc_mon_is_active(net, mon)) {
		state->probing = false;
		state->monitoring = true;
		return;
	}

	/* Use cached state if table has not changed */
	if (!state->probing &&
	    (state->list_gen == mon->list_gen) &&
	    (state->acked_gen == mon->dom_gen))
		return;

	read_lock_bh(&mon->lock);
	peer = get_peer(mon, addr);
	if (peer) {
		state->probing = state->acked_gen != mon->dom_gen;
		state->probing |= peer->down_cnt;
		/* reset is only ever OR-ed in here, never cleared */
		state->reset |= peer->down_cnt >= MAX_PEER_DOWN_EVENTS;
		state->monitoring = peer->is_local;
		state->monitoring |= peer->is_head;
		state->list_gen = mon->list_gen;
	}
	read_unlock_bh(&mon->lock);
}

/*
 * Timer callback. Under the write lock, if mon->self exists and its
 * applied count differs from dom_size(peer_cnt) - 1, the local domain is
 * updated and roles are reassigned. The timer is always re-armed.
 */
static void mon_timeout(struct timer_list *t)
{
	struct tipc_monitor *mon = timer_container_of(mon, t, timer);
	struct tipc_peer *self;
	/* Computed from peer_cnt before the lock is taken */
	int best_member_cnt = dom_size(mon->peer_cnt) - 1;

	write_lock_bh(&mon->lock);
	self = mon->self;
	if (self && (best_member_cnt != self->applied)) {
		mon_update_local_domain(mon);
		mon_assign_roles(mon, self);
	}
	write_unlock_bh(&mon->lock);
	mod_timer(&mon->timer, jiffies + mon->timer_intv);
}

/*
 * tipc_mon_create - allocate and start the monitor for one bearer
 *
 * Returns 0 if a monitor already exists for @bearer_id or on success,
 * and -ENOMEM if any of the three allocations fails (all three are
 * freed in that case). On success the new monitor is published in
 * tn->monitors[], its self peer is marked up and head, and the timer is
 * armed.
 */
int tipc_mon_create(struct net *net, int bearer_id)
{
	struct tipc_net *tn = tipc_net(net);
	struct tipc_monitor *mon;
	struct tipc_peer *self;
	struct tipc_mon_domain *dom;

	if (tn->monitors[bearer_id])
		return 0;

	mon = kzalloc_obj(*mon, GFP_ATOMIC);
	self = kzalloc_obj(*self, GFP_ATOMIC);
	dom = kzalloc_obj(*dom, GFP_ATOMIC);
	if (!mon || !self || !dom) {
		kfree(mon);
		kfree(self);
		kfree(dom);
		return -ENOMEM;
	}
	tn->monitors[bearer_id] = mon;
	rwlock_init(&mon->lock);
	mon->net = net;
	mon->peer_cnt = 1;
	mon->self = self;
	self->domain = dom;
	self->addr = tipc_own_addr(net);
	self->is_up = true;
	self->is_head = true;
	INIT_LIST_HEAD(&self->list);
	timer_setup(&mon->timer, mon_timeout, 0);
	/* Interval is MON_TIMEOUT plus the low 16 bits of tn->random */
	mon->timer_intv = msecs_to_jiffies(MON_TIMEOUT + (tn->random & 0xffff));
	mod_timer(&mon->timer, jiffies + mon->timer_intv);
	return 0;
}

/*
 * tipc_mon_delete - tear down the monitor for one bearer
 *
 * No-op when no monitor exists. Under the write lock the monitor is
 * unpublished from tn->monitors[], every peer on self's list is unlinked
 * and freed, and mon->self is cleared. The timer is shut down only after
 * the lock has been dropped; self and the monitor are freed last.
 */
void tipc_mon_delete(struct net *net, int bearer_id)
{
	struct tipc_net *tn = tipc_net(net);
	struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
	struct tipc_peer *self;
	struct tipc_peer *peer, *tmp;

	if (!mon)
		return;

	/* self is fetched before the lock and kept for the final kfree()s */
	self = get_self(net, bearer_id);
	write_lock_bh(&mon->lock);
	tn->monitors[bearer_id] = NULL;
	list_for_each_entry_safe(peer, tmp, &self->list, list) {
		list_del(&peer->list);
		hlist_del(&peer->hash);
		kfree(peer->domain);
		kfree(peer);
	}
	/* mon_timeout() checks mon->self for NULL before using it */
	mon->self = NULL;
	write_unlock_bh(&mon->lock);
	timer_shutdown_sync(&mon->timer);
	kfree(self->domain);
	kfree(self);
	kfree(mon);
}

/*
 * Re-read the own node address into the self peer of every existing
 * monitor (bearer ids 0 .. MAX_BEARERS - 1), each under its write lock.
 */
void tipc_mon_reinit_self(struct net *net)
{
	struct tipc_monitor *mon;
	int bearer_id;

	for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
		mon = tipc_monitor(net, bearer_id);
		if (!mon)
			continue;
		write_lock_bh(&mon->lock);
		if (mon->self)
			mon->self->addr = tipc_own_addr(net);
		write_unlock_bh(&mon->lock);
	}
}

/*
 * Store @cluster_size as the monitor threshold of @net.
 * Returns -EINVAL if it exceeds TIPC_CLUSTER_SIZE, 0 otherwise.
 */
int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size)
{
	struct tipc_net *tn = tipc_net(net);

	if (cluster_size > TIPC_CLUSTER_SIZE)
		return -EINVAL;

	tn->mon_threshold = cluster_size;

	return 0;
}

/* Return the monitor threshold currently stored for @net. */
int tipc_nl_monitor_get_threshold(struct net *net)
{
	struct tipc_net *tn = tipc_net(net);

	return tn->mon_threshold;
}

/*
 * Emit one TIPC_NL_MON_PEER_GET message describing @peer into msg->skb.
 *
 * Returns 0 on success. If any put fails, the nest and the message
 * header are cancelled and -EMSGSIZE is returned.
 */
static int __tipc_nl_add_monitor_peer(struct tipc_peer *peer,
				      struct tipc_nl_msg *msg)
{
	struct tipc_mon_domain *dom = peer->domain;
	struct nlattr *attrs;
	void *hdr;

	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
			  NLM_F_MULTI, TIPC_NL_MON_PEER_GET);
	if (!hdr)
		return -EMSGSIZE;

	attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON_PEER);
	if (!attrs)
		goto msg_full;

	if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_ADDR, peer->addr))
		goto attr_msg_full;
	if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_APPLIED, peer->applied))
		goto attr_msg_full;

	/* Boolean properties are sent as flag attributes, only when set */
	if (peer->is_up)
		if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_UP))
			goto attr_msg_full;
	if (peer->is_local)
		if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_LOCAL))
			goto attr_msg_full;
	if (peer->is_head)
		if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_HEAD))
			goto attr_msg_full;

	/* Domain attributes are only present when the peer has a record */
	if (dom) {
		if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_DOMGEN, dom->gen))
			goto attr_msg_full;
		if (nla_put_u64_64bit(msg->skb, TIPC_NLA_MON_PEER_UPMAP,
				      dom->up_map, TIPC_NLA_MON_PEER_PAD))
			goto attr_msg_full;
		if (nla_put(msg->skb, TIPC_NLA_MON_PEER_MEMBERS,
			    dom->member_cnt * sizeof(u32), &dom->members))
			goto attr_msg_full;
	}

	nla_nest_end(msg->skb, attrs);
	genlmsg_end(msg->skb, hdr);
	return 0;

attr_msg_full:
	nla_nest_cancel(msg->skb, attrs);
msg_full:
	genlmsg_cancel(msg->skb, hdr);

	return -EMSGSIZE;
}

/*
 * tipc_nl_add_monitor_peer - dump the peers of one monitor
 * @net: namespace used to look up the monitor
 * @msg: netlink message being filled
 * @bearer_id: index of the monitor to dump
 * @prev_node: in/out resume cursor
 *
 * Walks the peers starting at mon->self until peer_nxt() returns
 * mon->self again. A non-zero *prev_node makes the walk skip peers
 * until one with that address is found; it is then cleared and dumping
 * starts with that peer. When a peer does not fit, its address is
 * stored in *prev_node and -EMSGSIZE is returned.
 *
 * Returns -EINVAL if there is no monitor, 0 when the walk completes.
 */
int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg,
			     u32 bearer_id, u32 *prev_node)
{
	struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
	struct tipc_peer *peer;

	if (!mon)
		return -EINVAL;

	read_lock_bh(&mon->lock);
	peer = mon->self;
	do {
		if (*prev_node) {
			if (peer->addr == *prev_node)
				*prev_node = 0;
			else
				/* In a do-while, continue jumps to the loop condition */
				continue;
		}
		if (__tipc_nl_add_monitor_peer(peer, msg)) {
			*prev_node = peer->addr;
			read_unlock_bh(&mon->lock);
			return -EMSGSIZE;
		}
	} while ((peer = peer_nxt(peer)) != mon->self);
	read_unlock_bh(&mon->lock);

	return 0;
}

/*
 * __tipc_nl_add_monitor - emit one TIPC_NL_MON_GET message for a bearer
 *
 * Returns 0 without emitting anything when the bearer name cannot be
 * obtained or no monitor exists, 0 after a successful emit, and
 * -EMSGSIZE (message cancelled) when the skb has no room. The monitor
 * attributes are written under the read lock.
 */
int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg,
			  u32 bearer_id)
{
	struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
	char bearer_name[TIPC_MAX_BEARER_NAME];
	struct nlattr *attrs;
	void *hdr;
	int ret;

	ret = tipc_bearer_get_name(net, bearer_name, bearer_id);
	if (ret || !mon)
		return 0;

	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
			  NLM_F_MULTI, TIPC_NL_MON_GET);
	if (!hdr)
		return -EMSGSIZE;

	attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON);
	if (!attrs)
		goto msg_full;

	read_lock_bh(&mon->lock);
	if (nla_put_u32(msg->skb, TIPC_NLA_MON_REF, bearer_id))
		goto attr_msg_full;
	if (tipc_mon_is_active(net, mon))
		if (nla_put_flag(msg->skb, TIPC_NLA_MON_ACTIVE))
			goto attr_msg_full;
	if (nla_put_string(msg->skb, TIPC_NLA_MON_BEARER_NAME, bearer_name))
		goto attr_msg_full;
	if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEERCNT, mon->peer_cnt))
		goto attr_msg_full;
	if (nla_put_u32(msg->skb, TIPC_NLA_MON_LISTGEN, mon->list_gen))
		goto attr_msg_full;

	read_unlock_bh(&mon->lock);
	nla_nest_end(msg->skb, attrs);
	genlmsg_end(msg->skb, hdr);

	return 0;

attr_msg_full:
	/* Only reached with the read lock held */
	read_unlock_bh(&mon->lock);
	nla_nest_cancel(msg->skb, attrs);
msg_full:
	genlmsg_cancel(msg->skb, hdr);

	return -EMSGSIZE;
}
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2019, 2020 Amazon.com, Inc. or its affiliates. All rights reserved. * * User extended attribute client side cache functions. * * Author: Frank van der Linden <fllinden@amazon.com> */ #include <linux/errno.h> #include <linux/nfs_fs.h> #include <linux/hashtable.h> #include <linux/refcount.h> #include <uapi/linux/xattr.h> #include "nfs4_fs.h" #include "internal.h" /* * User extended attributes client side caching is implemented by having * a cache structure attached to NFS inodes. This structure is allocated * when needed, and freed when the cache is zapped. * * The cache structure contains a hash table of entries, and a pointer * to a special-cased entry for the listxattr cache. * * Accessing and allocating / freeing the caches is done via reference * counting. The cache entries use a similar refcounting scheme. * * This makes freeing a cache, both from the shrinker and from the * zap cache path, easy. It also means that, in current use cases, * the large majority of inodes will not waste any memory, as they * will never have any user extended attributes assigned to them. * * Attribute entries are hashed into a simple hash table. They are * also part of an LRU. * * There are three shrinkers. * * Two shrinkers deal with the cache entries themselves: one for * large entries (> PAGE_SIZE), and one for smaller entries. The * shrinker for the larger entries works more aggressively than * the one for the smaller entries. * * The other shrinker frees the cache structures themselves. */ /* * 64 buckets is a good default. There is likely no reasonable * workload that uses more than even 64 user extended attributes. 
* You can certainly add a lot more - but you get what you ask for * in those circumstances. */ #define NFS4_XATTR_HASH_SIZE 64 #define NFSDBG_FACILITY NFSDBG_XATTRCACHE struct nfs4_xattr_cache; struct nfs4_xattr_entry; struct nfs4_xattr_bucket { spinlock_t lock; struct hlist_head hlist; struct nfs4_xattr_cache *cache; bool draining; }; struct nfs4_xattr_cache { struct kref ref; struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE]; struct list_head lru; struct list_head dispose; atomic_long_t nent; spinlock_t listxattr_lock; struct inode *inode; struct nfs4_xattr_entry *listxattr; }; struct nfs4_xattr_entry { struct kref ref; struct hlist_node hnode; struct list_head lru; struct list_head dispose; char *xattr_name; void *xattr_value; size_t xattr_size; struct nfs4_xattr_bucket *bucket; uint32_t flags; }; #define NFS4_XATTR_ENTRY_EXTVAL 0x0001 /* * LRU list of NFS inodes that have xattr caches. */ static struct list_lru nfs4_xattr_cache_lru; static struct list_lru nfs4_xattr_entry_lru; static struct list_lru nfs4_xattr_large_entry_lru; static struct kmem_cache *nfs4_xattr_cache_cachep; /* * Hashing helper functions. */ static void nfs4_xattr_hash_init(struct nfs4_xattr_cache *cache) { unsigned int i; for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { INIT_HLIST_HEAD(&cache->buckets[i].hlist); spin_lock_init(&cache->buckets[i].lock); cache->buckets[i].cache = cache; cache->buckets[i].draining = false; } } /* * Locking order: * 1. inode i_lock or bucket lock * 2. list_lru lock (taken by list_lru_* functions) */ /* * Wrapper functions to add a cache entry to the right LRU. */ static bool nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry) { struct list_lru *lru; lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; return list_lru_add_obj(lru, &entry->lru); } static bool nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry) { struct list_lru *lru; lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? 
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; return list_lru_del_obj(lru, &entry->lru); } /* * This function allocates cache entries. They are the normal * extended attribute name/value pairs, but may also be a listxattr * cache. Those allocations use the same entry so that they can be * treated as one by the memory shrinker. * * xattr cache entries are allocated together with names. If the * value fits in to one page with the entry structure and the name, * it will also be part of the same allocation (kmalloc). This is * expected to be the vast majority of cases. Larger allocations * have a value pointer that is allocated separately by kvmalloc. * * Parameters: * * @name: Name of the extended attribute. NULL for listxattr cache * entry. * @value: Value of attribute, or listxattr cache. NULL if the * value is to be copied from pages instead. * @pages: Pages to copy the value from, if not NULL. Passed in to * make it easier to copy the value after an RPC, even if * the value will not be passed up to application (e.g. * for a 'query' getxattr with NULL buffer). * @len: Length of the value. Can be 0 for zero-length attributes. * @value and @pages will be NULL if @len is 0. 
*/
static struct nfs4_xattr_entry *
nfs4_xattr_alloc_entry(const char *name, const void *value,
		       struct page **pages, size_t len)
{
	struct nfs4_xattr_entry *entry;
	void *valp;
	char *namep;
	size_t alloclen, slen;
	char *buf;
	uint32_t flags;

	/* Entry header plus the longest possible name must fit in one page */
	BUILD_BUG_ON(sizeof(struct nfs4_xattr_entry) +
	    XATTR_NAME_MAX + 1 > PAGE_SIZE);

	alloclen = sizeof(struct nfs4_xattr_entry);
	if (name != NULL) {
		/* slen includes the terminating NUL */
		slen = strlen(name) + 1;
		alloclen += slen;
	} else
		slen = 0;

	/* Inline the value only if header + name + value fit in a page */
	if (alloclen + len <= PAGE_SIZE) {
		alloclen += len;
		flags = 0;
	} else {
		flags = NFS4_XATTR_ENTRY_EXTVAL;
	}

	buf = kmalloc(alloclen, GFP_KERNEL);
	if (buf == NULL)
		return NULL;
	entry = (struct nfs4_xattr_entry *)buf;

	/* The name, when present, is stored right after the header */
	if (name != NULL) {
		namep = buf + sizeof(struct nfs4_xattr_entry);
		memcpy(namep, name, slen);
	} else {
		namep = NULL;
	}


	if (flags & NFS4_XATTR_ENTRY_EXTVAL) {
		valp = kvmalloc(len, GFP_KERNEL);
		if (valp == NULL) {
			kfree(buf);
			return NULL;
		}
	} else if (len != 0) {
		/* Inline value follows the header and the name */
		valp = buf + sizeof(struct nfs4_xattr_entry) + slen;
	} else
		valp = NULL;

	if (valp != NULL) {
		if (value != NULL)
			memcpy(valp, value, len);
		else
			_copy_from_pages(valp, pages, 0, len);
	}

	entry->flags = flags;
	entry->xattr_value = valp;
	kref_init(&entry->ref);
	entry->xattr_name = namep;
	entry->xattr_size = len;
	entry->bucket = NULL;
	INIT_LIST_HEAD(&entry->lru);
	INIT_LIST_HEAD(&entry->dispose);
	INIT_HLIST_NODE(&entry->hnode);

	return entry;
}

/*
 * Free an entry. The value is freed separately only when it was
 * allocated separately (EXTVAL); otherwise it is part of the entry
 * allocation itself.
 */
static void
nfs4_xattr_free_entry(struct nfs4_xattr_entry *entry)
{
	if (entry->flags & NFS4_XATTR_ENTRY_EXTVAL)
		kvfree(entry->xattr_value);
	kfree(entry);
}

/*
 * kref release callback for entries. Warns and returns without freeing
 * if the entry's lru list head is not empty.
 */
static void
nfs4_xattr_free_entry_cb(struct kref *kref)
{
	struct nfs4_xattr_entry *entry;

	entry = container_of(kref, struct nfs4_xattr_entry, ref);

	if (WARN_ON(!list_empty(&entry->lru)))
		return;

	nfs4_xattr_free_entry(entry);
}

/*
 * kref release callback for caches. Warns and returns without freeing
 * if any bucket still holds entries. Otherwise the draining flags and
 * the listxattr pointer are reset before the object goes back to the
 * slab cache.
 */
static void
nfs4_xattr_free_cache_cb(struct kref *kref)
{
	struct nfs4_xattr_cache *cache;
	int i;

	cache = container_of(kref, struct nfs4_xattr_cache, ref);

	for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) {
		if (WARN_ON(!hlist_empty(&cache->buckets[i].hlist)))
			return;
		cache->buckets[i].draining = false;
	}

	cache->listxattr = NULL;

	kmem_cache_free(nfs4_xattr_cache_cachep, cache);

}

/*
 * Allocate a cache structure from the slab cache. Only the refcount and
 * the entry count are (re)set here. Returns NULL on allocation failure.
 */
static struct nfs4_xattr_cache *
nfs4_xattr_alloc_cache(void)
{
	struct nfs4_xattr_cache *cache;

	cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, GFP_KERNEL);
	if (cache == NULL)
		return NULL;

	kref_init(&cache->ref);
	atomic_long_set(&cache->nent, 0);

	return cache;
}

/*
 * Set the listxattr cache, which is a special-cased cache entry.
 * The special value ERR_PTR(-ESTALE) is used to indicate that
 * the cache is being drained - this prevents a new listxattr
 * cache from being added to what is now a stale cache.
 *
 * Returns 1 if @new was installed, 0 if the cache was already marked
 * stale and nothing was changed.
 */
static int
nfs4_xattr_set_listcache(struct nfs4_xattr_cache *cache,
			 struct nfs4_xattr_entry *new)
{
	struct nfs4_xattr_entry *old;
	int ret = 1;

	spin_lock(&cache->listxattr_lock);

	old = cache->listxattr;

	if (old == ERR_PTR(-ESTALE)) {
		ret = 0;
		goto out;
	}

	cache->listxattr = new;
	/* Only a real entry goes on an LRU; NULL and the marker do not */
	if (new != NULL && new != ERR_PTR(-ESTALE))
		nfs4_xattr_entry_lru_add(new);

	/* Drop the reference that the cache held on the replaced entry */
	if (old != NULL) {
		nfs4_xattr_entry_lru_del(old);
		kref_put(&old->ref, nfs4_xattr_free_entry_cb);
	}
out:
	spin_unlock(&cache->listxattr_lock);

	return ret;
}

/*
 * Unlink a cache from its parent inode, clearing out an invalid
 * cache. Must be called with i_lock held.
 *
 * Returns the previously attached cache (possibly NULL). No reference
 * is dropped here.
 */
static struct nfs4_xattr_cache *
nfs4_xattr_cache_unlink(struct inode *inode)
{
	struct nfs_inode *nfsi;
	struct nfs4_xattr_cache *oldcache;

	nfsi = NFS_I(inode);

	oldcache = nfsi->xattr_cache;
	if (oldcache != NULL) {
		list_lru_del_obj(&nfs4_xattr_cache_lru, &oldcache->lru);
		oldcache->inode = NULL;
	}
	nfsi->xattr_cache = NULL;
	/* The invalid bit is cleared whether or not a cache was attached */
	nfsi->cache_validity &= ~NFS_INO_INVALID_XATTR;

	return oldcache;

}

/*
 * Discard a cache. Called by get_cache() if there was an old,
 * invalid cache. Can also be called from a shrinker callback.
 *
 * The cache is dead, it has already been unlinked from its inode,
 * and no longer appears on the cache LRU list.
 *
 * Mark all buckets as draining, so that no new entries are added.
This
 * could still happen in the unlikely, but possible case that another
 * thread had grabbed a reference before it was unlinked from the inode,
 * and is still holding it for an add operation.
 *
 * Remove all entries from the LRU lists, so that there is no longer
 * any way to 'find' this cache. Then, remove the entries from the hash
 * table.
 *
 * At that point, the cache will remain empty and can be freed when the final
 * reference drops, which is very likely the kref_put at the end of
 * this function, or the one called immediately afterwards in the
 * shrinker callback.
 */
static void
nfs4_xattr_discard_cache(struct nfs4_xattr_cache *cache)
{
	unsigned int i;
	struct nfs4_xattr_entry *entry;
	struct nfs4_xattr_bucket *bucket;
	struct hlist_node *n;

	/* Install the stale marker; set_listcache() releases any old entry */
	nfs4_xattr_set_listcache(cache, ERR_PTR(-ESTALE));

	for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) {
		bucket = &cache->buckets[i];

		spin_lock(&bucket->lock);
		/* Set under the bucket lock, where hash_add() tests it */
		bucket->draining = true;
		hlist_for_each_entry_safe(entry, n, &bucket->hlist, hnode) {
			nfs4_xattr_entry_lru_del(entry);
			hlist_del_init(&entry->hnode);
			/* One reference is dropped per unhashed entry */
			kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
		}
		spin_unlock(&bucket->lock);
	}

	atomic_long_set(&cache->nent, 0);

	kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
}

/*
 * Get a referenced copy of the cache structure. Avoid doing allocs
 * while holding i_lock. Which means that we do some optimistic allocation,
 * and might have to free the result in rare cases.
 *
 * This function only checks the NFS_INO_INVALID_XATTR cache validity bit
 * and acts accordingly, replacing the cache when needed. For the read case
 * (!add), this means that the caller must make sure that the cache
 * is valid before caling this function. getxattr and listxattr call
 * revalidate_inode to do this. The attribute cache timeout (for the
 * non-delegated case) is expected to be dealt with in the revalidate
 * call.
*/
static struct nfs4_xattr_cache *
nfs4_xattr_get_cache(struct inode *inode, int add)
{
	struct nfs_inode *nfsi;
	struct nfs4_xattr_cache *cache, *oldcache, *newcache;

	nfsi = NFS_I(inode);

	cache = oldcache = NULL;

	spin_lock(&inode->i_lock);

	/* An invalidated cache is detached here and discarded at "out" */
	if (nfsi->cache_validity & NFS_INO_INVALID_XATTR)
		oldcache = nfs4_xattr_cache_unlink(inode);
	else
		cache = nfsi->xattr_cache;

	/* This reference is the one handed back to the caller */
	if (cache != NULL)
		kref_get(&cache->ref);

	spin_unlock(&inode->i_lock);

	if (add && cache == NULL) {
		newcache = NULL;

		/* Allocation is done with i_lock dropped */
		cache = nfs4_xattr_alloc_cache();
		if (cache == NULL)
			goto out;

		spin_lock(&inode->i_lock);
		if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) {
			/*
			 * The cache was invalidated again. Give up,
			 * since what we want to enter is now likely
			 * outdated anyway.
			 */
			spin_unlock(&inode->i_lock);
			kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
			cache = NULL;
			goto out;
		}

		/*
		 * Check if someone beat us to it.
		 */
		if (nfsi->xattr_cache != NULL) {
			newcache = nfsi->xattr_cache;
			kref_get(&newcache->ref);
		} else {
			/*
			 * Second reference: the initial one from
			 * kref_init() stays with the cache once it is
			 * attached, this one is returned to the caller.
			 */
			kref_get(&cache->ref);
			nfsi->xattr_cache = cache;
			cache->inode = inode;
			list_lru_add_obj(&nfs4_xattr_cache_lru, &cache->lru);
		}

		spin_unlock(&inode->i_lock);

		/*
		 * If there was a race, throw away the cache we just
		 * allocated, and use the new one allocated by someone
		 * else.
		 */
		if (newcache != NULL) {
			kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
			cache = newcache;
		}
	}

out:
	/*
	 * Discard the now orphaned old cache.
	 */
	if (oldcache != NULL)
		nfs4_xattr_discard_cache(oldcache);

	return cache;
}

/*
 * Map @name to its bucket: jhash of the name (without the NUL), masked
 * with (number of buckets - 1).
 */
static inline struct nfs4_xattr_bucket *
nfs4_xattr_hash_bucket(struct nfs4_xattr_cache *cache, const char *name)
{
	return &cache->buckets[jhash(name, strlen(name), 0) &
	    (ARRAY_SIZE(cache->buckets) - 1)];
}

/*
 * Look up @name in @bucket by string comparison. No locking and no
 * refcounting is done here; the callers in this file hold bucket->lock.
 * NOTE(review): the not-found result relies on hlist_for_each_entry()
 * leaving the cursor NULL after a full walk - confirm against the
 * macro definition.
 */
static struct nfs4_xattr_entry *
nfs4_xattr_get_entry(struct nfs4_xattr_bucket *bucket, const char *name)
{
	struct nfs4_xattr_entry *entry;

	entry = NULL;

	hlist_for_each_entry(entry, &bucket->hlist, hnode) {
		if (!strcmp(entry->xattr_name, name))
			break;
	}

	return entry;
}

/*
 * Insert @entry into @cache, replacing any entry with the same name.
 *
 * Returns 1 if the entry was hashed, 0 if the bucket is draining and
 * nothing was done (the caller still owns its reference then).
 */
static int
nfs4_xattr_hash_add(struct nfs4_xattr_cache *cache,
		    struct nfs4_xattr_entry *entry)
{
	struct nfs4_xattr_bucket *bucket;
	struct nfs4_xattr_entry *oldentry = NULL;
	int ret = 1;

	bucket = nfs4_xattr_hash_bucket(cache, entry->xattr_name);
	entry->bucket = bucket;

	spin_lock(&bucket->lock);

	if (bucket->draining) {
		ret = 0;
		goto out;
	}

	oldentry = nfs4_xattr_get_entry(bucket, entry->xattr_name);
	if (oldentry != NULL) {
		/* Replacement: the entry count stays the same */
		hlist_del_init(&oldentry->hnode);
		nfs4_xattr_entry_lru_del(oldentry);
	} else {
		atomic_long_inc(&cache->nent);
	}

	hlist_add_head(&entry->hnode, &bucket->hlist);
	nfs4_xattr_entry_lru_add(entry);

out:
	spin_unlock(&bucket->lock);

	/* The replaced entry is released after the lock is dropped */
	if (oldentry != NULL)
		kref_put(&oldentry->ref, nfs4_xattr_free_entry_cb);

	return ret;
}

/*
 * Remove the entry called @name from @cache, if present, and drop one
 * reference on it after the bucket lock has been released.
 */
static void
nfs4_xattr_hash_remove(struct nfs4_xattr_cache *cache, const char *name)
{
	struct nfs4_xattr_bucket *bucket;
	struct nfs4_xattr_entry *entry;

	bucket = nfs4_xattr_hash_bucket(cache, name);

	spin_lock(&bucket->lock);

	entry = nfs4_xattr_get_entry(bucket, name);
	if (entry != NULL) {
		hlist_del_init(&entry->hnode);
		nfs4_xattr_entry_lru_del(entry);
		atomic_long_dec(&cache->nent);
	}

	spin_unlock(&bucket->lock);

	if (entry != NULL)
		kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
}

/*
 * Find the entry called @name in @cache. On a hit an extra reference
 * is taken under the bucket lock; the caller must drop it. Returns
 * NULL on a miss.
 */
static struct nfs4_xattr_entry *
nfs4_xattr_hash_find(struct nfs4_xattr_cache *cache, const char *name)
{
	struct nfs4_xattr_bucket *bucket;
	struct nfs4_xattr_entry *entry;

	bucket = nfs4_xattr_hash_bucket(cache, name);

	spin_lock(&bucket->lock);

	entry = nfs4_xattr_get_entry(bucket, name);
	if (entry != NULL)
		kref_get(&entry->ref);

	spin_unlock(&bucket->lock);

	return entry;
}

/*
 * Entry point to retrieve an entry from the cache.
 *
 * Returns the value length on a hit (the value is copied to @buf unless
 * @buflen is 0), -ERANGE if @buflen is non-zero but smaller than the
 * value, and -ENOENT if there is no cache or no entry for @name.
 */
ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, char *buf,
			 ssize_t buflen)
{
	struct nfs4_xattr_cache *cache;
	struct nfs4_xattr_entry *entry;
	ssize_t ret;

	cache = nfs4_xattr_get_cache(inode, 0);
	if (cache == NULL)
		return -ENOENT;

	ret = 0;
	entry = nfs4_xattr_hash_find(cache, name);

	if (entry != NULL) {
		dprintk("%s: cache hit '%s', len %lu\n", __func__,
		    entry->xattr_name, (unsigned long)entry->xattr_size);
		if (buflen == 0) {
			/* Length probe only */
			ret = entry->xattr_size;
		} else if (buflen < entry->xattr_size)
			ret = -ERANGE;
		else {
			memcpy(buf, entry->xattr_value, entry->xattr_size);
			ret = entry->xattr_size;
		}
		/* Drop the reference taken by nfs4_xattr_hash_find() */
		kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
	} else {
		dprintk("%s: cache miss '%s'\n", __func__, name);
		ret = -ENOENT;
	}

	kref_put(&cache->ref, nfs4_xattr_free_cache_cb);

	return ret;
}

/*
 * Retrieve a cached list of xattrs from the cache.
 *
 * Same return convention as nfs4_xattr_cache_get(). The listxattr entry
 * is read and copied entirely under listxattr_lock, without taking a
 * reference on it.
 */
ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, ssize_t buflen)
{
	struct nfs4_xattr_cache *cache;
	struct nfs4_xattr_entry *entry;
	ssize_t ret;

	cache = nfs4_xattr_get_cache(inode, 0);
	if (cache == NULL)
		return -ENOENT;

	spin_lock(&cache->listxattr_lock);

	entry = cache->listxattr;

	/* Both NULL and the stale marker count as "no list cached" */
	if (entry != NULL && entry != ERR_PTR(-ESTALE)) {
		if (buflen == 0) {
			/* Length probe only */
			ret = entry->xattr_size;
		} else if (entry->xattr_size > buflen)
			ret = -ERANGE;
		else {
			memcpy(buf, entry->xattr_value, entry->xattr_size);
			ret = entry->xattr_size;
		}
	} else {
		ret = -ENOENT;
	}

	spin_unlock(&cache->listxattr_lock);

	kref_put(&cache->ref, nfs4_xattr_free_cache_cb);

	return ret;
}

/*
 * Add an xattr to the cache.
 *
 * This also invalidates the xattr list cache.
*/ void nfs4_xattr_cache_add(struct inode *inode, const char *name, const char *buf, struct page **pages, ssize_t buflen) { struct nfs4_xattr_cache *cache; struct nfs4_xattr_entry *entry; dprintk("%s: add '%s' len %lu\n", __func__, name, (unsigned long)buflen); cache = nfs4_xattr_get_cache(inode, 1); if (cache == NULL) return; entry = nfs4_xattr_alloc_entry(name, buf, pages, buflen); if (entry == NULL) goto out; (void)nfs4_xattr_set_listcache(cache, NULL); if (!nfs4_xattr_hash_add(cache, entry)) kref_put(&entry->ref, nfs4_xattr_free_entry_cb); out: kref_put(&cache->ref, nfs4_xattr_free_cache_cb); } /* * Remove an xattr from the cache. * * This also invalidates the xattr list cache. */ void nfs4_xattr_cache_remove(struct inode *inode, const char *name) { struct nfs4_xattr_cache *cache; dprintk("%s: remove '%s'\n", __func__, name); cache = nfs4_xattr_get_cache(inode, 0); if (cache == NULL) return; (void)nfs4_xattr_set_listcache(cache, NULL); nfs4_xattr_hash_remove(cache, name); kref_put(&cache->ref, nfs4_xattr_free_cache_cb); } /* * Cache listxattr output, replacing any possible old one. */ void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf, ssize_t buflen) { struct nfs4_xattr_cache *cache; struct nfs4_xattr_entry *entry; cache = nfs4_xattr_get_cache(inode, 1); if (cache == NULL) return; entry = nfs4_xattr_alloc_entry(NULL, buf, NULL, buflen); if (entry == NULL) goto out; /* * This is just there to be able to get to bucket->cache, * which is obviously the same for all buckets, so just * use bucket 0. */ entry->bucket = &cache->buckets[0]; if (!nfs4_xattr_set_listcache(cache, entry)) kref_put(&entry->ref, nfs4_xattr_free_entry_cb); out: kref_put(&cache->ref, nfs4_xattr_free_cache_cb); } /* * Zap the entire cache. Called when an inode is evicted. 
*/
void nfs4_xattr_cache_zap(struct inode *inode)
{
	struct nfs4_xattr_cache *oldcache;

	spin_lock(&inode->i_lock);
	oldcache = nfs4_xattr_cache_unlink(inode);
	spin_unlock(&inode->i_lock);

	/* The discard runs after i_lock has been dropped */
	if (oldcache)
		nfs4_xattr_discard_cache(oldcache);
}

/*
 * The large-entry LRU is shrunk more aggressively than the other two
 * LRUs: its shrinker is registered with @seeks set to 1, while the
 * cache and small-entry shrinkers use DEFAULT_SEEKS.
 *
 * Cache structures are freed only when they've become empty, after
 * pruning all but one entry.
 */

static unsigned long nfs4_xattr_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc);
static unsigned long nfs4_xattr_entry_count(struct shrinker *shrink,
					    struct shrink_control *sc);
static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc);
static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink,
					   struct shrink_control *sc);

static struct shrinker *nfs4_xattr_cache_shrinker;
static struct shrinker *nfs4_xattr_entry_shrinker;
static struct shrinker *nfs4_xattr_large_entry_shrinker;

/*
 * list_lru walk callback for the cache LRU. @arg is the dispose list
 * head. A cache with more than one entry, or whose inode lock cannot
 * be taken immediately, is skipped. Otherwise the cache is detached
 * from its inode, an extra reference is taken, and it is moved to the
 * dispose list.
 */
static enum lru_status
cache_lru_isolate(struct list_head *item,
	struct list_lru_one *lru, void *arg)
{
	struct list_head *dispose = arg;
	struct inode *inode;
	struct nfs4_xattr_cache *cache = container_of(item,
	    struct nfs4_xattr_cache, lru);

	if (atomic_long_read(&cache->nent) > 1)
		return LRU_SKIP;

	/*
	 * If a cache structure is on the LRU list, we know that
	 * its inode is valid. Try to lock it to break the link.
	 * Since we're inverting the lock order here, only try.
	 */
	inode = cache->inode;

	if (!spin_trylock(&inode->i_lock))
		return LRU_SKIP;

	/* Dropped again by the extra kref_put() in nfs4_xattr_cache_scan() */
	kref_get(&cache->ref);

	cache->inode = NULL;
	NFS_I(inode)->xattr_cache = NULL;
	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_XATTR;
	list_lru_isolate(lru, &cache->lru);

	spin_unlock(&inode->i_lock);

	list_add_tail(&cache->dispose, dispose);
	return LRU_REMOVED;
}

/*
 * scan_objects callback of the cache shrinker: isolate caches onto a
 * private list, then discard each one and drop the reference taken by
 * cache_lru_isolate(). Returns what list_lru_shrink_walk() returned.
 */
static unsigned long
nfs4_xattr_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	LIST_HEAD(dispose);
	unsigned long freed;
	struct nfs4_xattr_cache *cache;

	freed = list_lru_shrink_walk(&nfs4_xattr_cache_lru, sc,
	    cache_lru_isolate, &dispose);
	while (!list_empty(&dispose)) {
		cache = list_first_entry(&dispose, struct nfs4_xattr_cache,
		    dispose);
		list_del_init(&cache->dispose);
		nfs4_xattr_discard_cache(cache);
		kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
	}

	return freed;
}


/* count_objects callback of the cache shrinker. */
static unsigned long
nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
	unsigned long count;

	count = list_lru_shrink_count(&nfs4_xattr_cache_lru, sc);
	return vfs_pressure_ratio(count);
}

/*
 * list_lru walk callback shared by both entry LRUs. @arg is the
 * dispose list head. Returns LRU_SKIP when the relevant lock cannot be
 * taken immediately, LRU_REMOVED once the entry has been unhooked.
 */
static enum lru_status
entry_lru_isolate(struct list_head *item,
	struct list_lru_one *lru, void *arg)
{
	struct list_head *dispose = arg;
	struct nfs4_xattr_bucket *bucket;
	struct nfs4_xattr_cache *cache;
	struct nfs4_xattr_entry *entry = container_of(item,
	    struct nfs4_xattr_entry, lru);

	bucket = entry->bucket;
	cache = bucket->cache;

	/*
	 * Unhook the entry from its parent (either a cache bucket
	 * or a cache structure if it's a listxattr buf), so that
	 * it's no longer found. Then add it to the isolate list,
	 * to be freed later.
	 *
	 * In both cases, we're reverting lock order, so use
	 * trylock and skip the entry if we can't get the lock.
	 */
	if (entry->xattr_name != NULL) {
		/* Regular cache entry */
		if (!spin_trylock(&bucket->lock))
			return LRU_SKIP;

		kref_get(&entry->ref);

		hlist_del_init(&entry->hnode);
		atomic_long_dec(&cache->nent);
		list_lru_isolate(lru, &entry->lru);

		spin_unlock(&bucket->lock);
	} else {
		/* Listxattr cache entry */
		if (!spin_trylock(&cache->listxattr_lock))
			return LRU_SKIP;

		kref_get(&entry->ref);

		cache->listxattr = NULL;
		list_lru_isolate(lru, &entry->lru);

		spin_unlock(&cache->listxattr_lock);
	}

	list_add_tail(&entry->dispose, dispose);
	return LRU_REMOVED;
}

/*
 * scan_objects callback shared by both entry shrinkers; the LRU to walk
 * is selected by comparing @shrink with the large-entry shrinker.
 * Returns what list_lru_shrink_walk() returned.
 */
static unsigned long
nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	LIST_HEAD(dispose);
	unsigned long freed;
	struct nfs4_xattr_entry *entry;
	struct list_lru *lru;

	lru = (shrink == nfs4_xattr_large_entry_shrinker) ?
	    &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;

	freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose);

	while (!list_empty(&dispose)) {
		entry = list_first_entry(&dispose, struct nfs4_xattr_entry,
		    dispose);
		list_del_init(&entry->dispose);

		/*
		 * Drop two references: the one that we just grabbed
		 * in entry_lru_isolate, and the one that was set
		 * when the entry was first allocated.
		 */
		kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
		kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
	}

	return freed;
}

/* count_objects callback shared by both entry shrinkers. */
static unsigned long
nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
{
	unsigned long count;
	struct list_lru *lru;

	lru = (shrink == nfs4_xattr_large_entry_shrinker) ?
	    &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;

	count = list_lru_shrink_count(lru, sc);
	return vfs_pressure_ratio(count);
}


/*
 * Slab constructor passed to kmem_cache_create(): sets up the locks,
 * buckets and list heads of a cache object.
 */
static void nfs4_xattr_cache_init_once(void *p)
{
	struct nfs4_xattr_cache *cache = p;

	spin_lock_init(&cache->listxattr_lock);
	atomic_long_set(&cache->nent, 0);
	nfs4_xattr_hash_init(cache);
	cache->listxattr = NULL;
	INIT_LIST_HEAD(&cache->lru);
	INIT_LIST_HEAD(&cache->dispose);
}

typedef unsigned long (*count_objects_cb)(struct shrinker *s,
					  struct shrink_control *sc);
typedef unsigned long (*scan_objects_cb)(struct shrinker *s,
					 struct shrink_control *sc);

/*
 * Allocate a memcg-aware shrinker, initialise @lru for it, fill in the
 * callbacks and tunables, and register it.
 *
 * Returns 0 on success, -ENOMEM if the shrinker cannot be allocated, or
 * the error from list_lru_init_memcg() (the shrinker is freed then,
 * but *shrinker is not reset).
 */
static int __init nfs4_xattr_shrinker_init(struct shrinker **shrinker,
					   struct list_lru *lru, const char *name,
					   count_objects_cb count,
					   scan_objects_cb scan, long batch, int seeks)
{
	int ret;

	*shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE, name);
	if (!*shrinker)
		return -ENOMEM;

	ret = list_lru_init_memcg(lru, *shrinker);
	if (ret) {
		shrinker_free(*shrinker);
		return ret;
	}

	(*shrinker)->count_objects = count;
	(*shrinker)->scan_objects = scan;
	(*shrinker)->batch = batch;
	(*shrinker)->seeks = seeks;

	shrinker_register(*shrinker);

	return ret;
}

/* Undo nfs4_xattr_shrinker_init(): free the shrinker, then destroy its LRU. */
static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker,
					struct list_lru *lru)
{
	shrinker_free(shrinker);
	list_lru_destroy(lru);
}

/*
 * Create the slab cache and the three shrinkers. On failure everything
 * set up so far is torn down in reverse order and the error is
 * returned.
 */
int __init nfs4_xattr_cache_init(void)
{
	int ret = 0;

	nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache",
	    sizeof(struct nfs4_xattr_cache), 0,
	    (SLAB_RECLAIM_ACCOUNT),
	    nfs4_xattr_cache_init_once);
	if (nfs4_xattr_cache_cachep == NULL)
		return -ENOMEM;

	ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker,
				       &nfs4_xattr_cache_lru, "nfs-xattr_cache",
				       nfs4_xattr_cache_count,
				       nfs4_xattr_cache_scan, 0, DEFAULT_SEEKS);
	if (ret)
		goto out1;

	ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker,
				       &nfs4_xattr_entry_lru, "nfs-xattr_entry",
				       nfs4_xattr_entry_count,
				       nfs4_xattr_entry_scan, 512, DEFAULT_SEEKS);
	if (ret)
		goto out2;

	ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker,
				       &nfs4_xattr_large_entry_lru,
				       "nfs-xattr_large_entry",
				       nfs4_xattr_entry_count,
				       nfs4_xattr_entry_scan, 512, 1);
	if (!ret)
		return 0;

	/* Third init failed: undo the second, then fall through */
	nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker,
				    &nfs4_xattr_entry_lru);
out2:
	nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker,
				    &nfs4_xattr_cache_lru);
out1:
	kmem_cache_destroy(nfs4_xattr_cache_cachep);

	return ret;
}

/* Tear down the three shrinkers and their LRUs, then the slab cache. */
void nfs4_xattr_cache_exit(void)
{
	nfs4_xattr_shrinker_destroy(nfs4_xattr_large_entry_shrinker,
				    &nfs4_xattr_large_entry_lru);
	nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker,
				    &nfs4_xattr_entry_lru);
	nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker,
				    &nfs4_xattr_cache_lru);
	kmem_cache_destroy(nfs4_xattr_cache_cachep);
}
5 5 5 5 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 // SPDX-License-Identifier: GPL-2.0-or-later /* * ChaCha20-Poly1305 AEAD, RFC7539 * * Copyright (C) 2015 Martin Willi */ #include 
<crypto/internal/aead.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <crypto/chacha.h>
#include <crypto/poly1305.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/string.h>

/* Per-instance data: the grabbed ChaCha skcipher spawn and the salt length. */
struct chachapoly_instance_ctx {
	struct crypto_skcipher_spawn chacha;
	unsigned int saltlen;
};

/* Per-transform data: the ChaCha skcipher handle and the salt from the key. */
struct chachapoly_ctx {
	struct crypto_skcipher *chacha;
	/* key bytes we use for the ChaCha20 IV */
	unsigned int saltlen;
	u8 salt[] __counted_by(saltlen);
};

/* Sub-request state for one ChaCha skcipher operation. */
struct chacha_req {
	u8 iv[CHACHA_IV_SIZE];
	struct scatterlist src[1];
	struct skcipher_request req; /* must be last member */
};

/* Per-request state carried through the (possibly asynchronous) call chain. */
struct chachapoly_req_ctx {
	struct scatterlist src[2];
	struct scatterlist dst[2];
	/* the key we generate for Poly1305 using Chacha20 */
	u8 key[POLY1305_KEY_SIZE];
	/* calculated Poly1305 tag */
	u8 tag[POLY1305_DIGEST_SIZE];
	/* length of data to en/decrypt, without ICV */
	unsigned int cryptlen;
	/* Actual AD, excluding IV */
	unsigned int assoclen;
	/* request flags, with MAY_SLEEP cleared if needed */
	u32 flags;
	union {
		struct chacha_req chacha;
	} u;
};

/*
 * Common tail of the skcipher completion callbacks.
 *
 * If the finished step succeeded, clear CRYPTO_TFM_REQ_MAY_SLEEP in the
 * saved request flags and run the next step @cont. The AEAD request is
 * completed with the resulting status unless that status is -EINPROGRESS
 * or -EBUSY.
 */
static inline void async_done_continue(struct aead_request *req, int err,
				       int (*cont)(struct aead_request *))
{
	if (!err) {
		struct chachapoly_req_ctx *rctx = aead_request_ctx(req);

		rctx->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
		err = cont(req);
	}

	if (err != -EINPROGRESS && err != -EBUSY)
		aead_request_complete(req, err);
}

/*
 * Build the CHACHA_IV_SIZE byte IV in @iv: the little-endian 32-bit
 * initial block counter @icb, followed by the salt stored in the
 * transform context, followed by bytes from req->iv filling the rest.
 */
static void chacha_iv(u8 *iv, struct aead_request *req, u32 icb)
{
	struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req));
	__le32 leicb = cpu_to_le32(icb);

	memcpy(iv, &leicb, sizeof(leicb));
	memcpy(iv + sizeof(leicb), ctx->salt, ctx->saltlen);
	memcpy(iv + sizeof(leicb) + ctx->saltlen, req->iv,
	       CHACHA_IV_SIZE - sizeof(leicb) - ctx->saltlen);
}

/*
 * Compare the tag stored in req->src (at offset assoclen + cryptlen) with
 * the computed tag in rctx->tag, using crypto_memneq().
 *
 * Returns 0 when they match and -EBADMSG otherwise.
 */
static int poly_verify_tag(struct aead_request *req)
{
	struct chachapoly_req_ctx *rctx = aead_request_ctx(req);
	u8 tag[sizeof(rctx->tag)];

	scatterwalk_map_and_copy(tag, req->src,
				 req->assoclen + rctx->cryptlen,
				 sizeof(tag), 0);
	if (crypto_memneq(tag, rctx->tag, sizeof(tag)))
		return -EBADMSG;
	return 0;
}

/* Completion callback for chacha_decrypt(): continue with tag verification. */
static void chacha_decrypt_done(void *data, int err)
{
	async_done_continue(data, err, poly_verify_tag);
}

/*
 * Decrypt rctx->cryptlen bytes located after the associated data using
 * the ChaCha skcipher with initial block counter 1, then verify the tag.
 * A zero-length payload skips the cipher call entirely.
 *
 * Returns the skcipher status if it is non-zero (including in-progress
 * codes), otherwise the result of poly_verify_tag().
 */
static int chacha_decrypt(struct aead_request *req)
{
	struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req));
	struct chachapoly_req_ctx *rctx = aead_request_ctx(req);
	struct chacha_req *creq = &rctx->u.chacha;
	struct scatterlist *src, *dst;
	int err;

	if (rctx->cryptlen == 0)
		goto skip;

	chacha_iv(creq->iv, req, 1);

	/* Fast-forward past the associated data; reuse src when in place. */
	src = scatterwalk_ffwd(rctx->src, req->src, req->assoclen);
	dst = src;
	if (req->src != req->dst)
		dst = scatterwalk_ffwd(rctx->dst, req->dst, req->assoclen);

	skcipher_request_set_callback(&creq->req, rctx->flags,
				      chacha_decrypt_done, req);
	skcipher_request_set_tfm(&creq->req, ctx->chacha);
	skcipher_request_set_crypt(&creq->req, src, dst,
				   rctx->cryptlen, creq->iv);
	err = crypto_skcipher_decrypt(&creq->req);
	if (err)
		return err;

skip:
	return poly_verify_tag(req);
}

/*
 * Compute the Poly1305 tag into rctx->tag.
 *
 * The hashed input is, in order: rctx->assoclen bytes of associated data,
 * zero padding up to a POLY1305_BLOCK_SIZE multiple, rctx->cryptlen bytes
 * of ciphertext, zero padding again, and a 16-byte tail holding both
 * lengths as little-endian 64-bit values.
 *
 * When encrypting (rctx->cryptlen == req->cryptlen) the data is read from
 * req->dst and the tag is written right behind the ciphertext. Otherwise
 * the data is read from req->src and processing continues with
 * chacha_decrypt().
 */
static int poly_hash(struct aead_request *req)
{
	struct chachapoly_req_ctx *rctx = aead_request_ctx(req);
	const void *zp = page_address(ZERO_PAGE(0));
	struct scatterlist *sg = req->src;
	struct poly1305_desc_ctx desc;
	struct scatter_walk walk;
	struct {
		union {
			struct {
				__le64 assoclen;
				__le64 cryptlen;
			};
			u8 u8[16];
		};
	} tail;
	unsigned int padlen;
	unsigned int total;

	/* Out-of-place operation: copy the associated data over to dst. */
	if (sg != req->dst)
		memcpy_sglist(req->dst, sg, req->assoclen);

	if (rctx->cryptlen == req->cryptlen) /* encrypting */
		sg = req->dst;

	poly1305_init(&desc, rctx->key);
	scatterwalk_start(&walk, sg);

	total = rctx->assoclen;
	while (total) {
		unsigned int n = scatterwalk_next(&walk, total);

		poly1305_update(&desc, walk.addr, n);
		scatterwalk_done_src(&walk, n);
		total -= n;
	}

	/* Unsigned negation modulo the block size gives the pad length. */
	padlen = -rctx->assoclen % POLY1305_BLOCK_SIZE;
	poly1305_update(&desc, zp, padlen);

	/* Skip the part of req->assoclen that is not hashed as AD. */
	scatterwalk_skip(&walk, req->assoclen - rctx->assoclen);

	total = rctx->cryptlen;
	while (total) {
		unsigned int n = scatterwalk_next(&walk, total);

		poly1305_update(&desc, walk.addr, n);
		scatterwalk_done_src(&walk, n);
		total -= n;
	}

	padlen = -rctx->cryptlen % POLY1305_BLOCK_SIZE;
	poly1305_update(&desc, zp, padlen);

	tail.assoclen = cpu_to_le64(rctx->assoclen);
	tail.cryptlen = cpu_to_le64(rctx->cryptlen);
	poly1305_update(&desc, tail.u8, sizeof(tail));
	memzero_explicit(&tail, sizeof(tail));
	poly1305_final(&desc, rctx->tag);

	/* Decrypt path: the tag is checked after the payload is decrypted. */
	if (rctx->cryptlen != req->cryptlen)
		return chacha_decrypt(req);

	/* Encrypt path: the walk now points just past the ciphertext. */
	memcpy_to_scatterwalk(&walk, rctx->tag, sizeof(rctx->tag));
	return 0;
}

/* Completion callback for poly_genkey(): continue with the hash step. */
static void poly_genkey_done(void *data, int err)
{
	async_done_continue(data, err, poly_hash);
}

/*
 * Derive the Poly1305 key and continue with poly_hash().
 *
 * rctx->key is zeroed and then run in place through the ChaCha skcipher
 * (via its decrypt entry point) with initial block counter 0; the result
 * is what poly_hash() passes to poly1305_init().
 *
 * For transforms with an 8-byte IV, 8 bytes are subtracted from the
 * associated data length; -EINVAL is returned if fewer than 8 bytes of
 * associated data were supplied.
 */
static int poly_genkey(struct aead_request *req)
{
	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
	struct chachapoly_ctx *ctx = crypto_aead_ctx(tfm);
	struct chachapoly_req_ctx *rctx = aead_request_ctx(req);
	struct chacha_req *creq = &rctx->u.chacha;
	int err;

	rctx->assoclen = req->assoclen;

	if (crypto_aead_ivsize(tfm) == 8) {
		if (rctx->assoclen < 8)
			return -EINVAL;
		rctx->assoclen -= 8;
	}

	memset(rctx->key, 0, sizeof(rctx->key));
	sg_init_one(creq->src, rctx->key, sizeof(rctx->key));

	chacha_iv(creq->iv, req, 0);

	skcipher_request_set_callback(&creq->req, rctx->flags,
				      poly_genkey_done, req);
	skcipher_request_set_tfm(&creq->req, ctx->chacha);
	skcipher_request_set_crypt(&creq->req, creq->src, creq->src,
				   POLY1305_KEY_SIZE, creq->iv);

	err = crypto_skcipher_decrypt(&creq->req);
	if (err)
		return err;

	return poly_hash(req);
}

/* Completion callback for chacha_encrypt(): continue with key generation. */
static void chacha_encrypt_done(void *data, int err)
{
	async_done_continue(data, err, poly_genkey);
}

/*
 * Encrypt req->cryptlen bytes located after the associated data using the
 * ChaCha skcipher with initial block counter 1, then continue with
 * poly_genkey(). A zero-length payload skips the cipher call entirely.
 */
static int chacha_encrypt(struct aead_request *req)
{
	struct chachapoly_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req));
	struct chachapoly_req_ctx *rctx = aead_request_ctx(req);
	struct chacha_req *creq = &rctx->u.chacha;
	struct scatterlist *src, *dst;
	int err;

	if (req->cryptlen == 0)
		goto skip;

	chacha_iv(creq->iv, req, 1);

	/* Fast-forward past the associated data; reuse src when in place. */
	src = scatterwalk_ffwd(rctx->src, req->src, req->assoclen);
	dst = src;
	if (req->src != req->dst)
		dst = scatterwalk_ffwd(rctx->dst, req->dst, req->assoclen);

	skcipher_request_set_callback(&creq->req, rctx->flags,
				      chacha_encrypt_done, req);
	skcipher_request_set_tfm(&creq->req, ctx->chacha);
	skcipher_request_set_crypt(&creq->req, src, dst,
				   req->cryptlen, creq->iv);
	err = crypto_skcipher_encrypt(&creq->req);
	if (err)
		return err;

skip:
	return poly_genkey(req);
}

/* AEAD encrypt entry point: record length and flags, then start the chain. */
static int chachapoly_encrypt(struct aead_request *req)
{
	struct chachapoly_req_ctx *rctx = aead_request_ctx(req);

	rctx->cryptlen = req->cryptlen;
	rctx->flags = aead_request_flags(req);

	/* encrypt call chain:
	 * - chacha_encrypt/done()
	 * - poly_genkey/done()
	 * - poly_hash()
	 */
	return chacha_encrypt(req);
}

/*
 * AEAD decrypt entry point: the payload length is req->cryptlen minus the
 * tag size. NOTE(review): no check here that req->cryptlen is at least
 * POLY1305_DIGEST_SIZE - presumably enforced by the caller; confirm.
 */
static int chachapoly_decrypt(struct aead_request *req)
{
	struct chachapoly_req_ctx *rctx = aead_request_ctx(req);

	rctx->cryptlen = req->cryptlen - POLY1305_DIGEST_SIZE;
	rctx->flags = aead_request_flags(req);

	/* decrypt call chain:
	 * - poly_genkey/done()
	 * - poly_hash()
	 * - chacha_decrypt/done()
	 * - poly_verify_tag()
	 */
	return poly_genkey(req);
}

/*
 * Set the key. @key must be exactly CHACHA_KEY_SIZE bytes followed by
 * ctx->saltlen bytes of salt, otherwise -EINVAL is returned. The salt is
 * stored in the transform context, the request flags of the AEAD are
 * propagated to the ChaCha skcipher, and the leading bytes are set as its
 * key.
 */
static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key,
			     unsigned int keylen)
{
	struct chachapoly_ctx *ctx = crypto_aead_ctx(aead);

	if (keylen != ctx->saltlen + CHACHA_KEY_SIZE)
		return -EINVAL;

	keylen -= ctx->saltlen;
	memcpy(ctx->salt, key + keylen, ctx->saltlen);

	crypto_skcipher_clear_flags(ctx->chacha, CRYPTO_TFM_REQ_MASK);
	crypto_skcipher_set_flags(ctx->chacha, crypto_aead_get_flags(aead) &
					       CRYPTO_TFM_REQ_MASK);
	return crypto_skcipher_setkey(ctx->chacha, key, keylen);
}

/* Only the full POLY1305_DIGEST_SIZE tag length is accepted. */
static int chachapoly_setauthsize(struct crypto_aead *tfm,
				  unsigned int authsize)
{
	if (authsize != POLY1305_DIGEST_SIZE)
		return -EINVAL;

	return 0;
}

/*
 * Transform init: instantiate the ChaCha skcipher from the spawn, copy
 * the salt length from the instance and size the per-request context so
 * that the nested skcipher request (plus its own request context) fits
 * behind struct chachapoly_req_ctx.
 */
static int chachapoly_init(struct crypto_aead *tfm)
{
	struct aead_instance *inst = aead_alg_instance(tfm);
	struct chachapoly_instance_ctx *ictx = aead_instance_ctx(inst);
	struct chachapoly_ctx *ctx = crypto_aead_ctx(tfm);
	struct crypto_skcipher *chacha;
	unsigned long align;

	chacha = crypto_spawn_skcipher(&ictx->chacha);
	if (IS_ERR(chacha))
		return PTR_ERR(chacha);

	ctx->chacha = chacha;
	ctx->saltlen = ictx->saltlen;

	align = crypto_aead_alignmask(tfm);
	align &= ~(crypto_tfm_ctx_alignment() - 1);
	crypto_aead_set_reqsize(
		tfm,
		align + offsetof(struct chachapoly_req_ctx, u) +
		offsetof(struct chacha_req, req) +
		sizeof(struct skcipher_request) +
		crypto_skcipher_reqsize(chacha));

	return 0;
}

/* Transform exit: release the ChaCha skcipher obtained in chachapoly_init(). */
static void chachapoly_exit(struct crypto_aead *tfm)
{
	struct chachapoly_ctx *ctx = crypto_aead_ctx(tfm);

	crypto_free_skcipher(ctx->chacha);
}

/* Instance destructor: drop the skcipher spawn and free the instance. */
static void chachapoly_free(struct aead_instance *inst)
{
	struct chachapoly_instance_ctx *ctx = aead_instance_ctx(inst);

	crypto_drop_skcipher(&ctx->chacha);
	kfree(inst);
}

/*
 * Create and register an AEAD instance for template @name with an IV of
 * @ivsize bytes; the salt length is CHACHAPOLY_IV_SIZE - @ivsize.
 *
 * tb[1] names the ChaCha skcipher, which must have a CHACHA_IV_SIZE IV and
 * a block size of 1. tb[2] must be "poly1305" or "poly1305-generic".
 *
 * Returns 0 on success or a negative error code; on any failure after the
 * allocation the instance is released through chachapoly_free().
 */
static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
			     const char *name, unsigned int ivsize)
{
	u32 mask;
	struct aead_instance *inst;
	struct chachapoly_instance_ctx *ctx;
	struct skcipher_alg_common *chacha;
	int err;

	if (ivsize > CHACHAPOLY_IV_SIZE)
		return -EINVAL;

	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask);
	if (err)
		return err;

	inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
	if (!inst)
		return -ENOMEM;
	ctx = aead_instance_ctx(inst);
	ctx->saltlen = CHACHAPOLY_IV_SIZE - ivsize;

	err = crypto_grab_skcipher(&ctx->chacha, aead_crypto_instance(inst),
				   crypto_attr_alg_name(tb[1]), 0, mask);
	if (err)
		goto err_free_inst;
	chacha = crypto_spawn_skcipher_alg_common(&ctx->chacha);

	/* All validation failures below report -EINVAL. */
	err = -EINVAL;
	if (strcmp(crypto_attr_alg_name(tb[2]), "poly1305") &&
	    strcmp(crypto_attr_alg_name(tb[2]), "poly1305-generic"))
		goto err_free_inst;
	/* Need 16-byte IV size, including Initial Block Counter value */
	if (chacha->ivsize != CHACHA_IV_SIZE)
		goto err_free_inst;
	/* Not a stream cipher? */
	if (chacha->base.cra_blocksize != 1)
		goto err_free_inst;

	/* snprintf() reporting truncation means the name does not fit. */
	err = -ENAMETOOLONG;
	if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
		     "%s(%s,poly1305)", name,
		     chacha->base.cra_name) >= CRYPTO_MAX_ALG_NAME)
		goto err_free_inst;
	if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
		     "%s(%s,poly1305-generic)", name,
		     chacha->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
		goto err_free_inst;

	inst->alg.base.cra_priority = chacha->base.cra_priority;
	inst->alg.base.cra_blocksize = 1;
	inst->alg.base.cra_alignmask = chacha->base.cra_alignmask;
	inst->alg.base.cra_ctxsize = sizeof(struct chachapoly_ctx) +
				     ctx->saltlen;
	inst->alg.ivsize = ivsize;
	inst->alg.chunksize = chacha->chunksize;
	inst->alg.maxauthsize = POLY1305_DIGEST_SIZE;
	inst->alg.init = chachapoly_init;
	inst->alg.exit = chachapoly_exit;
	inst->alg.encrypt = chachapoly_encrypt;
	inst->alg.decrypt = chachapoly_decrypt;
	inst->alg.setkey = chachapoly_setkey;
	inst->alg.setauthsize = chachapoly_setauthsize;

	inst->free = chachapoly_free;

	err = aead_register_instance(tmpl, inst);
	if (err) {
err_free_inst:
		chachapoly_free(inst);
	}
	return err;
}

/* "rfc7539" template: 12-byte IV. */
static int rfc7539_create(struct crypto_template *tmpl, struct rtattr **tb)
{
	return chachapoly_create(tmpl, tb, "rfc7539", 12);
}

/* "rfc7539esp" template: 8-byte IV. */
static int rfc7539esp_create(struct crypto_template *tmpl, struct rtattr **tb)
{
	return chachapoly_create(tmpl, tb, "rfc7539esp", 8);
}

/* Templates registered by this module. */
static struct crypto_template rfc7539_tmpls[] = {
	{
		.name = "rfc7539",
		.create = rfc7539_create,
		.module = THIS_MODULE,
	}, {
		.name = "rfc7539esp",
		.create = rfc7539esp_create,
		.module = THIS_MODULE,
	},
};

/* Module init: register both templates. */
static int __init chacha20poly1305_module_init(void)
{
	return crypto_register_templates(rfc7539_tmpls,
					 ARRAY_SIZE(rfc7539_tmpls));
}

/* Module exit: unregister both templates. */
static void __exit chacha20poly1305_module_exit(void)
{
	crypto_unregister_templates(rfc7539_tmpls,
				    ARRAY_SIZE(rfc7539_tmpls));
}

module_init(chacha20poly1305_module_init);
module_exit(chacha20poly1305_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("ChaCha20-Poly1305 AEAD");
MODULE_ALIAS_CRYPTO("rfc7539");
MODULE_ALIAS_CRYPTO("rfc7539esp");
3 3 2 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2021 NXP */ #include "common.h" #include "netlink.h" struct phc_vclocks_req_info { struct ethnl_req_info base; }; struct phc_vclocks_reply_data { struct ethnl_reply_data base; int num; int *index; }; #define PHC_VCLOCKS_REPDATA(__reply_base) \ container_of(__reply_base, struct phc_vclocks_reply_data, base) const struct nla_policy ethnl_phc_vclocks_get_policy[] = { [ETHTOOL_A_PHC_VCLOCKS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->num = ethtool_get_phc_vclocks(dev, &data->index); ethnl_ops_complete(dev); return ret; } static int phc_vclocks_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); int len = 0; if (data->num > 0) { len += nla_total_size(sizeof(u32)); len += nla_total_size(sizeof(s32) * data->num); } return len; } static int phc_vclocks_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); if (data->num <= 0) return 0; if (nla_put_u32(skb, ETHTOOL_A_PHC_VCLOCKS_NUM, data->num) || nla_put(skb, ETHTOOL_A_PHC_VCLOCKS_INDEX, sizeof(s32) * data->num, data->index)) return -EMSGSIZE; return 0; } static void phc_vclocks_cleanup_data(struct 
ethnl_reply_data *reply_base) { const struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); kfree(data->index); } const struct ethnl_request_ops ethnl_phc_vclocks_request_ops = { .request_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, .reply_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, .hdr_attr = ETHTOOL_A_PHC_VCLOCKS_HEADER, .req_info_size = sizeof(struct phc_vclocks_req_info), .reply_data_size = sizeof(struct phc_vclocks_reply_data), .prepare_data = phc_vclocks_prepare_data, .reply_size = phc_vclocks_reply_size, .fill_reply = phc_vclocks_fill_reply, .cleanup_data = phc_vclocks_cleanup_data, };
5472 5483 5483 5485 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 
1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 
1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 // SPDX-License-Identifier: GPL-2.0-only // Copyright (C) 2022 Linutronix GmbH, John Ogness // Copyright (C) 2022 Intel, Thomas Gleixner #include <linux/atomic.h> #include <linux/bug.h> #include <linux/console.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/init.h> #include <linux/irqflags.h> #include <linux/kdb.h> #include <linux/kthread.h> #include <linux/minmax.h> #include <linux/panic.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include "internal.h" #include "printk_ringbuffer.h" /* * Printk console printing implementation for consoles which does not depend * on the legacy style console_lock mechanism. * * The state of the console is maintained in the "nbcon_state" atomic * variable. * * The console is locked when: * * - The 'prio' field contains the priority of the context that owns the * console. Only higher priority contexts are allowed to take over the * lock. 
A value of 0 (NBCON_PRIO_NONE) means the console is not locked. * * - The 'cpu' field denotes on which CPU the console is locked. It is used * to prevent busy waiting on the same CPU. Also it informs the lock owner * that it has lost the lock in a more complex scenario when the lock was * taken over by a higher priority context, released, and taken on another * CPU with the same priority as the interrupted owner. * * The acquire mechanism uses a few more fields: * * - The 'req_prio' field is used by the handover approach to make the * current owner aware that there is a context with a higher priority * waiting for the friendly handover. * * - The 'unsafe' field allows taking over the console in a safe way in the * middle of emitting a message. The field is set only when accessing some * shared resources or when the console device is manipulated. It can be * cleared, for example, after emitting one character when the console * device is in a consistent state. * * - The 'unsafe_takeover' field is set when a hostile takeover took the * console in an unsafe state. The console will stay in the unsafe state * until re-initialized. * * The acquire mechanism uses three approaches: * * 1) Direct acquire when the console is not owned or is owned by a lower * priority context and is in a safe state. * * 2) Friendly handover mechanism uses a request/grant handshake. It is used * when the current owner has lower priority and the console is in an * unsafe state. * * The requesting context: * * a) Sets its priority into the 'req_prio' field. * * b) Waits (with a timeout) for the owning context to unlock the * console. * * c) Takes the lock and clears the 'req_prio' field. * * The owning context: * * a) Observes the 'req_prio' field set on exit from the unsafe * console state. * * b) Gives up console ownership by clearing the 'prio' field. * * 3) Unsafe hostile takeover allows taking over the lock even when the * console is in an unsafe state.
It is used only in panic() by the final * attempt to flush consoles in a try and hope mode. * * Note that separate record buffers are used in panic(). As a result, * the messages can be read and formatted without any risk even after * using the hostile takeover in unsafe state. * * The release function simply clears the 'prio' field. * * All operations on @console::nbcon_state are atomic cmpxchg based to * handle concurrency. * * The acquire/release functions implement only minimal policies: * * - Preference for higher priority contexts. * - Protection of the panic CPU. * * All other policy decisions must be made at the call sites: * * - What is marked as an unsafe section. * - Whether to spin-wait if there is already an owner and the console is * in an unsafe state. * - Whether to attempt an unsafe hostile takeover. * * The design allows to implement the well known: * * acquire() * output_one_printk_record() * release() * * The output of one printk record might be interrupted with a higher priority * context. The new owner is supposed to reprint the entire interrupted record * from scratch. */ /* Counter of active nbcon emergency contexts. */ static atomic_t nbcon_cpu_emergency_cnt = ATOMIC_INIT(0); /** * nbcon_state_set - Helper function to set the console state * @con: Console to update * @new: The new state to write * * Only to be used when the console is not yet or no longer visible in the * system. Otherwise use nbcon_state_try_cmpxchg(). 
*/ static inline void nbcon_state_set(struct console *con, struct nbcon_state *new) { atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom); } /** * nbcon_state_read - Helper function to read the console state * @con: Console to read * @state: The state to store the result */ static inline void nbcon_state_read(struct console *con, struct nbcon_state *state) { state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state)); } /** * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state * @con: Console to update * @cur: Old/expected state * @new: New state * * Return: True on success. False on fail and @cur is updated. */ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur, struct nbcon_state *new) { return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom); } /** * nbcon_seq_read - Read the current console sequence * @con: Console to read the sequence of * * Return: Sequence number of the next record to print on @con. */ u64 nbcon_seq_read(struct console *con) { unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq)); return __ulseq_to_u64seq(prb, nbcon_seq); } /** * nbcon_seq_force - Force console sequence to a specific value * @con: Console to work on * @seq: Sequence number value to set * * Only to be used during init (before registration) or in extreme situations * (such as panic with CONSOLE_REPLAY_ALL). */ void nbcon_seq_force(struct console *con, u64 seq) { /* * If the specified record no longer exists, the oldest available record * is chosen. This is especially important on 32bit systems because only * the lower 32 bits of the sequence number are stored. The upper 32 bits * are derived from the sequence numbers available in the ringbuffer. 
*/ u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); } /** * nbcon_seq_try_update - Try to update the console sequence number * @ctxt: Pointer to an acquire context that contains * all information about the acquire mode * @new_seq: The new sequence number to set * * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to * the 64bit value). This could be a different value than @new_seq if * nbcon_seq_force() was used or the current context no longer owns the * console. In the later case, it will stop printing anyway. */ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) { unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq); struct console *con = ctxt->console; if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq, __u64seq_to_ulseq(new_seq))) { ctxt->seq = new_seq; } else { ctxt->seq = nbcon_seq_read(con); } } /** * nbcon_context_try_acquire_direct - Try to acquire directly * @ctxt: The context of the caller * @cur: The current console state * @is_reacquire: This acquire is a reacquire * * Acquire the console when it is released. Also acquire the console when * the current owner has a lower priority and the console is in a safe state. * * Return: 0 on success. Otherwise, an error code on failure. Also @cur * is updated to the latest state when failed to modify it. * * Errors: * * -EPERM: A panic is in progress and this is neither the panic * CPU nor is this a reacquire. Or the current owner or * waiter has the same or higher priority. No acquire * method can be successful in these cases. * * -EBUSY: The current owner has a lower priority but the console * in an unsafe state. The caller should try using * the handover acquire method. 
*/
static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
					    struct nbcon_state *cur, bool is_reacquire)
{
	unsigned int cpu = smp_processor_id();
	struct console *con = ctxt->console;
	struct nbcon_state new;

	/*
	 * Every check is repeated on each iteration: a failed cmpxchg
	 * refreshes @cur, so the decision must be re-made on the new state.
	 */
	do {
		/*
		 * Panic does not imply that the console is owned. However,
		 * since all non-panic CPUs are stopped during panic(), it
		 * is safer to have them avoid gaining console ownership.
		 *
		 * One exception is when kdb has locked for printing on this CPU.
		 *
		 * Second exception is a reacquire (and an unsafe takeover
		 * has not previously occurred) then it is allowed to attempt
		 * a direct acquire in panic. This gives console drivers an
		 * opportunity to perform any necessary cleanup if they were
		 * interrupted by the panic CPU while printing.
		 */
		if (panic_on_other_cpu() &&
		    !kdb_printf_on_this_cpu() &&
		    (!is_reacquire || cur->unsafe_takeover)) {
			return -EPERM;
		}

		/* Only a strictly higher priority beats the owner and the waiter. */
		if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio)
			return -EPERM;

		/* Unsafe state: a direct acquire is not possible. */
		if (cur->unsafe)
			return -EBUSY;

		/*
		 * The console should never be safe for a direct acquire
		 * if an unsafe hostile takeover has ever happened.
		 */
		WARN_ON_ONCE(cur->unsafe_takeover);

		/* Start from a full copy so that untouched fields are preserved. */
		new.atom = cur->atom;
		new.prio	= ctxt->prio;
		new.req_prio	= NBCON_PRIO_NONE;
		new.unsafe	= cur->unsafe_takeover;
		new.cpu		= cpu;

	} while (!nbcon_state_try_cmpxchg(con, cur, &new));

	return 0;
}

/*
 * Check whether the handover request recorded in @cur is the one with
 * priority @expected_prio. Only @req_prio is compared; the comment in the
 * body explains why that is sufficient to identify the waiter.
 */
static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
{
	/*
	 * The request context is well defined by the @req_prio because:
	 *
	 * - Only a context with a priority higher than the owner can become
	 *   a waiter.
	 * - Only a context with a priority higher than the waiter can
	 *   directly take over the request.
	 * - There are only three priorities.
	 * - Only one CPU is allowed to request PANIC priority.
	 * - Lower priorities are ignored during panic() until reboot.
	 *
	 * As a result, the following scenario is *not* possible:
	 *
	 * 1. This context is currently a waiter.
	 * 2. Another context with a higher priority than this context
	 *    directly takes ownership.
	 * 3. The higher priority context releases the ownership.
	 * 4. Another lower priority context takes the ownership.
	 * 5. Another context with the same priority as this context
	 *    creates a request and starts waiting.
	 *
	 * Event #1 implies this context is EMERGENCY.
	 * Event #2 implies the new context is PANIC.
	 * Event #3 occurs when panic() has flushed the console.
	 * Event #4 occurs when a non-panic CPU reacquires.
	 * Event #5 is not possible due to the panic_on_other_cpu() check
	 *          in nbcon_context_try_acquire_handover().
	 */

	return (cur->req_prio == expected_prio);
}

/**
 * nbcon_context_try_acquire_requested - Try to acquire after having
 *					 requested a handover
 * @ctxt:	The context of the caller
 * @cur:	The current console state
 *
 * This is a helper function for nbcon_context_try_acquire_handover().
 * It is called when the console is in an unsafe state. The current
 * owner will release the console on exit from the unsafe region.
 *
 * Return:	0 on success and @cur is updated to the new console state.
 *		Otherwise an error code on failure.
 *
 * Errors:
 *
 *	-EPERM:		A panic is in progress and this is not the panic CPU
 *			or this context is no longer the waiter.
 *
 *	-EBUSY:		The console is still locked. The caller should
 *			continue waiting.
 *
 * Note: The caller must still remove the request when an error has occurred
 *       except when this context is no longer the waiter.
 */
static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
					       struct nbcon_state *cur)
{
	unsigned int cpu = smp_processor_id();
	struct console *con = ctxt->console;
	struct nbcon_state new;

	/* Note that the caller must still remove the request! */
	if (panic_on_other_cpu())
		return -EPERM;

	/*
	 * Note that the waiter will also change if there was an unsafe
	 * hostile takeover.
	 */
	if (!nbcon_waiter_matches(cur, ctxt->prio))
		return -EPERM;

	/* If still locked, caller should continue waiting. */
	if (cur->prio != NBCON_PRIO_NONE)
		return -EBUSY;

	/*
	 * The previous owner should have never released ownership
	 * in an unsafe region.
	 */
	WARN_ON_ONCE(cur->unsafe);

	/* Become the owner and clear the request in a single update. */
	new.atom = cur->atom;
	new.prio	= ctxt->prio;
	new.req_prio	= NBCON_PRIO_NONE;
	new.unsafe	= cur->unsafe_takeover;
	new.cpu		= cpu;

	if (!nbcon_state_try_cmpxchg(con, cur, &new)) {
		/*
		 * The acquire could fail only when it has been taken
		 * over by a higher priority context.
		 */
		WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio));
		return -EPERM;
	}

	/* Handover success. This context now owns the console. */
	return 0;
}

/**
 * nbcon_context_try_acquire_handover - Try to acquire via handover
 * @ctxt:	The context of the caller
 * @cur:	The current console state
 *
 * The function must be called only when the context has higher priority
 * than the current owner and the console is in an unsafe state.
 * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY.
 *
 * The function sets "req_prio" field to make the current owner aware of
 * the request. Then it waits until the current owner releases the console,
 * or an even higher context takes over the request, or timeout expires.
 *
 * The current owner checks the "req_prio" field on exit from the unsafe
 * region and releases the console. It does not touch the "req_prio" field
 * so that the console stays reserved for the waiter.
 *
 * Return:	0 on success. Otherwise, an error code on failure. Also @cur
 *		is updated to the latest state when failed to modify it.
 *
 * Errors:
 *
 *	-EPERM:		A panic is in progress and this is not the panic CPU.
 *			Or a higher priority context has taken over the
 *			console or the handover request.
 *
 *	-EBUSY:		The current owner is on the same CPU so that the hand
 *			shake could not work. Or the current owner is not
 *			willing to wait (zero timeout). Or the console does
 *			not enter the safe state before timeout passed. The
 *			caller might still use the unsafe hostile takeover
 *			when allowed.
*
 *	-EAGAIN:	@cur has changed when creating the handover request.
 *			The caller should retry with direct acquire.
 */
static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
					      struct nbcon_state *cur)
{
	unsigned int cpu = smp_processor_id();
	struct console *con = ctxt->console;
	struct nbcon_state new;
	int timeout;
	int request_err = -EBUSY;

	/*
	 * Check that the handover is called when the direct acquire failed
	 * with -EBUSY.
	 */
	WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
	WARN_ON_ONCE(!cur->unsafe);

	/*
	 * Panic does not imply that the console is owned. However, it
	 * is critical that non-panic CPUs during panic are unable to
	 * wait for a handover in order to satisfy the assumptions of
	 * nbcon_waiter_matches(). In particular, the assumption that
	 * lower priorities are ignored during panic.
	 */
	if (panic_on_other_cpu())
		return -EPERM;

	/* Handover is not possible on the same CPU. */
	if (cur->cpu == cpu)
		return -EBUSY;

	/*
	 * Console stays unsafe after an unsafe takeover until re-initialized.
	 * Waiting is not going to help in this case.
	 */
	if (cur->unsafe_takeover)
		return -EBUSY;

	/* Is the caller willing to wait? */
	if (ctxt->spinwait_max_us == 0)
		return -EBUSY;

	/*
	 * Setup a request for the handover. The caller should try to acquire
	 * the console directly when the current state has been modified.
	 */
	new.atom = cur->atom;
	new.req_prio = ctxt->prio;
	if (!nbcon_state_try_cmpxchg(con, cur, &new))
		return -EAGAIN;

	/* The request is installed: @cur now mirrors the console state. */
	cur->atom = new.atom;

	/*
	 * Wait until there is no owner and then acquire the console.
	 * Each iteration delays 1us via udelay(1), so @spinwait_max_us
	 * bounds the number of polls.
	 */
	for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) {
		/* On successful acquire, this request is cleared. */
		request_err = nbcon_context_try_acquire_requested(ctxt, cur);
		if (!request_err)
			return 0;

		/*
		 * If the acquire should be aborted, it must be ensured
		 * that the request is removed before returning to caller.
		 */
		if (request_err == -EPERM)
			break;

		udelay(1);

		/* Re-read the state because some time has passed. */
		nbcon_state_read(con, cur);
	}

	/* Timed out or aborted. Carefully remove handover request. */
	do {
		/*
		 * No need to remove request if there is a new waiter. This
		 * can only happen if a higher priority context has taken over
		 * the console or the handover request.
		 */
		if (!nbcon_waiter_matches(cur, ctxt->prio))
			return -EPERM;

		/* Unset request for handover. */
		new.atom = cur->atom;
		new.req_prio = NBCON_PRIO_NONE;
		if (nbcon_state_try_cmpxchg(con, cur, &new)) {
			/*
			 * Request successfully unset. Report failure of
			 * acquiring via handover.
			 */
			cur->atom = new.atom;
			return request_err;
		}

		/*
		 * Unable to remove request. Try to acquire in case
		 * the owner has released the lock.
		 */
	} while (nbcon_context_try_acquire_requested(ctxt, cur));

	/* Lucky timing. The acquire succeeded while removing the request. */
	return 0;
}

/**
 * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover
 * @ctxt:	The context of the caller
 * @cur:	The current console state
 *
 * Acquire the console even in the unsafe state.
 *
 * It can be permitted by setting the 'allow_unsafe_takeover' field only
 * by the final attempt to flush messages in panic().
 *
 * Return:	0 on success. -EPERM when not allowed by the context.
 */
static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
					     struct nbcon_state *cur)
{
	unsigned int cpu = smp_processor_id();
	struct console *con = ctxt->console;
	struct nbcon_state new;

	if (!ctxt->allow_unsafe_takeover)
		return -EPERM;

	/* Ensure caller is allowed to perform unsafe hostile takeovers. */
	if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC))
		return -EPERM;

	/*
	 * Check that try_acquire_direct() and try_acquire_handover() returned
	 * -EBUSY in the right situation.
	 */
	WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
	WARN_ON_ONCE(cur->unsafe != true);

	/*
	 * Retry until the update lands. There is no bail-out condition in
	 * the loop: a failed cmpxchg only refreshes @cur.
	 */
	do {
		new.atom = cur->atom;
		new.cpu			= cpu;
		new.prio		= ctxt->prio;
		/* Both bits are only ever set here, never cleared. */
		new.unsafe		|= cur->unsafe_takeover;
		new.unsafe_takeover	|= cur->unsafe;

	} while (!nbcon_state_try_cmpxchg(con, cur, &new));

	return 0;
}

/*
 * Buffers assigned to a context that acquires a console on the panic CPU,
 * see nbcon_context_try_acquire(). Other contexts use @con->pbufs.
 */
static struct printk_buffers panic_nbcon_pbufs;

/**
 * nbcon_context_try_acquire - Try to acquire nbcon console
 * @ctxt:		The context of the caller
 * @is_reacquire:	This acquire is a reacquire
 *
 * Context:	Under @ctxt->con->device_lock() or local_irq_save().
 * Return:	True if the console was acquired. False otherwise.
 *
 * The acquire methods are tried in this order: direct, handover, hostile.
 * A later method is only attempted when the previous one failed with
 * -EBUSY.
 *
 * If the caller allowed an unsafe hostile takeover, on success the
 * caller should check the current console state to see if it is
 * in an unsafe state. Otherwise, on success the caller may assume
 * the console is not in an unsafe state.
 */
static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire)
{
	struct console *con = ctxt->console;
	struct nbcon_state cur;
	int err;

	nbcon_state_read(con, &cur);
try_again:
	err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire);
	if (err != -EBUSY)
		goto out;

	err = nbcon_context_try_acquire_handover(ctxt, &cur);
	/* The state changed while creating the request: start over. */
	if (err == -EAGAIN)
		goto try_again;
	if (err != -EBUSY)
		goto out;

	err = nbcon_context_try_acquire_hostile(ctxt, &cur);
out:
	if (err)
		return false;

	/* Acquire succeeded. */

	/* Assign the appropriate buffer for this context. */
	if (panic_on_this_cpu())
		ctxt->pbufs = &panic_nbcon_pbufs;
	else
		ctxt->pbufs = con->pbufs;

	/* Set the record sequence for this context to print. */
	ctxt->seq = nbcon_seq_read(ctxt->console);

	return true;
}

/*
 * Check whether the owner recorded in @cur has both the priority
 * @expected_prio and the CPU @expected_cpu. See the comment in the body
 * for the external synchronization this check relies on.
 */
static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
				int expected_prio)
{
	/*
	 * A similar function, nbcon_waiter_matches(), only deals with
	 * EMERGENCY and PANIC priorities. However, this function must also
	 * deal with the NORMAL priority, which requires additional checks
	 * and constraints.
	 *
	 * For the case where preemption and interrupts are disabled, it is
	 * enough to also verify that the owning CPU has not changed.
	 *
	 * For the case where preemption or interrupts are enabled, an
	 * external synchronization method *must* be used. In particular,
	 * the driver-specific locking mechanism used in device_lock()
	 * (including disabling migration) should be used. It prevents
	 * scenarios such as:
	 *
	 * 1. [Task A] owns a context with NBCON_PRIO_NORMAL on [CPU X] and
	 *    is scheduled out.
	 * 2. Another context takes over the lock with NBCON_PRIO_EMERGENCY
	 *    and releases it.
	 * 3. [Task B] acquires a context with NBCON_PRIO_NORMAL on [CPU X]
	 *    and is scheduled out.
	 * 4. [Task A] gets running on [CPU X] and sees that the console is
	 *    still owned by a task on [CPU X] with NBCON_PRIO_NORMAL. Thus
	 *    [Task A] thinks it is the owner when it is not.
	 */

	if (cur->prio != expected_prio)
		return false;

	if (cur->cpu != expected_cpu)
		return false;

	return true;
}

/**
 * nbcon_context_release - Release the console
 * @ctxt:	The nbcon context from nbcon_context_try_acquire()
 *
 * The console state is only modified while this context is still recorded
 * as the owner. @ctxt->pbufs is cleared in every case.
 */
static void nbcon_context_release(struct nbcon_context *ctxt)
{
	unsigned int cpu = smp_processor_id();
	struct console *con = ctxt->console;
	struct nbcon_state cur;
	struct nbcon_state new;

	nbcon_state_read(con, &cur);

	do {
		/* Not the owner (anymore): leave the console state alone. */
		if (!nbcon_owner_matches(&cur, cpu, ctxt->prio))
			break;

		new.atom = cur.atom;
		new.prio = NBCON_PRIO_NONE;

		/*
		 * If @unsafe_takeover is set, it is kept set so that
		 * the state remains permanently unsafe.
		 */
		new.unsafe |= cur.unsafe_takeover;

	} while (!nbcon_state_try_cmpxchg(con, &cur, &new));

	ctxt->pbufs = NULL;
}

/**
 * nbcon_context_can_proceed - Check whether ownership can proceed
 * @ctxt:	The nbcon context from nbcon_context_try_acquire()
 * @cur:	The current console state
 *
 * Return:	True if this context still owns the console. False if
 *		ownership was handed over or taken.
 *
 * Must be invoked when entering the unsafe state to make sure that it still
 * owns the lock.
Also must be invoked when exiting the unsafe context
 * to eventually free the lock for a higher priority context which asked
 * for the friendly handover.
 *
 * It can be called inside an unsafe section when the console is just
 * temporary in safe state instead of exiting and entering the unsafe
 * state.
 *
 * Also it can be called in the safe context before doing an expensive
 * safe operation. It does not make sense to do the operation when
 * a higher priority context took the lock.
 *
 * When this function returns false then the calling context no longer owns
 * the console and is no longer allowed to go forward. In this case it must
 * back out immediately and carefully. The buffer content is also no longer
 * trusted since it no longer belongs to the calling context.
 */
static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur)
{
	unsigned int cpu = smp_processor_id();

	/* Make sure this context still owns the console. */
	if (!nbcon_owner_matches(cur, cpu, ctxt->prio))
		return false;

	/* The console owner can proceed if there is no waiter. */
	if (cur->req_prio == NBCON_PRIO_NONE)
		return true;

	/*
	 * A console owner within an unsafe region is always allowed to
	 * proceed, even if there are waiters. It can perform a handover
	 * when exiting the unsafe region. Otherwise the waiter will
	 * need to perform an unsafe hostile takeover.
	 */
	if (cur->unsafe)
		return true;

	/* Waiters always have higher priorities than owners. */
	WARN_ON_ONCE(cur->req_prio <= cur->prio);

	/*
	 * Having a safe point for take over and eventually a few
	 * duplicated characters or a full line is way better than a
	 * hostile takeover. Post processing can take care of the garbage.
	 * Release and hand over.
	 *
	 * Note that this is a side effect of the check: reaching this point
	 * gives up ownership on behalf of the caller.
	 */
	nbcon_context_release(ctxt);

	/*
	 * It is not clear whether the waiter really took over ownership. The
	 * outermost callsite must make the final decision whether console
	 * ownership is needed for it to proceed. If yes, it must reacquire
	 * ownership (possibly hostile) before carefully proceeding.
	 *
	 * The calling context no longer owns the console so go back all the
	 * way instead of trying to implement reacquire heuristics in tons of
	 * places.
	 */
	return false;
}

/**
 * nbcon_can_proceed - Check whether ownership can proceed
 * @wctxt:	The write context that was handed to the write function
 *
 * Return:	True if this context still owns the console. False if
 *		ownership was handed over or taken.
 *
 * It is used in nbcon_enter_unsafe() to make sure that it still owns the
 * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock
 * for a higher priority context which asked for the friendly handover.
 *
 * It can be called inside an unsafe section when the console is just
 * temporary in safe state instead of exiting and entering the unsafe state.
 *
 * Also it can be called in the safe context before doing an expensive safe
 * operation. It does not make sense to do the operation when a higher
 * priority context took the lock.
 *
 * When this function returns false then the calling context no longer owns
 * the console and is no longer allowed to go forward. In this case it must
 * back out immediately and carefully. The buffer content is also no longer
 * trusted since it no longer belongs to the calling context.
*/
bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
	struct console *con = ctxt->console;
	struct nbcon_state cur;

	/* Decide on a fresh snapshot of the console state. */
	nbcon_state_read(con, &cur);

	return nbcon_context_can_proceed(ctxt, &cur);
}
EXPORT_SYMBOL_GPL(nbcon_can_proceed);

/* Set (enter) or clear (exit) the unsafe bit; both return the ownership status. */
#define nbcon_context_enter_unsafe(c)	__nbcon_context_update_unsafe(c, true)
#define nbcon_context_exit_unsafe(c)	__nbcon_context_update_unsafe(c, false)

/**
 * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state
 * @ctxt:	The nbcon context from nbcon_context_try_acquire()
 * @unsafe:	The new value for the unsafe bit
 *
 * Return:	True if the unsafe state was updated and this context still
 *		owns the console. Otherwise false if ownership was handed
 *		over or taken.
 *
 * This function allows console owners to modify the unsafe status of the
 * console.
 *
 * When this function returns false then the calling context no longer owns
 * the console and is no longer allowed to go forward. In this case it must
 * back out immediately and carefully. The buffer content is also no longer
 * trusted since it no longer belongs to the calling context.
 *
 * Internal helper to avoid duplicated code.
 */
static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe)
{
	struct console *con = ctxt->console;
	struct nbcon_state cur;
	struct nbcon_state new;

	nbcon_state_read(con, &cur);

	do {
		/*
		 * The unsafe bit must not be cleared if an
		 * unsafe hostile takeover has occurred.
		 */
		if (!unsafe && cur.unsafe_takeover)
			goto out;

		/* Ownership is re-checked against every refreshed state. */
		if (!nbcon_context_can_proceed(ctxt, &cur))
			return false;

		new.atom = cur.atom;
		new.unsafe = unsafe;
	} while (!nbcon_state_try_cmpxchg(con, &cur, &new));

	/* The update landed: continue with the state that was written. */
	cur.atom = new.atom;
out:
	/*
	 * Final check on the resulting state. When the unsafe bit was just
	 * cleared and a waiter is pending, this is where the console gets
	 * released for the handover (see nbcon_context_can_proceed()).
	 */
	return nbcon_context_can_proceed(ctxt, &cur);
}

/*
 * nbcon_write_context_set_buf - Set the output buffer of a write context
 * @wctxt:	The write context to update
 * @buf:	The new output buffer (NULL is used by callers in this file
 *		to mark that the buffer has been lost)
 * @len:	The length of @buf
 *
 * Besides storing @buf and @len, the current @unsafe_takeover bit of the
 * console state is copied into @wctxt->unsafe_takeover.
 */
void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
				 char *buf, unsigned int len)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
	struct console *con = ctxt->console;
	struct nbcon_state cur;

	wctxt->outbuf = buf;
	wctxt->len = len;
	nbcon_state_read(con, &cur);
	wctxt->unsafe_takeover = cur.unsafe_takeover;
}

/**
 * nbcon_enter_unsafe - Enter an unsafe region in the driver
 * @wctxt:	The write context that was handed to the write function
 *
 * Return:	True if this context still owns the console. False if
 *		ownership was handed over or taken.
 *
 * When this function returns false then the calling context no longer owns
 * the console and is no longer allowed to go forward. In this case it must
 * back out immediately and carefully. The buffer content is also no longer
 * trusted since it no longer belongs to the calling context.
 */
bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
	bool is_owner;

	is_owner = nbcon_context_enter_unsafe(ctxt);
	/* Ownership lost: the output buffer must not be used anymore. */
	if (!is_owner)
		nbcon_write_context_set_buf(wctxt, NULL, 0);
	return is_owner;
}
EXPORT_SYMBOL_GPL(nbcon_enter_unsafe);

/**
 * nbcon_exit_unsafe - Exit an unsafe region in the driver
 * @wctxt:	The write context that was handed to the write function
 *
 * Return:	True if this context still owns the console. False if
 *		ownership was handed over or taken.
 *
 * When this function returns false then the calling context no longer owns
 * the console and is no longer allowed to go forward. In this case it must
 * back out immediately and carefully. The buffer content is also no longer
 * trusted since it no longer belongs to the calling context.
*/ bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); bool ret; ret = nbcon_context_exit_unsafe(ctxt); if (!ret) nbcon_write_context_set_buf(wctxt, NULL, 0); return ret; } EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); /** * nbcon_reacquire_nobuf - Reacquire a console after losing ownership * while printing * @wctxt: The write context that was handed to the write callback * * Since ownership can be lost at any time due to handover or takeover, a * printing context _must_ be prepared to back out immediately and * carefully. However, there are scenarios where the printing context must * reacquire ownership in order to finalize or revert hardware changes. * * This function allows a printing context to reacquire ownership using the * same priority as its previous ownership. * * Note that after a successful reacquire the printing context will have no * output buffer because that has been lost. This function cannot be used to * resume printing. */ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); while (!nbcon_context_try_acquire(ctxt, true)) cpu_relax(); nbcon_write_context_set_buf(wctxt, NULL, 0); } EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf); #ifdef CONFIG_PRINTK_EXECUTION_CTX static void wctxt_load_execution_ctx(struct nbcon_write_context *wctxt, struct printk_message *pmsg) { wctxt->cpu = pmsg->cpu; wctxt->pid = pmsg->pid; memcpy(wctxt->comm, pmsg->comm, sizeof(wctxt->comm)); static_assert(sizeof(wctxt->comm) == sizeof(pmsg->comm)); } #else static void wctxt_load_execution_ctx(struct nbcon_write_context *wctxt, struct printk_message *pmsg) {} #endif /** * nbcon_emit_next_record - Emit a record in the acquired context * @wctxt: The write context that will be handed to the write function * @use_atomic: True if the write_atomic() callback is to be used * * Return: True if this context still owns the console. 
False if
 *		ownership was handed over or taken.
 *
 * When this function returns false then the calling context no longer owns
 * the console and is no longer allowed to go forward. In this case it must
 * back out immediately and carefully. The buffer content is also no longer
 * trusted since it no longer belongs to the calling context. If the caller
 * wants to do more it must reacquire the console first.
 *
 * When true is returned, @wctxt->ctxt.backlog indicates whether there are
 * still records pending in the ringbuffer,
 */
static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
	struct console *con = ctxt->console;
	bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
	struct printk_message pmsg = {
		.pbufs = ctxt->pbufs,
	};
	unsigned long con_dropped;
	struct nbcon_state cur;
	unsigned long dropped;
	unsigned long ulseq;

	/*
	 * This function should never be called for consoles that have not
	 * implemented the necessary callback for writing: i.e. legacy
	 * consoles and, when atomic, nbcon consoles with no write_atomic().
	 * Handle it as if ownership was lost and try to continue.
	 *
	 * Note that for nbcon consoles the write_thread() callback is
	 * mandatory and was already checked in nbcon_alloc().
	 */
	if (WARN_ON_ONCE((use_atomic && !con->write_atomic) ||
			 !(console_srcu_read_flags(con) & CON_NBCON))) {
		nbcon_context_release(ctxt);
		return false;
	}

	/*
	 * The printk buffers are filled within an unsafe section. This
	 * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from
	 * clobbering each other.
	 */

	if (!nbcon_context_enter_unsafe(ctxt))
		return false;

	/* No record available: leave the unsafe section and report ownership. */
	ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true);
	if (!ctxt->backlog)
		return nbcon_context_exit_unsafe(ctxt);

	/*
	 * @con->dropped is not protected in case of an unsafe hostile
	 * takeover. In that situation the update can be racy so
	 * annotate it accordingly.
	 */
	con_dropped = data_race(READ_ONCE(con->dropped));

	/* Total = already pending on the console + reported for this record. */
	dropped = con_dropped + pmsg.dropped;
	if (dropped && !is_extended)
		console_prepend_dropped(&pmsg, dropped);

	/*
	 * If the previous owner was assigned the same record, this context
	 * has taken over ownership and is replaying the record. Prepend a
	 * message to let the user know the record is replayed.
	 */
	ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq));
	if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) {
		console_prepend_replay(&pmsg);
	} else {
		/*
		 * Ensure this context is still the owner before trying to
		 * update @nbcon_prev_seq. Otherwise the value in @ulseq may
		 * not be from the previous owner and instead be some later
		 * value from the context that took over ownership.
		 */
		nbcon_state_read(con, &cur);
		if (!nbcon_context_can_proceed(ctxt, &cur))
			return false;

		/* The result is deliberately ignored: a failed update is tolerated. */
		atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq,
					__u64seq_to_ulseq(pmsg.seq));
	}

	if (!nbcon_context_exit_unsafe(ctxt))
		return false;

	/* For skipped records just update seq/dropped in @con. */
	if (pmsg.outbuf_len == 0)
		goto update_con;

	/* Initialize the write context for driver callbacks. */
	nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len);

	wctxt_load_execution_ctx(wctxt, &pmsg);

	if (use_atomic)
		con->write_atomic(con, wctxt);
	else
		con->write_thread(con, wctxt);

	/* A NULL buffer is how the lost-ownership paths mark @wctxt. */
	if (!wctxt->outbuf) {
		/*
		 * Ownership was lost and reacquired by the driver. Handle it
		 * as if ownership was lost.
		 */
		nbcon_context_release(ctxt);
		return false;
	}

	/*
	 * Ownership may have been lost but _not_ reacquired by the driver.
	 * This case is detected and handled when entering unsafe to update
	 * dropped/seq values.
	 */

	/*
	 * Since any dropped message was successfully output, reset the
	 * dropped count for the console.
	 */
	dropped = 0;
update_con:
	/*
	 * The dropped count and the sequence number are updated within an
	 * unsafe section. This limits update races to the panic context and
	 * allows the panic context to win.
	 */

	if (!nbcon_context_enter_unsafe(ctxt))
		return false;

	if (dropped != con_dropped) {
		/* Counterpart to the READ_ONCE() above. */
		WRITE_ONCE(con->dropped, dropped);
	}

	/* Advance the console to the record after the one just handled. */
	nbcon_seq_try_update(ctxt, pmsg.seq + 1);

	return nbcon_context_exit_unsafe(ctxt);
}

/**
 * nbcon_emit_one - Print one record for an nbcon console using the
 *			specified callback
 * @wctxt:	An initialized write context struct to use for this context
 * @use_atomic:	True if the write_atomic() callback is to be used
 *
 * Return:	True, when a record has been printed and there are still
 *		pending records. The caller might want to continue flushing.
 *
 *		False, when there is no pending record, or when the console
 *		context cannot be acquired, or the ownership has been lost.
 *		The caller should give up. Either the job is done, cannot be
 *		done, or will be handled by the owning context.
 *
 * This is an internal helper to handle the locking of the console before
 * calling nbcon_emit_next_record().
 */
static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
	struct console *con = ctxt->console;
	unsigned long flags;
	bool ret = false;

	/* The device lock is only taken for the non-atomic (thread) path. */
	if (!use_atomic) {
		con->device_lock(con, &flags);

		/*
		 * Ensure this stays on the CPU to make handover and
		 * takeover possible.
		 */
		cant_migrate();
	}

	if (!nbcon_context_try_acquire(ctxt, false))
		goto out;

	/*
	 * nbcon_emit_next_record() returns false when the console was
	 * handed over or taken over. In both cases the context is no
	 * longer valid.
	 *
	 * The higher priority printing context takes over responsibility
	 * to print the pending records.
	 */
	if (!nbcon_emit_next_record(wctxt, use_atomic))
		goto out;

	nbcon_context_release(ctxt);

	ret = ctxt->backlog;
out:
	if (!use_atomic)
		con->device_unlock(con, flags);
	return ret;
}

/**
 * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup
 * @con:	Console to operate on
 * @ctxt:	The nbcon context from nbcon_context_try_acquire()
 *
 * Return:	True if the thread should shutdown or if the console is
 *		allowed to print and a record is available. False otherwise.
 *
 * After the thread wakes up, it must first check if it should shutdown before
 * attempting any printing.
 */
static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt)
{
	bool ret = false;
	short flags;
	int cookie;

	if (kthread_should_stop())
		return true;

	/*
	 * Block the kthread when the system is in an emergency or panic mode.
	 * It increases the chance that these contexts would be able to show
	 * the messages directly. And it reduces the risk of interrupted writes
	 * where the context with a higher priority takes over the nbcon console
	 * ownership in the middle of a message.
	 */
	if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) ||
	    unlikely(panic_in_progress()))
		return false;

	cookie = console_srcu_read_lock();

	flags = console_srcu_read_flags(con);
	if (console_is_usable(con, flags, false)) {
		/* Bring the sequence in @ctxt up to date */
		ctxt->seq = nbcon_seq_read(con);

		/* Wake up only when a record with that sequence is readable. */
		ret = prb_read_valid(prb, ctxt->seq, NULL);
	}

	console_srcu_read_unlock(cookie);
	return ret;
}

/**
 * nbcon_kthread_func - The printer thread function
 * @__console: Console to operate on
 *
 * Sleeps until nbcon_kthread_should_wakeup() reports work, then emits one
 * record at a time with NBCON_PRIO_NORMAL for as long as a backlog is
 * reported.
 *
 * Return: 0
 */
static int nbcon_kthread_func(void *__console)
{
	struct console *con = __console;
	struct nbcon_write_context wctxt = {
		.ctxt.console	= con,
		.ctxt.prio	= NBCON_PRIO_NORMAL,
	};
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
	short con_flags;
	bool backlog;
	int cookie;

wait_for_event:
	/*
	 * Guarantee this task is visible on the rcuwait before
	 * checking the wake condition.
	 *
	 * The full memory barrier within set_current_state() of
	 * ___rcuwait_wait_event() pairs with the full memory
	 * barrier within rcuwait_has_sleeper().
	 *
	 * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A.
	 */
	rcuwait_wait_event(&con->rcuwait,
			   nbcon_kthread_should_wakeup(con, ctxt),
			   TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */

	do {
		/* The only exit from the thread function. */
		if (kthread_should_stop())
			return 0;

		/*
		 * Block the kthread when the system is in an emergency or panic
		 * mode. See nbcon_kthread_should_wakeup() for more details.
		 */
		if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) ||
		    unlikely(panic_in_progress()))
			goto wait_for_event;

		backlog = false;

		/*
		 * Keep the srcu read lock around the entire operation so that
		 * synchronize_srcu() can guarantee that the kthread stopped
		 * or suspended printing.
		 */
		cookie = console_srcu_read_lock();

		con_flags = console_srcu_read_flags(con);

		if (console_is_usable(con, con_flags, false))
			backlog = nbcon_emit_one(&wctxt, false);

		console_srcu_read_unlock(cookie);

		/* Voluntary reschedule point between two records. */
		cond_resched();

	} while (backlog);

	goto wait_for_event;
}

/**
 * nbcon_irq_work - irq work to wake console printer thread
 * @irq_work:	The irq work to operate on
 */
static void nbcon_irq_work(struct irq_work *irq_work)
{
	/* The irq_work is embedded in the console it belongs to. */
	struct console *con = container_of(irq_work, struct console, irq_work);

	nbcon_kthread_wake(con);
}

/*
 * Check whether a task is waiting on @w. A full memory barrier is issued
 * before the check; see the comment in the body for its pairing.
 */
static inline bool rcuwait_has_sleeper(struct rcuwait *w)
{
	/*
	 * Guarantee any new records can be seen by tasks preparing to wait
	 * before this context checks if the rcuwait is empty.
	 *
	 * This full memory barrier pairs with the full memory barrier within
	 * set_current_state() of ___rcuwait_wait_event(), which is called
	 * after prepare_to_rcuwait() adds the waiter but before it has
	 * checked the wait condition.
	 *
	 * This pairs with nbcon_kthread_func:A.
	 */
	smp_mb(); /* LMM(rcuwait_has_sleeper:A) */
	return rcuwait_active(w);
}

/**
 * nbcon_kthreads_wake - Wake up printing threads using irq_work
 *
 * Does nothing when @printk_kthreads_running is not set. Only consoles
 * with the CON_NBCON flag are considered.
 */
void nbcon_kthreads_wake(void)
{
	struct console *con;
	int cookie;

	if (!printk_kthreads_running)
		return;

	/*
	 * It is not allowed to call this function when console irq_work
	 * is blocked.
	 */
	if (WARN_ON_ONCE(console_irqwork_blocked))
		return;

	cookie = console_srcu_read_lock();
	for_each_console_srcu(con) {
		if (!(console_srcu_read_flags(con) & CON_NBCON))
			continue;

		/*
		 * Only schedule irq_work if the printing thread is
		 * actively waiting. If not waiting, the thread will
		 * notice by itself that it has work to do.
		 */
		if (rcuwait_has_sleeper(&con->rcuwait))
			irq_work_queue(&con->irq_work);
	}
	console_srcu_read_unlock(cookie);
}

/**
 * nbcon_kthread_stop - Stop a console printer thread
 * @con:	Console to operate on
 *
 * Must be called with the console list lock held. Does nothing when @con
 * has no printer thread.
 */
void nbcon_kthread_stop(struct console *con)
{
	lockdep_assert_console_list_lock_held();

	if (!con->kthread)
		return;

	kthread_stop(con->kthread);
	/* Cleared so that a later nbcon_kthread_create() starts a new thread. */
	con->kthread = NULL;
}

/**
 * nbcon_kthread_create - Create a console printer thread
 * @con:	Console to operate on
 *
 * Return:	True if the kthread was started or already exists.
 *		Otherwise false and @con must not be registered.
 *
 * This function is called when it will be expected that nbcon consoles are
 * flushed using the kthread. The messages printed with NBCON_PRIO_NORMAL
 * will be no longer flushed by the legacy loop. This is why failure must
 * be fatal for console registration.
 *
 * If @con was already registered and this function fails, @con must be
 * unregistered before the global state variable @printk_kthreads_running
 * can be set.
*/
bool nbcon_kthread_create(struct console *con)
{
	struct task_struct *kt;

	lockdep_assert_console_list_lock_held();

	/* An already existing thread counts as success. */
	if (con->kthread)
		return true;

	/* The thread is named "pr/<console name><console index>". */
	kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index);
	if (WARN_ON(IS_ERR(kt))) {
		con_printk(KERN_ERR, con, "failed to start printing thread\n");
		return false;
	}

	con->kthread = kt;

	/*
	 * It is important that console printing threads are scheduled
	 * shortly after a printk call and with generous runtime budgets.
	 */
	sched_set_normal(con->kthread, -20);

	return true;
}

/* Track the nbcon emergency nesting per CPU. */
static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting);
/* Counter used while printk per-CPU data is not ready yet. */
static unsigned int early_nbcon_pcpu_emergency_nesting __initdata;

/**
 * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer
 *
 * Context:	For reading, any context. For writing, any context which could
 *		not be migrated to another CPU.
 * Return:	Either a pointer to the per CPU emergency nesting counter of
 *		the current CPU or to the init data during early boot.
 *
 * The function is safe for reading per-CPU variables in any context because
 * preemption is disabled if the current CPU is in the emergency state. See
 * also nbcon_cpu_emergency_enter().
 */
static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void)
{
	/*
	 * The value of __printk_percpu_data_ready gets set in normal
	 * context and before SMP initialization. As a result it could
	 * never change while inside an nbcon emergency section.
	 */
	if (!printk_percpu_data_ready())
		return &early_nbcon_pcpu_emergency_nesting;

	return raw_cpu_ptr(&nbcon_pcpu_emergency_nesting);
}

/**
 * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon
 * printing on the current CPU
 *
 * Context: Any context.
 * Return: The nbcon_prio to use for acquiring an nbcon console in this
 * context for printing.
 *
 * The function is safe for reading per-CPU data in any context because
 * preemption is disabled if the current CPU is in the emergency or panic
 * state.
*/
enum nbcon_prio nbcon_get_default_prio(void)
{
	unsigned int *cpu_emergency_nesting;

	/* Panic on this CPU takes precedence over the emergency nesting. */
	if (panic_on_this_cpu())
		return NBCON_PRIO_PANIC;

	cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
	if (*cpu_emergency_nesting)
		return NBCON_PRIO_EMERGENCY;

	return NBCON_PRIO_NORMAL;
}

/*
 * Track if it is allowed to perform unsafe hostile takeovers of console
 * ownership. When true, console drivers might perform unsafe actions while
 * printing. It is externally available via nbcon_allow_unsafe_takeover().
 */
static bool panic_nbcon_allow_unsafe_takeover;

/**
 * nbcon_allow_unsafe_takeover - Check if unsafe console takeovers are allowed
 *
 * Return:	True, when it is permitted to perform unsafe console printing
 *
 * This is also used by console_is_usable() to determine if it is allowed to
 * call write_atomic() callbacks flagged as unsafe (CON_NBCON_ATOMIC_UNSAFE).
 */
bool nbcon_allow_unsafe_takeover(void)
{
	/* Only the panic CPU may use the permission. */
	return panic_on_this_cpu() && panic_nbcon_allow_unsafe_takeover;
}

/**
 * nbcon_legacy_emit_next_record - Print one record for an nbcon console
 *					in legacy contexts
 * @con:	The console to print on
 * @handover:	Will be set to true if a printk waiter has taken over the
 *		console_lock, in which case the caller is no longer holding
 *		both the console_lock and the SRCU read lock. Otherwise it
 *		is set to false.
 * @cookie:	The cookie from the SRCU read lock.
 * @use_atomic: Set true when called in an atomic or unknown context.
 *		It affects which nbcon callback will be used: write_atomic()
 *		or write_thread().
 *
 *		When false, the write_thread() callback is used and would be
 *		called in a preemptible context unless disabled by the
 *		device_lock. The legacy handover is not allowed in this mode.
 *
 * Context:	Any context except NMI.
 * Return:	True, when a record has been printed and there are still
 *		pending records. The caller might want to continue flushing.
 *
 *		False, when there is no pending record, or when the console
 *		context cannot be acquired, or the ownership has been lost.
 *		The caller should give up. Either the job is done, cannot be
 *		done, or will be handled by the owning context.
 *
 * This function is meant to be called by console_flush_all() to print records
 * on nbcon consoles from legacy context (printing via console unlocking).
 * Essentially it is the nbcon version of console_emit_next_record().
 */
bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
				   int cookie, bool use_atomic)
{
	struct nbcon_write_context wctxt = { };
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
	unsigned long flags;
	bool progress;

	ctxt->console = con;
	ctxt->prio = nbcon_get_default_prio();

	if (use_atomic) {
		/*
		 * In an atomic or unknown context, use the same procedure as
		 * in console_emit_next_record(). It allows to handover.
		 */
		printk_safe_enter_irqsave(flags);
		console_lock_spinning_enable();
		stop_critical_timings();
	}

	progress = nbcon_emit_one(&wctxt, use_atomic);

	if (use_atomic) {
		/* Undo the setup above in reverse order. */
		start_critical_timings();
		*handover = console_lock_spinning_disable_and_check(cookie);
		printk_safe_exit_irqrestore(flags);
	} else {
		/* Non-atomic does not perform legacy spinning handovers. */
		*handover = false;
	}

	return progress;
}

/**
 * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its
 * write_atomic() callback
 * @con: The nbcon console to flush
 * @stop_seq: Flush up until this record
 *
 * Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on
 * failure.
 *
 * Errors:
 *
 * -EPERM: Unable to acquire console ownership.
 *
 * -EAGAIN: Another context took over ownership while printing.
 *
 * -ENOENT: A record before @stop_seq is not available.
 *
 * If flushing up to @stop_seq was not successful, it only makes sense for the
 * caller to try again when -EAGAIN was returned. When -EPERM is returned,
 * this context is not allowed to acquire the console. When -ENOENT is
 * returned, it cannot be expected that the unfinalized record will become
 * available.
*/
static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq)
{
	struct nbcon_write_context wctxt = { };
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
	int err = 0;

	ctxt->console			= con;
	ctxt->spinwait_max_us		= 2000;
	ctxt->prio			= nbcon_get_default_prio();
	ctxt->allow_unsafe_takeover	= nbcon_allow_unsafe_takeover();

	while (nbcon_seq_read(con) < stop_seq) {
		/*
		 * Atomic flushing does not use console driver synchronization
		 * (i.e. it does not hold the port lock for uart consoles).
		 * Therefore IRQs must be disabled to avoid being interrupted
		 * and then calling into a driver that will deadlock trying
		 * to acquire console ownership.
		 */
		scoped_guard(irqsave) {
			if (!nbcon_context_try_acquire(ctxt, false))
				return -EPERM;

			/*
			 * nbcon_emit_next_record() returns false when
			 * the console was handed over or taken over.
			 * In both cases the context is no longer valid.
			 */
			if (!nbcon_emit_next_record(&wctxt, true))
				return -EAGAIN;

			nbcon_context_release(ctxt);
		}

		if (!ctxt->backlog) {
			/* Are there reserved but not yet finalized records? */
			if (nbcon_seq_read(con) < stop_seq)
				err = -ENOENT;
			break;
		}
	}

	return err;
}

/**
 * nbcon_atomic_flush_pending_con - Flush specified nbcon console using its
 *					write_atomic() callback
 * @con:	The nbcon console to flush
 * @stop_seq:	Flush up until this record
 *
 * This will stop flushing before @stop_seq if another context has ownership.
 * That context is then responsible for the flushing. Likewise, if new records
 * are added while this context was flushing and there is no other context
 * to handle the printing, this context must also flush those records.
 */
static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq)
{
	struct console_flush_type ft;
	int err;

again:
	err = __nbcon_atomic_flush_pending_con(con, stop_seq);

	/*
	 * If there was a new owner (-EPERM, -EAGAIN), that context is
	 * responsible for completing.
	 *
	 * Do not wait for records not yet finalized (-ENOENT) to avoid a
	 * possible deadlock. They will either get flushed by the writer or
	 * eventually skipped on panic CPU.
	 */
	if (err)
		return;

	/*
	 * If flushing was successful but more records are available, this
	 * context must flush those remaining records if the printer thread
	 * is not available to do it.
	 */
	printk_get_console_flush_type(&ft);
	if (!ft.nbcon_offload &&
	    prb_read_valid(prb, nbcon_seq_read(con), NULL)) {
		/* Retry with the stop point moved to the next reserve sequence. */
		stop_seq = prb_next_reserve_seq(prb);
		goto again;
	}
}

/**
 * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their
 *					write_atomic() callback
 * @stop_seq:	Flush up until this record
 *
 * Consoles that are not nbcon, not usable, or already at or past @stop_seq
 * are skipped.
 */
static void __nbcon_atomic_flush_pending(u64 stop_seq)
{
	struct console *con;
	int cookie;

	cookie = console_srcu_read_lock();
	for_each_console_srcu(con) {
		short flags = console_srcu_read_flags(con);

		if (!(flags & CON_NBCON))
			continue;

		if (!console_is_usable(con, flags, true))
			continue;

		if (nbcon_seq_read(con) >= stop_seq)
			continue;

		nbcon_atomic_flush_pending_con(con, stop_seq);
	}
	console_srcu_read_unlock(cookie);
}

/**
 * nbcon_atomic_flush_pending - Flush all nbcon consoles using their
 *				write_atomic() callback
 *
 * Flush the backlog up through the currently newest record. Any new
 * records added while flushing will not be flushed if there is another
 * context available to handle the flushing. This is to avoid one CPU
 * printing unbounded because other CPUs continue to add records.
 */
void nbcon_atomic_flush_pending(void)
{
	__nbcon_atomic_flush_pending(prb_next_reserve_seq(prb));
}

/**
 * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their
 * write_atomic() callback and allowing unsafe hostile takeovers
 *
 * Flush the backlog up through the currently newest record. Unsafe hostile
 * takeovers will be performed, if necessary.
*/
void nbcon_atomic_flush_unsafe(void)
{
	/* The permission is only set for the duration of this flush. */
	panic_nbcon_allow_unsafe_takeover = true;
	__nbcon_atomic_flush_pending(prb_next_reserve_seq(prb));
	panic_nbcon_allow_unsafe_takeover = false;
}

/**
 * nbcon_cpu_emergency_enter - Enter an emergency section where printk()
 *				messages for that CPU are flushed directly
 *
 * Context:	Any context. Disables preemption.
 *
 * When within an emergency section, printk() calls will attempt to flush any
 * pending messages in the ringbuffer.
 */
void nbcon_cpu_emergency_enter(void)
{
	unsigned int *cpu_emergency_nesting;

	preempt_disable();

	/* Global count of emergency sections; the printer kthreads check it. */
	atomic_inc(&nbcon_cpu_emergency_cnt);

	cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
	(*cpu_emergency_nesting)++;
}

/**
 * nbcon_cpu_emergency_exit - Exit an emergency section
 *
 * Context:	Within an emergency section. Enables preemption.
 */
void nbcon_cpu_emergency_exit(void)
{
	unsigned int *cpu_emergency_nesting;

	cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();

	/* Warn once instead of underflowing on an unbalanced exit. */
	if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0))
		(*cpu_emergency_nesting)--;

	/*
	 * Wake up kthreads because there might be some pending messages
	 * added by other CPUs with normal priority since the last flush
	 * in the emergency context.
	 */
	if (!WARN_ON_ONCE(atomic_read(&nbcon_cpu_emergency_cnt) == 0)) {
		if (atomic_dec_return(&nbcon_cpu_emergency_cnt) == 0) {
			struct console_flush_type ft;

			printk_get_console_flush_type(&ft);
			if (ft.nbcon_offload)
				nbcon_kthreads_wake();
		}
	}

	preempt_enable();
}

/**
 * nbcon_alloc - Allocate and init the nbcon console specific data
 * @con:	Console to initialize
 *
 * Return:	True if the console was fully allocated and initialized.
 *		Otherwise @con must not be registered.
 *
 * When allocation and init was successful, the console must be properly
 * freed using nbcon_free() once it is no longer needed.
 */
bool nbcon_alloc(struct console *con)
{
	struct nbcon_state state = { };

	/* Synchronize the kthread start. */
	lockdep_assert_console_list_lock_held();

	/* Check for mandatory nbcon callbacks. */
	if (WARN_ON(!con->write_thread ||
		    !con->device_lock ||
		    !con->device_unlock)) {
		return false;
	}

	rcuwait_init(&con->rcuwait);
	init_irq_work(&con->irq_work, nbcon_irq_work);
	atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL);
	nbcon_state_set(con, &state);

	/*
	 * Initialize @nbcon_seq to the highest possible sequence number so
	 * that practically speaking it will have nothing to print until a
	 * desired initial sequence number has been set via nbcon_seq_force().
	 */
	atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), ULSEQ_MAX(prb));

	if (con->flags & CON_BOOT) {
		/*
		 * Boot console printing is synchronized with legacy console
		 * printing, so boot consoles can share the same global printk
		 * buffers.
		 */
		con->pbufs = &printk_shared_pbufs;
	} else {
		con->pbufs = kmalloc_obj(*con->pbufs);
		if (!con->pbufs) {
			con_printk(KERN_ERR, con, "failed to allocate printing buffer\n");
			return false;
		}

		if (printk_kthreads_ready && !have_boot_console) {
			if (!nbcon_kthread_create(con)) {
				/* Undo the buffer allocation on failure. */
				kfree(con->pbufs);
				con->pbufs = NULL;
				return false;
			}

			/* Might be the first kthread. */
			printk_kthreads_running = true;
		}
	}

	return true;
}

/**
 * nbcon_free - Free and cleanup the nbcon console specific data
 * @con:	Console to free/cleanup nbcon data
 *
 * Important: @have_nbcon_console must be updated before calling
 *	this function. In particular, it can be set only when there
 *	is still another nbcon console registered.
 */
void nbcon_free(struct console *con)
{
	struct nbcon_state state = { };

	/* Synchronize the kthread stop. */
	lockdep_assert_console_list_lock_held();

	if (printk_kthreads_running) {
		nbcon_kthread_stop(con);

		/* Might be the last nbcon console.
		 *
		 * Do not rely on printk_kthreads_check_locked(). It is not
		 * called in some code paths, see nbcon_free() callers.
		 */
		if (!have_nbcon_console)
			printk_kthreads_running = false;
	}

	nbcon_state_set(con, &state);

	/* Boot consoles share global printk buffers. */
	if (!(con->flags & CON_BOOT))
		kfree(con->pbufs);

	con->pbufs = NULL;
}

/**
 * nbcon_device_try_acquire - Try to acquire nbcon console and enter unsafe
 *				section
 * @con:	The nbcon console to acquire
 *
 * Context:	Under the locking mechanism implemented in
 *		@con->device_lock() including disabling migration.
 * Return:	True if the console was acquired. False otherwise.
 *
 * Console drivers will usually use their own internal synchronization
 * mechanism to synchronize between console printing and non-printing
 * activities (such as setting baud rates). However, nbcon console drivers
 * supporting atomic consoles may also want to mark unsafe sections when
 * performing non-printing activities in order to synchronize against their
 * write_atomic() callback.
 *
 * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL
 * and marks it unsafe for handover/takeover.
 */
bool nbcon_device_try_acquire(struct console *con)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);

	cant_migrate();

	/* Start from a clean context; only console and priority are set. */
	memset(ctxt, 0, sizeof(*ctxt));
	ctxt->console	= con;
	ctxt->prio	= NBCON_PRIO_NORMAL;

	if (!nbcon_context_try_acquire(ctxt, false))
		return false;

	if (!nbcon_context_enter_unsafe(ctxt))
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(nbcon_device_try_acquire);

/**
 * nbcon_device_release - Exit unsafe section and release the nbcon console
 * @con:	The nbcon console acquired in nbcon_device_try_acquire()
 */
void nbcon_device_release(struct console *con)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);
	struct console_flush_type ft;
	int cookie;

	/* Nothing to release or flush when leaving the unsafe section fails. */
	if (!nbcon_context_exit_unsafe(ctxt))
		return;

	nbcon_context_release(ctxt);

	/*
	 * This context must flush any new records added while the console
	 * was locked if the printer thread is not available to do it. The
	 * console_srcu_read_lock must be taken to ensure the console is
	 * usable throughout flushing.
	 */
	cookie = console_srcu_read_lock();
	printk_get_console_flush_type(&ft);
	if (console_is_usable(con, console_srcu_read_flags(con), true) &&
	    !ft.nbcon_offload &&
	    prb_read_valid(prb, nbcon_seq_read(con), NULL)) {
		/*
		 * If nbcon_atomic flushing is not available, fallback to
		 * using the legacy loop.
		 */
		if (ft.nbcon_atomic) {
			__nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb));
		} else if (ft.legacy_direct) {
			if (console_trylock())
				console_unlock();
		} else if (ft.legacy_offload) {
			defer_console_output();
		}
	}
	console_srcu_read_unlock(cookie);
}
EXPORT_SYMBOL_GPL(nbcon_device_release);

/**
 * nbcon_kdb_try_acquire - Try to acquire nbcon console and enter unsafe
 *				section
 * @con:	The nbcon console to acquire
 * @wctxt:	The nbcon write context to be used on success
 *
 * Context:	Under console_srcu_read_lock() for emitting a single kdb message
 *		using the given con->write_atomic() callback. Can be called
 *		only when the console is usable at the moment.
 *
 * Return:	True if the console was acquired. False otherwise.
 *
 * kdb emits messages on consoles registered for printk() without
 * storing them into the ring buffer. It has to acquire the console
 * ownership so that it could call con->write_atomic() callback in a safe way.
 *
 * This function acquires the nbcon console using priority NBCON_PRIO_EMERGENCY
 * and marks it unsafe for handover/takeover.
 */
bool nbcon_kdb_try_acquire(struct console *con,
			   struct nbcon_write_context *wctxt)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);

	/* Start from a clean context; only console and priority are set. */
	memset(ctxt, 0, sizeof(*ctxt));
	ctxt->console = con;
	ctxt->prio    = NBCON_PRIO_EMERGENCY;

	if (!nbcon_context_try_acquire(ctxt, false))
		return false;

	if (!nbcon_context_enter_unsafe(ctxt))
		return false;

	return true;
}

/**
 * nbcon_kdb_release - Exit unsafe section and release the nbcon console
 *
 * @wctxt:	The nbcon write context initialized by a successful
 *		nbcon_kdb_try_acquire()
 */
void nbcon_kdb_release(struct nbcon_write_context *wctxt)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);

	/* Nothing to release or flush when leaving the unsafe section fails. */
	if (!nbcon_context_exit_unsafe(ctxt))
		return;

	nbcon_context_release(ctxt);

	/*
	 * Flush any new printk() messages added when the console was blocked.
	 * Only the console used by the given write context was blocked.
	 * The console was locked only when the write_atomic() callback
	 * was usable.
	 */
	__nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb));
}
2576 9 9 9 9 1 1 9 8 9 1 1 10 8 2 2 9 9 9 9 4 9 4 4 4 4 4 10 10 8 2 10 4 51 51 2489 2480 2488 2489 2489 2489 2467 2482 1 2305 2159 2154 2158 2159 2149 2092 824 824 774 347 3 68 347 347 68 347 371 800 801 802 800 5 5 5 5 5 5 2406 2 2505 2393 2389 800 770 685 800 800 11 4 7 2126 2130 2133 2126 2129 2128 2127 2128 2128 2129 2 2127 67 2071 2343 2347 2330 2348 2304 8 6 2 2119 2119 8 8 8 322 2052 2050 198 47 193 193 193 193 122 122 73 81 81 70 2 68 68 2407 2335 2411 2412 2412 2409 122 123 2127 2131 2128 2130 2124 67 2133 800 800 801 111 801 801 800 801 802 800 801 800 802 799 801 802 801 802 729 729 152 153 149 67 93 894 707 798 51 51 51 51 51 14 36 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 
364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 
864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 
1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 
1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/dir.c - kernfs directory implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/sched.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/idr.h> #include <linux/slab.h> #include 
<linux/security.h>
#include <linux/hash.h>
#include <linux/ns_common.h>

#include "kernfs-internal.h"

/*
 * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
 * call pr_cont() while holding rename_lock. Because sometimes pr_cont()
 * will perform wakeups when releasing console_sem. Holding rename_lock
 * will introduce deadlock if the scheduler reads the kernfs_name in the
 * wakeup path.
 */
static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by kernfs_pr_cont_lock */

/* Convert a pointer to the embedded rb member back to its kernfs_node. */
#define rb_to_kn(X)	rb_entry((X), struct kernfs_node, rb)

/* A node counts as active while its ->active counter is not negative. */
static bool __kernfs_active(struct kernfs_node *kn)
{
	return atomic_read(&kn->active) >= 0;
}

/* Same as __kernfs_active(), but asserts that the root's kernfs_rwsem is held. */
static bool kernfs_active(struct kernfs_node *kn)
{
	lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem);
	return __kernfs_active(kn);
}

/*
 * Report whether @kn has KERNFS_LOCKDEP set. Always false when
 * CONFIG_DEBUG_LOCK_ALLOC is not enabled.
 */
static bool kernfs_lockdep(struct kernfs_node *kn)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	return kn->flags & KERNFS_LOCKDEP;
#else
	return false;
#endif
}

/*
 * kernfs_depth - compute depth from @from to @to
 *
 * Counts the parent links followed from @to until either @from or a node
 * without a parent is reached.
 */
static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
{
	size_t depth = 0;

	while (rcu_dereference(to->__parent) && to != from) {
		depth++;
		to = rcu_dereference(to->__parent);
	}
	return depth;
}

/*
 * Find the closest common ancestor of @a and @b.
 *
 * Returns NULL when the two nodes belong to different roots. Otherwise the
 * deeper node is first lifted to the depth of the other one, then both are
 * walked up together until they meet.
 */
static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
						  struct kernfs_node *b)
{
	size_t da, db;
	struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b);

	if (ra != rb)
		return NULL;

	/* Depth of each node measured from the root node. */
	da = kernfs_depth(ra->kn, a);
	db = kernfs_depth(rb->kn, b);

	while (da > db) {
		a = rcu_dereference(a->__parent);
		da--;
	}
	while (db > da) {
		b = rcu_dereference(b->__parent);
		db--;
	}

	/* worst case b and a will be the same at root */
	while (b != a) {
		b = rcu_dereference(b->__parent);
		a = rcu_dereference(a->__parent);
	}

	return a;
}

/**
 * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to,
 * where kn_from is treated as root of the path.
* @kn_from: kernfs node which should be treated as root for the path
 * @kn_to: kernfs node to which path is needed
 * @buf: buffer to copy the path into
 * @buflen: size of @buf
 *
 * We need to handle couple of scenarios here:
 * [1] when @kn_from is an ancestor of @kn_to at some level
 * kn_from: /n1/n2/n3
 * kn_to: /n1/n2/n3/n4/n5
 * result: /n4/n5
 *
 * [2] when @kn_from is on a different hierarchy and we need to find common
 * ancestor between @kn_from and @kn_to.
 * kn_from: /n1/n2/n3/n4
 * kn_to: /n1/n2/n5
 * result: /../../n5
 * OR
 * kn_from: /n1/n2/n3/n4/n5 [depth=5]
 * kn_to: /n1/n2/n3 [depth=3]
 * result: /../..
 *
 * [3] when @kn_to is %NULL result will be "(null)"
 *
 * Return: the length of the constructed path. If the path would have been
 * greater than @buflen, @buf contains the truncated path with the trailing
 * '\0'. On error, -errno is returned.
 */
static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
					struct kernfs_node *kn_from,
					char *buf, size_t buflen)
{
	struct kernfs_node *kn, *common;
	const char parent_str[] = "/..";
	size_t depth_from, depth_to, len = 0;
	ssize_t copied;
	int i, j;

	if (!kn_to)
		return strscpy(buf, "(null)", buflen);

	/* Without an explicit @kn_from the path is relative to the root node. */
	if (!kn_from)
		kn_from = kernfs_root(kn_to)->kn;

	if (kn_from == kn_to)
		return strscpy(buf, "/", buflen);

	/* No common ancestor means the nodes are on different roots. */
	common = kernfs_common_ancestor(kn_from, kn_to);
	if (WARN_ON(!common))
		return -EINVAL;

	depth_to = kernfs_depth(common, kn_to);
	depth_from = kernfs_depth(common, kn_from);

	buf[0] = '\0';

	/* One "/.." for every level between @kn_from and the common ancestor. */
	for (i = 0; i < depth_from; i++) {
		copied = strscpy(buf + len, parent_str, buflen - len);
		if (copied < 0)
			return copied;
		len += copied;
	}

	/*
	 * Append "/<name>" for every node below the common ancestor, from the
	 * outermost one down to @kn_to. Each node is reached by walking @i
	 * parents up from @kn_to, and each write is bounded by the space left
	 * in @buf.
	 */
	for (i = depth_to - 1; i >= 0; i--) {
		const char *name;

		for (kn = kn_to, j = 0; j < i; j++)
			kn = rcu_dereference(kn->__parent);

		name = rcu_dereference(kn->name);

		len += scnprintf(buf + len, buflen - len, "/%s", name);
	}

	return len;
}

/**
 * kernfs_name - obtain the name of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer to copy @kn's name into
* @buflen: size of @buf * * Copies the name of @kn into @buf of @buflen bytes. The behavior is * similar to strscpy(). * * Fills buffer with "(null)" if @kn is %NULL. * * Return: the resulting length of @buf. If @buf isn't long enough, * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG. * * This function can be called from any context. */ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { struct kernfs_node *kn_parent; if (!kn) return strscpy(buf, "(null)", buflen); guard(rcu)(); /* * KERNFS_ROOT_INVARIANT_PARENT is ignored here. The name is RCU freed and * the parent is either existing or not. */ kn_parent = rcu_dereference(kn->__parent); return strscpy(buf, kn_parent ? rcu_dereference(kn->name) : "/", buflen); } /** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest * @buf: buffer to copy @to's path into * @buflen: size of @buf * * Builds @to's path relative to @from in @buf. @from and @to must * be on the same kernfs-root. If @from is not parent of @to, then a relative * path (which includes '..'s) as needed to reach from @from to @to is * returned. * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. 
*/ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) { struct kernfs_root *root; guard(rcu)(); if (to) { root = kernfs_root(to); if (!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)) { guard(read_lock_irqsave)(&root->kernfs_rename_lock); return kernfs_path_from_node_locked(to, from, buf, buflen); } } return kernfs_path_from_node_locked(to, from, buf, buflen); } EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_name(struct kernfs_node *kn) { unsigned long flags; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); pr_cont("%s", kernfs_pr_cont_buf); spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * pr_cont_kernfs_path - pr_cont path of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_path(struct kernfs_node *kn) { unsigned long flags; int sz; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); if (sz < 0) { if (sz == -E2BIG) pr_cont("(name too long)"); else pr_cont("(error)"); goto out; } pr_cont("%s", kernfs_pr_cont_buf); out: spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * kernfs_get_parent - determine the parent node and pin it * @kn: kernfs_node of interest * * Determines @kn's parent, pins and returns it. This function can be * called from any context. 
* * Return: parent node of @kn */ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; unsigned long flags; root = kernfs_root(kn); read_lock_irqsave(&root->kernfs_rename_lock, flags); parent = kernfs_parent(kn); kernfs_get(parent); read_unlock_irqrestore(&root->kernfs_rename_lock, flags); return parent; } /* * kernfs_ns_id - return the namespace id for a given namespace * @ns: namespace tag (may be NULL) * * Use the 64-bit namespace id instead of raw pointers for hashing * and comparison to avoid leaking kernel addresses to userspace. */ static u64 kernfs_ns_id(const struct ns_common *ns) { return ns ? ns->ns_id : 0; } /** * kernfs_name_hash - calculate hash of @ns + @name * @name: Null terminated string to hash * @ns: Namespace tag to hash * * Return: 31-bit hash of ns + name (so it fits in an off_t) */ static unsigned int kernfs_name_hash(const char *name, const struct ns_common *ns) { unsigned long hash = init_name_hash(kernfs_ns_id(ns)); unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); hash = end_name_hash(hash); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 2) hash += 2; if (hash >= INT_MAX) hash = INT_MAX - 1; return hash; } static int kernfs_name_compare(unsigned int hash, const char *name, const struct ns_common *ns, const struct kernfs_node *kn) { u64 ns_id = kernfs_ns_id(ns); u64 kn_ns_id = kernfs_ns_id(kn->ns); if (hash < kn->hash) return -1; if (hash > kn->hash) return 1; if (ns_id < kn_ns_id) return -1; if (ns_id > kn_ns_id) return 1; return strcmp(name, kernfs_rcu_name(kn)); } static int kernfs_sd_compare(const struct kernfs_node *left, const struct kernfs_node *right) { return kernfs_name_compare(left->hash, kernfs_rcu_name(left), left->ns, right); } /** * kernfs_link_sibling - link kernfs_node into sibling rbtree * @kn: kernfs_node of interest * * Link @kn into its sibling rbtree which 
starts from * @kn->parent->dir.children. * * Locking: * kernfs_rwsem held exclusive * * Return: * %0 on success, -EEXIST on failure. */ static int kernfs_link_sibling(struct kernfs_node *kn) { struct rb_node *parent = NULL; struct kernfs_node *kn_parent; struct rb_node **node; kn_parent = kernfs_parent(kn); node = &kn_parent->dir.children.rb_node; while (*node) { struct kernfs_node *pos; int result; pos = rb_to_kn(*node); parent = *node; result = kernfs_sd_compare(kn, pos); if (result < 0) node = &pos->rb.rb_left; else if (result > 0) node = &pos->rb.rb_right; else return -EEXIST; } /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn_parent->dir.children); /* successfully added, account subdir number */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn_parent->dir.subdirs++; kernfs_inc_rev(kn_parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); return 0; } /** * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree * @kn: kernfs_node of interest * * Try to unlink @kn from its sibling rbtree which starts from * kn->parent->dir.children. * * Return: %true if @kn was actually removed, * %false if @kn wasn't on the rbtree. * * Locking: * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { struct kernfs_node *kn_parent; if (RB_EMPTY_NODE(&kn->rb)) return false; kn_parent = kernfs_parent(kn); down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn_parent->dir.subdirs--; kernfs_inc_rev(kn_parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); rb_erase(&kn->rb, &kn_parent->dir.children); RB_CLEAR_NODE(&kn->rb); return true; } /** * kernfs_get_active - get an active reference to kernfs_node * @kn: kernfs_node to get an active reference to * * Get an active reference of @kn. This function is noop if @kn * is %NULL. * * Return: * Pointer to @kn on success, %NULL on failure. 
*/
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
{
	if (unlikely(!kn))
		return NULL;

	/* Refuse the reference when the active count is negative. */
	if (!atomic_inc_unless_negative(&kn->active))
		return NULL;

	/* Tell lockdep about the read-side acquisition, if enabled for @kn. */
	if (kernfs_lockdep(kn))
		rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
	return kn;
}

/**
 * kernfs_put_active - put an active reference to kernfs_node
 * @kn: kernfs_node to put an active reference to
 *
 * Put an active reference to @kn. This function is noop if @kn
 * is %NULL.
 */
void kernfs_put_active(struct kernfs_node *kn)
{
	int v;

	if (unlikely(!kn))
		return;

	if (kernfs_lockdep(kn))
		rwsem_release(&kn->dep_map, _RET_IP_);
	v = atomic_dec_return(&kn->active);
	/*
	 * Only when the count lands exactly on KN_DEACTIVATED_BIAS are the
	 * waiters on deactivate_waitq woken; kernfs_drain() waits for
	 * exactly that value.
	 */
	if (likely(v != KN_DEACTIVATED_BIAS))
		return;

	wake_up_all(&kernfs_root(kn)->deactivate_waitq);
}

/**
 * kernfs_drain - drain kernfs_node
 * @kn: kernfs_node to drain
 * @drop_supers: Set to true if this function is called with the
 * kernfs_supers_rwsem locked.
 *
 * Drain existing usages and nuke all existing mmaps of @kn. Multiple
 * removers may invoke this function concurrently on @kn and all will
 * return after draining is complete.
 */
static void kernfs_drain(struct kernfs_node *kn, bool drop_supers)
	__releases(&kernfs_root(kn)->kernfs_rwsem)
	__acquires(&kernfs_root(kn)->kernfs_rwsem)
{
	struct kernfs_root *root = kernfs_root(kn);

	lockdep_assert_held_write(&root->kernfs_rwsem);
	/* The caller must have deactivated @kn before draining it. */
	WARN_ON_ONCE(kernfs_active(kn));

	/*
	 * Skip draining if already fully drained. This avoids draining and its
	 * lockdep annotations for nodes which have never been activated
	 * allowing embedding kernfs_remove() in create error paths without
	 * worrying about draining.
	 */
	if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS &&
	    !kernfs_should_drain_open_files(kn))
		return;

	/* Drop the locks while waiting; they are retaken before returning. */
	up_write(&root->kernfs_rwsem);
	if (drop_supers)
		up_read(&root->kernfs_supers_rwsem);

	if (kernfs_lockdep(kn)) {
		rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
		if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
			lock_contended(&kn->dep_map, _RET_IP_);
	}

	/* Wait until every outstanding active reference has been put. */
	wait_event(root->deactivate_waitq,
		   atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);

	if (kernfs_lockdep(kn)) {
		lock_acquired(&kn->dep_map, _RET_IP_);
		rwsem_release(&kn->dep_map, _RET_IP_);
	}

	if (kernfs_should_drain_open_files(kn))
		kernfs_drain_open_files(kn);

	/* Retake the locks in the reverse order of release. */
	if (drop_supers)
		down_read(&root->kernfs_supers_rwsem);
	down_write(&root->kernfs_rwsem);
}

/**
 * kernfs_get - get a reference count on a kernfs_node
 * @kn: the target kernfs_node
 *
 * A %NULL @kn is ignored.  Warns if the count is already zero when the
 * reference is taken.
 */
void kernfs_get(struct kernfs_node *kn)
{
	if (kn) {
		WARN_ON(!atomic_read(&kn->count));
		atomic_inc(&kn->count);
	}
}
EXPORT_SYMBOL_GPL(kernfs_get);

/*
 * RCU callback queued by kernfs_put(): releases the node's name, its
 * iattr (if any) and finally the node itself.
 */
static void kernfs_free_rcu(struct rcu_head *rcu)
{
	struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu);

	/* If the whole node goes away, then name can't be used outside */
	kfree_const(rcu_access_pointer(kn->name));

	if (kn->iattr)
		kmem_cache_free(kernfs_iattrs_cache, kn->iattr);

	kmem_cache_free(kernfs_node_cache, kn);
}

/**
 * kernfs_put - put a reference count on a kernfs_node
 * @kn: the target kernfs_node
 *
 * Put a reference count of @kn and destroy it if it reached zero.
 * Destroying a node also drops one reference on its parent, so the
 * release can cascade up through the ancestors; when the node without a
 * parent is released, the kernfs_root is freed as well.
 */
void kernfs_put(struct kernfs_node *kn)
{
	struct kernfs_node *parent;
	struct kernfs_root *root;

	if (!kn || !atomic_dec_and_test(&kn->count))
		return;
	root = kernfs_root(kn);
 repeat:
	/*
	 * Moving/renaming is always done while holding reference.
	 * kn->parent won't change beneath us.
	 */
	parent = kernfs_parent(kn);

	WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
		  "kernfs_put: %s/%s: released with incorrect active_ref %d\n",
		  parent ? rcu_dereference(parent->name) : "",
		  rcu_dereference(kn->name), atomic_read(&kn->active));

	/* A symlink holds a reference on its target; release it now. */
	if (kernfs_type(kn) == KERNFS_LINK)
		kernfs_put(kn->symlink.target_kn);

	if (kn->iattr && kn->iattr->xattrs) {
		simple_xattrs_free(kn->iattr->xattrs, NULL);
		kfree(kn->iattr->xattrs);
		kn->iattr->xattrs = NULL;
	}

	/* Remove the node from the ino idr before it is handed to RCU. */
	spin_lock(&root->kernfs_idr_lock);
	idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
	spin_unlock(&root->kernfs_idr_lock);

	call_rcu(&kn->rcu, kernfs_free_rcu);

	/* Drop the reference this node held on its parent. */
	kn = parent;
	if (kn) {
		if (atomic_dec_and_test(&kn->count))
			goto repeat;
	} else {
		/* just released the root kn, free @root too */
		idr_destroy(&root->ino_idr);
		kfree_rcu(root, rcu);
	}
}
EXPORT_SYMBOL_GPL(kernfs_put);

/**
 * kernfs_node_from_dentry - determine kernfs_node associated with a dentry
 * @dentry: the dentry in question
 *
 * Return: the kernfs_node associated with @dentry. If @dentry is not a
 * kernfs one, %NULL is returned.
 *
 * While the returned kernfs_node will stay accessible as long as @dentry
 * is accessible, the returned node can be in any state and the caller is
 * fully responsible for determining what's accessible.
*/
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{
	/* A dentry is a kernfs one iff its superblock uses kernfs_sops. */
	if (dentry->d_sb->s_op == &kernfs_sops)
		return kernfs_dentry_node(dentry);
	return NULL;
}

/*
 * __kernfs_new_node - allocate and initialise a kernfs_node
 * @root: hierarchy the node's id is allocated from
 * @parent: parent passed to the security hook; may be %NULL
 * @name: node name; duplicated with kstrdup_const()
 * @mode: initial mode
 * @uid: owner; an iattr is only set up if @uid/@gid are not global root
 * @gid: group
 * @flags: initial KERNFS_* flags
 *
 * The node is returned with a reference count of 1 and its active count
 * set to KN_DEACTIVATED_BIAS.  It is neither linked into a sibling tree
 * nor given a parent pointer here.
 *
 * Return: the new node, or %NULL on any failure.
 */
static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
					     struct kernfs_node *parent,
					     const char *name, umode_t mode,
					     kuid_t uid, kgid_t gid,
					     unsigned flags)
{
	struct kernfs_node *kn;
	u32 id_highbits;
	int ret;

	name = kstrdup_const(name, GFP_KERNEL);
	if (!name)
		return NULL;

	kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
	if (!kn)
		goto err_out1;

	/*
	 * Preload so that the GFP_ATOMIC allocation under the spinlock can
	 * draw on memory reserved here with GFP_KERNEL.
	 */
	idr_preload(GFP_KERNEL);
	spin_lock(&root->kernfs_idr_lock);
	ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
	/* A lower id than last time means the cyclic allocator wrapped. */
	if (ret >= 0 && ret < root->last_id_lowbits)
		root->id_highbits++;
	id_highbits = root->id_highbits;
	/*
	 * NOTE(review): this also stores a negative errno into
	 * last_id_lowbits when the allocation failed - confirm that the
	 * resulting high-bits bump on the next allocation is acceptable.
	 */
	root->last_id_lowbits = ret;
	spin_unlock(&root->kernfs_idr_lock);
	idr_preload_end();
	if (ret < 0)
		goto err_out2;

	/* id = high bits in the upper 32 bits, idr slot in the lower 32. */
	kn->id = (u64)id_highbits << 32 | ret;

	atomic_set(&kn->count, 1);
	atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
	RB_CLEAR_NODE(&kn->rb);

	rcu_assign_pointer(kn->name, name);
	kn->mode = mode;
	kn->flags = flags;

	if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) {
		struct iattr iattr = {
			.ia_valid = ATTR_UID | ATTR_GID,
			.ia_uid = uid,
			.ia_gid = gid,
		};

		ret = __kernfs_setattr(kn, &iattr);
		if (ret < 0)
			goto err_out3;
	}

	if (parent) {
		ret = security_kernfs_init_security(parent, kn);
		if (ret)
			goto err_out4;
	}

	return kn;

	/* Unwind in reverse order of setup; each label falls through. */
 err_out4:
	if (kn->iattr) {
		if (kn->iattr->xattrs) {
			simple_xattrs_free(kn->iattr->xattrs, NULL);
			kfree(kn->iattr->xattrs);
		}
		kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
	}
 err_out3:
	spin_lock(&root->kernfs_idr_lock);
	idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
	spin_unlock(&root->kernfs_idr_lock);
 err_out2:
	kmem_cache_free(kernfs_node_cache, kn);
 err_out1:
	kfree_const(name);
	return NULL;
}

/*
 * kernfs_new_node - allocate a node as a child of @parent
 * @parent: parent node; pinned and recorded in the new node on success
 * @name: name of the new node
 * @mode: mode of the new node
 * @uid: owner of the new node
 * @gid: group of the new node
 * @flags: KERNFS_* flags of the new node
 *
 * The node is not linked into @parent's children here.
 *
 * Return: the new node, or %NULL on failure.
 */
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
				    const char *name, umode_t mode,
				    kuid_t uid, kgid_t gid,
				    unsigned flags)
{
	struct kernfs_node *kn;

	if (parent->mode & S_ISGID) {
		/* this code block imitates inode_init_owner() for
		 * kernfs
		 */

		if (parent->iattr)
			gid = parent->iattr->ia_gid;

		if (flags & KERNFS_DIR)
			mode |= S_ISGID;
	}

	kn = __kernfs_new_node(kernfs_root(parent), parent,
			       name, mode, uid, gid, flags);
	if (kn) {
		/* The child holds a reference on its parent. */
		kernfs_get(parent);
		rcu_assign_pointer(kn->__parent, parent);
	}
	return kn;
}

/*
 * kernfs_find_and_get_node_by_id - get kernfs_node from node id
 * @root: the kernfs root
 * @id: the target node id
 *
 * @id's lower 32bits encode ino and upper gen. If the gen portion is
 * zero, all generations are matched.
 *
 * Return: %NULL on failure,
 * otherwise a kernfs node with reference counter incremented.
 */
struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
						   u64 id)
{
	struct kernfs_node *kn;
	ino_t ino = kernfs_id_ino(id);
	u32 gen = kernfs_id_gen(id);

	rcu_read_lock();

	kn = idr_find(&root->ino_idr, (u32)ino);
	if (!kn)
		goto err_unlock;

	if (sizeof(ino_t) >= sizeof(u64)) {
		/* we looked up with the low 32bits, compare the whole */
		if (kernfs_ino(kn) != ino)
			goto err_unlock;
	} else {
		/* 0 matches all generations */
		if (unlikely(gen && kernfs_gen(kn) != gen))
			goto err_unlock;
	}

	/*
	 * We should fail if @kn has never been activated and guarantee success
	 * if the caller knows that @kn is active. Both can be achieved by
	 * __kernfs_active() which tests @kn->active without kernfs_rwsem.
	 */
	if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
		goto err_unlock;

	rcu_read_unlock();
	return kn;
err_unlock:
	rcu_read_unlock();
	return NULL;
}

/**
 * kernfs_add_one - add kernfs_node to parent without warning
 * @kn: kernfs_node to be added
 *
 * The caller must already have initialized @kn->parent. This
 * function increments nlink of the parent's inode if @kn is a
 * directory and link into the children list of the parent.
 *
 * Return:
 * %0 on success, -EEXIST if entry with the given name already
 * exists.
*/
int kernfs_add_one(struct kernfs_node *kn)
{
	struct kernfs_root *root = kernfs_root(kn);
	struct kernfs_iattrs *ps_iattr;
	struct kernfs_node *parent;
	bool has_ns;
	int ret;

	down_write(&root->kernfs_rwsem);
	parent = kernfs_parent(kn);

	/* -EINVAL: namespace tag mismatch, or the parent is not a directory. */
	ret = -EINVAL;
	has_ns = kernfs_ns_enabled(parent);
	if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
		 has_ns ? "required" : "invalid", kernfs_rcu_name(parent), kernfs_rcu_name(kn)))
		goto out_unlock;

	if (kernfs_type(parent) != KERNFS_DIR)
		goto out_unlock;

	/* -ENOENT: the parent is being removed or must stay empty. */
	ret = -ENOENT;
	if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR))
		goto out_unlock;

	/* The hash must be set before linking; the rbtree is ordered by it. */
	kn->hash = kernfs_name_hash(kernfs_rcu_name(kn), kn->ns);

	ret = kernfs_link_sibling(kn);
	if (ret)
		goto out_unlock;

	/* Update timestamps on the parent */
	down_write(&root->kernfs_iattr_rwsem);

	ps_iattr = parent->iattr;
	if (ps_iattr) {
		ktime_get_real_ts64(&ps_iattr->ia_ctime);
		ps_iattr->ia_mtime = ps_iattr->ia_ctime;
	}

	up_write(&root->kernfs_iattr_rwsem);
	up_write(&root->kernfs_rwsem);

	/*
	 * Activate the new node unless CREATE_DEACTIVATED is requested.
	 * If not activated here, the kernfs user is responsible for
	 * activating the node with kernfs_activate(). A node which hasn't
	 * been activated is not visible to userland and its removal won't
	 * trigger deactivation.
	 */
	if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
		kernfs_activate(kn);
	return 0;

out_unlock:
	up_write(&root->kernfs_rwsem);
	return ret;
}

/**
 * kernfs_find_ns - find kernfs_node with the given name
 * @parent: kernfs_node to search under
 * @name: name to look for
 * @ns: the namespace tag to use
 *
 * Look for kernfs_node with name @name under @parent.
 *
 * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, const unsigned char *name, const struct ns_common *ns) { struct rb_node *node = parent->dir.children.rb_node; bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", kernfs_rcu_name(parent), name); return NULL; } hash = kernfs_name_hash(name, ns); while (node) { struct kernfs_node *kn; int result; kn = rb_to_kn(node); result = kernfs_name_compare(hash, name, ns, kn); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return kn; } return NULL; } static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const struct ns_common *ns) { ssize_t len; char *p, *name; lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); spin_lock_irq(&kernfs_pr_cont_lock); len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); if (len < 0) { spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } p = kernfs_pr_cont_buf; while ((name = strsep(&p, "/")) && parent) { if (*name == '\0') continue; parent = kernfs_find_ns(parent, name, ns); } spin_unlock_irq(&kernfs_pr_cont_lock); return parent; } /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. 
*/ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const struct ns_common *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_walk_and_get_ns - find and get kernfs_node with the given path * @parent: kernfs_node to search under * @path: path to look for * @ns: the namespace tag to use * * Look for kernfs_node with path @path under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const struct ns_common *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } unsigned int kernfs_root_flags(struct kernfs_node *kn) { return kernfs_root(kn)->flags; } /** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * * Return: the root of the new hierarchy on success, ERR_PTR() value on * failure. */ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { struct kernfs_root *root; struct kernfs_node *kn; root = kzalloc_obj(*root); if (!root) return ERR_PTR(-ENOMEM); idr_init(&root->ino_idr); spin_lock_init(&root->kernfs_idr_lock); init_rwsem(&root->kernfs_rwsem); init_rwsem(&root->kernfs_iattr_rwsem); init_rwsem(&root->kernfs_supers_rwsem); INIT_LIST_HEAD(&root->supers); rwlock_init(&root->kernfs_rename_lock); /* * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. * High bits generation. 
The starting value for both ino and * genenration is 1. Initialize upper 32bit allocation * accordingly. */ if (sizeof(ino_t) >= sizeof(u64)) root->id_highbits = 0; else root->id_highbits = 1; kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) { idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } kn->priv = priv; kn->dir.root = root; root->syscall_ops = scops; root->flags = flags; root->kn = kn; init_waitqueue_head(&root->deactivate_waitq); if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return root; } /** * kernfs_destroy_root - destroy a kernfs hierarchy * @root: root of the hierarchy to destroy * * Destroy the hierarchy anchored at @root by removing all existing * directories and destroying @root. */ void kernfs_destroy_root(struct kernfs_root *root) { /* * kernfs_remove holds kernfs_rwsem from the root so the root * shouldn't be freed during the operation. */ kernfs_get(root->kn); kernfs_remove(root->kn); kernfs_put(root->kn); /* will also free @root */ } /** * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root * @root: root to use to lookup * * Return: @root's kernfs_node */ struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) { return root->kn; } /** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory * @uid: uid of the new directory * @gid: gid of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * * Return: the created node on success, ERR_PTR() value on failure. 
*/
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
					 const char *name, umode_t mode,
					 kuid_t uid, kgid_t gid,
					 void *priv,
					 const struct ns_common *ns)
{
	struct kernfs_node *kn;
	int rc;

	/* allocate */
	kn = kernfs_new_node(parent, name, mode | S_IFDIR,
			     uid, gid, KERNFS_DIR);
	if (!kn)
		return ERR_PTR(-ENOMEM);

	kn->dir.root = parent->dir.root;
	kn->ns = ns;
	kn->priv = priv;

	/* link in */
	rc = kernfs_add_one(kn);
	if (!rc)
		return kn;

	/* Linking failed: drop the reference from kernfs_new_node(). */
	kernfs_put(kn);
	return ERR_PTR(rc);
}

/**
 * kernfs_create_empty_dir - create an always empty directory
 * @parent: parent in which to create a new directory
 * @name: name of the new directory
 *
 * Return: the created node on success, ERR_PTR() value on failure.
 */
struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
					    const char *name)
{
	struct kernfs_node *kn;
	int rc;

	/* allocate */
	kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR,
			     GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR);
	if (!kn)
		return ERR_PTR(-ENOMEM);

	/* kernfs_add_one() refuses to add children under an EMPTY_DIR node. */
	kn->flags |= KERNFS_EMPTY_DIR;
	kn->dir.root = parent->dir.root;
	kn->ns = NULL;
	kn->priv = NULL;

	/* link in */
	rc = kernfs_add_one(kn);
	if (!rc)
		return kn;

	kernfs_put(kn);
	return ERR_PTR(rc);
}

/*
 * ->d_revalidate for kernfs dentries.
 *
 * Returns 1 if @dentry is still valid, 0 if it must be discarded and
 * looked up again, and -ECHILD when called with LOOKUP_RCU set.
 */
static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name,
				 struct dentry *dentry, unsigned int flags)
{
	struct kernfs_node *kn, *parent;
	struct kernfs_root *root;

	/* kernfs_rwsem is taken below, so RCU-walk is refused. */
	if (flags & LOOKUP_RCU)
		return -ECHILD;

	/* Negative hashed dentry? */
	if (d_really_is_negative(dentry)) {
		/* If the kernfs parent node has changed discard and
		 * proceed to ->lookup.
		 *
		 * There's nothing special needed here when getting the
		 * dentry parent, even if a concurrent rename is in
		 * progress. That's because the dentry is negative so
		 * it can only be the target of the rename and it will
		 * be doing a d_move() not a replace. Consequently the
		 * dentry d_parent won't change over the d_move().
		 *
		 * Also kernfs negative dentries transitioning from
		 * negative to positive during revalidate won't happen
		 * because they are invalidated on containing directory
		 * changes and the lookup re-done so that a new positive
		 * dentry can be properly created.
		 */
		root = kernfs_root_from_sb(dentry->d_sb);
		down_read(&root->kernfs_rwsem);
		parent = kernfs_dentry_node(dentry->d_parent);
		if (parent) {
			if (kernfs_dir_changed(parent, dentry)) {
				up_read(&root->kernfs_rwsem);
				return 0;
			}
		}
		up_read(&root->kernfs_rwsem);

		/* The kernfs parent node hasn't changed, leave the
		 * dentry negative and return success.
		 */
		return 1;
	}

	kn = kernfs_dentry_node(dentry);
	root = kernfs_root(kn);
	down_read(&root->kernfs_rwsem);

	/* The kernfs node has been deactivated */
	if (!kernfs_active(kn))
		goto out_bad;

	parent = kernfs_parent(kn);
	/* The kernfs node has been moved? */
	if (kernfs_dentry_node(dentry->d_parent) != parent)
		goto out_bad;

	/* The kernfs node has been renamed */
	if (strcmp(dentry->d_name.name, kernfs_rcu_name(kn)) != 0)
		goto out_bad;

	/* The kernfs node has been moved to a different namespace */
	if (parent && kernfs_ns_enabled(parent) &&
	    kernfs_ns_id(kernfs_info(dentry->d_sb)->ns) != kernfs_ns_id(kn->ns))
		goto out_bad;

	up_read(&root->kernfs_rwsem);
	return 1;
out_bad:
	up_read(&root->kernfs_rwsem);
	return 0;
}

/* Dentry operations for kernfs; only revalidation is provided. */
const struct dentry_operations kernfs_dops = {
	.d_revalidate	= kernfs_dop_revalidate,
};

/*
 * ->lookup for kernfs directories: find the child of @dir named by
 * @dentry and attach its inode, or hash @dentry as negative when no such
 * child exists.
 */
static struct dentry *kernfs_iop_lookup(struct inode *dir,
					struct dentry *dentry,
					unsigned int flags)
{
	struct kernfs_node *parent = dir->i_private;
	struct kernfs_node *kn;
	struct kernfs_root *root;
	struct inode *inode = NULL;
	const struct ns_common *ns = NULL;

	root = kernfs_root(parent);
	down_read(&root->kernfs_rwsem);
	/* Only ns-enabled directories are searched with the sb's ns tag. */
	if (kernfs_ns_enabled(parent))
		ns = kernfs_info(dir->i_sb)->ns;

	kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
	/* attach dentry and inode */
	if (kn) {
		/* Inactive nodes are invisible to the VFS so don't
		 * create a negative.
		 */
		if (!kernfs_active(kn)) {
			up_read(&root->kernfs_rwsem);
			return NULL;
		}
		inode = kernfs_get_inode(dir->i_sb, kn);
		if (!inode)
			inode = ERR_PTR(-ENOMEM);
	}
	/*
	 * Needed for negative dentry validation.
	 * The negative dentry can be created in kernfs_iop_lookup()
	 * or transforms from positive dentry in dentry_unlink_inode()
	 * called from vfs_rmdir().
	 */
	if (!IS_ERR(inode))
		kernfs_set_rev(parent, dentry);
	up_read(&root->kernfs_rwsem);

	/* instantiate and hash (possibly negative) dentry */
	return d_splice_alias(inode, dentry);
}

/*
 * ->mkdir: forward to the hierarchy's syscall_ops->mkdir() while holding
 * an active reference on the parent node.  Returns ERR_PTR(-EPERM) when
 * no mkdir operation is provided and ERR_PTR(-ENODEV) when the active
 * reference cannot be taken; otherwise ERR_PTR() of the callback's
 * return value.
 */
static struct dentry *kernfs_iop_mkdir(struct mnt_idmap *idmap,
				       struct inode *dir, struct dentry *dentry,
				       umode_t mode)
{
	struct kernfs_node *parent = dir->i_private;
	struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
	int ret;

	if (!scops || !scops->mkdir)
		return ERR_PTR(-EPERM);

	if (!kernfs_get_active(parent))
		return ERR_PTR(-ENODEV);

	ret = scops->mkdir(parent, dentry->d_name.name, mode);

	kernfs_put_active(parent);
	return ERR_PTR(ret);
}

/*
 * ->rmdir: forward to syscall_ops->rmdir() while holding an active
 * reference on the node being removed.  Returns -EPERM when no rmdir
 * operation is provided and -ENODEV when the active reference cannot be
 * taken.
 */
static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
{
	struct kernfs_node *kn = kernfs_dentry_node(dentry);
	struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
	int ret;

	if (!scops || !scops->rmdir)
		return -EPERM;

	if (!kernfs_get_active(kn))
		return -ENODEV;

	ret = scops->rmdir(kn);

	kernfs_put_active(kn);
	return ret;
}

/*
 * ->rename: forward to syscall_ops->rename() while holding active
 * references on both the node being renamed and the new parent.  Any
 * non-zero @flags is rejected with -EINVAL.
 */
static int kernfs_iop_rename(struct mnt_idmap *idmap,
			     struct inode *old_dir, struct dentry *old_dentry,
			     struct inode *new_dir, struct dentry *new_dentry,
			     unsigned int flags)
{
	struct kernfs_node *kn = kernfs_dentry_node(old_dentry);
	struct kernfs_node *new_parent = new_dir->i_private;
	struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
	int ret;

	if (flags)
		return -EINVAL;

	if (!scops || !scops->rename)
		return -EPERM;

	if (!kernfs_get_active(kn))
		return -ENODEV;

	if (!kernfs_get_active(new_parent)) {
		kernfs_put_active(kn);
		return -ENODEV;
	}

	ret = scops->rename(kn, new_parent, new_dentry->d_name.name);

	/* Release the active references in reverse order of acquisition. */
	kernfs_put_active(new_parent);
	kernfs_put_active(kn);
	return ret;
}

/* Inode operations installed on kernfs directories. */
const struct inode_operations kernfs_dir_iops = {
	.lookup		= kernfs_iop_lookup,
	.permission	= kernfs_iop_permission,
	.setattr	= kernfs_iop_setattr,
	.getattr	= kernfs_iop_getattr,
	.listxattr	= kernfs_iop_listxattr,

	.mkdir		= kernfs_iop_mkdir,
	.rmdir		= kernfs_iop_rmdir,
	.rename		= kernfs_iop_rename,
};

/*
 * Starting at @pos, keep descending into the first child (rb_first() of
 * the children tree) until reaching a node that is not a directory or
 * has no children.  Returns that node, which is @pos itself when @pos
 * has nothing to descend into.
 */
static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
{
	struct kernfs_node *last;

	while (true) {
		struct rb_node *rbn;

		last = pos;

		if (kernfs_type(pos) != KERNFS_DIR)
			break;

		rbn = rb_first(&pos->dir.children);
		if (!rbn)
			break;

		pos = rb_to_kn(rbn);
	}

	return last;
}

/**
 * kernfs_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: kernfs_node whose descendants to walk
 *
 * Find the next descendant to visit for post-order traversal of @root's
 * descendants. @root is included in the iteration and the last node to be
 * visited.
 *
 * Return: the next descendant to visit or %NULL when done.
*/ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, struct kernfs_node *root) { struct rb_node *rbn; lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) return kernfs_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ rbn = rb_next(&pos->rb); if (rbn) return kernfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ return kernfs_parent(pos); } static void kernfs_activate_one(struct kernfs_node *kn) { lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); kn->flags |= KERNFS_ACTIVATED; if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) return; WARN_ON_ONCE(rcu_access_pointer(kn->__parent) && RB_EMPTY_NODE(&kn->rb)); WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); } /** * kernfs_activate - activate a node which started deactivated * @kn: kernfs_node whose subtree is to be activated * * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node * needs to be explicitly activated. A node which hasn't been activated * isn't visible to userland and deactivation is skipped during its * removal. This is useful to construct atomic init sequences where * creation of multiple nodes should either succeed or fail atomically. * * The caller is responsible for ensuring that this function is not called * after kernfs_remove*() is invoked on @kn. 
*/
void kernfs_activate(struct kernfs_node *kn)
{
	struct kernfs_root *root = kernfs_root(kn);
	struct kernfs_node *cur;

	down_write(&root->kernfs_rwsem);

	/* Post-order walk over @kn's subtree, @kn itself included. */
	for (cur = kernfs_next_descendant_post(NULL, kn); cur;
	     cur = kernfs_next_descendant_post(cur, kn))
		kernfs_activate_one(cur);

	up_write(&root->kernfs_rwsem);
}

/**
 * kernfs_show - show or hide a node
 * @kn: kernfs_node to show or hide
 * @show: whether to show or hide
 *
 * If @show is %false, @kn is marked hidden and deactivated. A hidden node is
 * ignored in future activations. If %true, the mark is removed and activation
 * state is restored. This function won't implicitly activate a new node in a
 * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet.
 *
 * To avoid recursion complexities, directories aren't supported for now.
 */
void kernfs_show(struct kernfs_node *kn, bool show)
{
	struct kernfs_root *root = kernfs_root(kn);

	if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR))
		return;

	down_write(&root->kernfs_rwsem);

	if (!show) {
		/* Hide: mark, deactivate if currently active, then drain. */
		kn->flags |= KERNFS_HIDDEN;
		if (kernfs_active(kn))
			atomic_add(KN_DEACTIVATED_BIAS, &kn->active);
		kernfs_drain(kn, false);
	} else {
		/* Unhide: reactivate only if the node was activated before. */
		kn->flags &= ~KERNFS_HIDDEN;
		if (kn->flags & KERNFS_ACTIVATED)
			kernfs_activate_one(kn);
	}

	up_write(&root->kernfs_rwsem);
}

/*
 * This function enables VFS to send fsnotify events for deletions.
 * There is gap in this implementation for certain file removals due their
 * unique nature in kernfs. Directory removals that trigger file removals occur
 * through vfs_rmdir, which shrinks the dcache and emits fsnotify events after
 * the rmdir operation; there is no issue here. However kernfs writes to
 * particular files (e.g. cgroup.subtree_control) can also cause file removal,
 * but vfs_write does not attempt to emit fsnotify events after the write
 * operation, even if i_nlink counts are 0. As a usecase for monitoring this
 * category of file removals is not known, they are left without having
 * IN_DELETE or IN_DELETE_SELF events generated.
* Fanotify recursive monitoring also does not work for kernfs nodes that do not
 * have inodes attached, as they are created on-demand in kernfs.
 */
static void kernfs_clear_inode_nlink(struct kernfs_node *kn)
{
	struct kernfs_root *root = kernfs_root(kn);
	struct kernfs_super_info *info;

	lockdep_assert_held_read(&root->kernfs_supers_rwsem);

	/* Look @kn's inode number up in every superblock listed on the root. */
	list_for_each_entry(info, &root->supers, node) {
		struct inode *inode = ilookup(info->sb, kernfs_ino(kn));

		/* only an inode that the lookup actually found is touched */
		if (inode) {
			clear_nlink(inode);
			iput(inode);
		}
	}
}

/*
 * Remove @kn together with every node below it.
 *
 * Lockdep asserts that the caller holds kernfs_supers_rwsem for reading and
 * kernfs_rwsem for writing.  A NULL @kn, or a @kn that has a parent but is no
 * longer linked into an rbtree, is a no-op.
 *
 * The work is done in two passes: first every node of the subtree is flagged
 * KERNFS_REMOVING and deactivated, then the subtree is drained and unlinked
 * one leftmost descendant at a time until @kn itself has been processed.
 */
static void __kernfs_remove(struct kernfs_node *kn)
{
	struct kernfs_node *pos, *parent;

	/* Short-circuit if non-root @kn has already finished removal. */
	if (!kn)
		return;

	lockdep_assert_held_read(&kernfs_root(kn)->kernfs_supers_rwsem);
	lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);

	/*
	 * This is for kernfs_remove_self() which plays with active ref
	 * after removal.
	 */
	if (kernfs_parent(kn) && RB_EMPTY_NODE(&kn->rb))
		return;

	pr_debug("kernfs %s: removing\n", kernfs_rcu_name(kn));

	/* prevent new usage by marking all nodes removing and deactivating */
	down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);

	pos = NULL;
	while ((pos = kernfs_next_descendant_post(pos, kn))) {
		pos->flags |= KERNFS_REMOVING;
		/* add the bias only once; already-inactive nodes are skipped */
		if (kernfs_active(pos))
			atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
	}

	up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);

	/* deactivate and unlink the subtree node-by-node */
	do {
		pos = kernfs_leftmost_descendant(kn);

		/*
		 * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's
		 * base ref could have been put by someone else by the time
		 * the function returns.  Make sure it doesn't go away
		 * underneath us.
		 */
		kernfs_get(pos);

		kernfs_drain(pos, true);
		parent = kernfs_parent(pos);
		/*
		 * kernfs_unlink_sibling() succeeds once per node.  Use it
		 * to decide who's responsible for cleanups.
		 */
		if (!parent || kernfs_unlink_sibling(pos)) {
			struct kernfs_iattrs *ps_iattr =
				parent ? parent->iattr : NULL;

			down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);

			kernfs_clear_inode_nlink(pos);

			/* update timestamps on the parent */
			if (ps_iattr) {
				ktime_get_real_ts64(&ps_iattr->ia_ctime);
				ps_iattr->ia_mtime = ps_iattr->ia_ctime;
			}

			up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
			/* presumably drops the node's base reference */
			kernfs_put(pos);
		}

		/* pairs with the kernfs_get() taken before draining */
		kernfs_put(pos);
	} while (pos != kn);
}

/**
 * kernfs_remove - remove a kernfs_node recursively
 * @kn: the kernfs_node to remove
 *
 * Remove @kn along with all its subdirectories and files.  A NULL @kn is
 * ignored.
 */
void kernfs_remove(struct kernfs_node *kn)
{
	struct kernfs_root *root;

	if (!kn)
		return;

	root = kernfs_root(kn);

	/* take both locks that __kernfs_remove() asserts, supers first */
	down_read(&root->kernfs_supers_rwsem);
	down_write(&root->kernfs_rwsem);
	__kernfs_remove(kn);
	up_write(&root->kernfs_rwsem);
	up_read(&root->kernfs_supers_rwsem);
}

/**
 * kernfs_break_active_protection - break out of active protection
 * @kn: the self kernfs_node
 *
 * The caller must be running off of a kernfs operation which is invoked
 * with an active reference - e.g. one of kernfs_ops.  Each invocation of
 * this function must also be matched with an invocation of
 * kernfs_unbreak_active_protection().
 *
 * This function releases the active reference of @kn the caller is
 * holding.  Once this function is called, @kn may be removed at any point
 * and the caller is solely responsible for ensuring that the objects it
 * dereferences are accessible.
 */
void kernfs_break_active_protection(struct kernfs_node *kn)
{
	/*
	 * Take ourselves out of the active ref dependency chain.  If
	 * we're called without an active ref, lockdep will complain.
	 */
	kernfs_put_active(kn);
}

/**
 * kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
 * @kn: the self kernfs_node
 *
 * If kernfs_break_active_protection() was called, this function must be
 * invoked before finishing the kernfs operation.
Note that while this * function restores the active reference, it doesn't and can't actually * restore the active protection - @kn may already or be in the process of * being drained and removed. Once kernfs_break_active_protection() is * invoked, that protection is irreversibly gone for the kernfs operation * instance. * * While this function may be called at any point after * kernfs_break_active_protection() is invoked, its most useful location * would be right before the enclosing kernfs operation returns. */ void kernfs_unbreak_active_protection(struct kernfs_node *kn) { /* * @kn->active could be in any state; however, the increment we do * here will be undone as soon as the enclosing kernfs operation * finishes and this temporary bump can't break anything. If @kn * is alive, nothing changes. If @kn is being deactivated, the * soon-to-follow put will either finish deactivation or restore * deactivated state. If @kn is already removed, the temporary * bump is guaranteed to be gone before @kn is released. */ atomic_inc(&kn->active); if (kernfs_lockdep(kn)) rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); } /** * kernfs_remove_self - remove a kernfs_node from its own method * @kn: the self kernfs_node to remove * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. This can be used to * implement a file operation which deletes itself. * * For example, the "delete" file for a sysfs device directory can be * implemented by invoking kernfs_remove_self() on the "delete" file * itself. This function breaks the circular dependency of trying to * deactivate self while holding an active ref itself. It isn't necessary * to modify the usual removal path to use kernfs_remove_self(). The * "delete" implementation can simply invoke kernfs_remove_self() on self * before proceeding with the usual removal path. kernfs will ignore later * kernfs_remove() on self. 
*
 * kernfs_remove_self() can be called multiple times concurrently on the
 * same kernfs_node.  Only the first one actually performs removal and
 * returns %true.  All others will wait until the kernfs operation which
 * won self-removal finishes and return %false.  Note that the losers wait
 * for the completion of not only the winning kernfs_remove_self() but also
 * the whole kernfs_ops which won the arbitration.  This can be used to
 * guarantee, for example, all concurrent writes to a "delete" file to
 * finish only after the whole operation is complete.
 *
 * Return: %true if @kn is removed by this call, otherwise %false.
 */
bool kernfs_remove_self(struct kernfs_node *kn)
{
	bool ret;
	struct kernfs_root *root = kernfs_root(kn);

	down_read(&root->kernfs_supers_rwsem);
	down_write(&root->kernfs_rwsem);
	/* give up our own active ref before removal is attempted */
	kernfs_break_active_protection(kn);

	/*
	 * SUICIDAL is used to arbitrate among competing invocations.  Only
	 * the first one will actually perform removal.  When the removal
	 * is complete, SUICIDED is set and the active ref is restored
	 * while kernfs_rwsem is held exclusive.  The ones which lost
	 * arbitration wait for SUICIDED && drained which can happen only
	 * after the enclosing kernfs operation which executed the winning
	 * instance of kernfs_remove_self() finished.
	 */
	if (!(kn->flags & KERNFS_SUICIDAL)) {
		/* winner: claim the removal, perform it, then publish completion */
		kn->flags |= KERNFS_SUICIDAL;
		__kernfs_remove(kn);
		kn->flags |= KERNFS_SUICIDED;
		ret = true;
	} else {
		wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
		DEFINE_WAIT(wait);

		while (true) {
			/* queue ourselves before testing the condition */
			prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);

			/* done once the winner finished and only the bias remains */
			if ((kn->flags & KERNFS_SUICIDED) &&
			    atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
				break;

			/*
			 * Sleep with both locks released, then retake them in
			 * the same order as on entry before re-checking.
			 */
			up_write(&root->kernfs_rwsem);
			up_read(&root->kernfs_supers_rwsem);
			schedule();
			down_read(&root->kernfs_supers_rwsem);
			down_write(&root->kernfs_rwsem);
		}
		finish_wait(waitq, &wait);
		/* by now the winner is expected to have unlinked @kn */
		WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
		ret = false;
	}

	/*
	 * This must be done while kernfs_rwsem held exclusive; otherwise,
	 * waiting for SUICIDED && deactivated could finish prematurely.
	 */
	kernfs_unbreak_active_protection(kn);

	up_write(&root->kernfs_rwsem);
	up_read(&root->kernfs_supers_rwsem);
	return ret;
}

/**
 * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
 * @parent: parent of the target
 * @name: name of the kernfs_node to remove
 * @ns: namespace tag of the kernfs_node to remove
 *
 * Look for the kernfs_node with @name and @ns under @parent and remove it.
 *
 * Return: %0 on success, -ENOENT if such entry doesn't exist.
*/
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
			     const struct ns_common *ns)
{
	struct kernfs_node *kn;
	struct kernfs_root *root;

	/* a missing parent is reported loudly and treated as "not found" */
	if (!parent) {
		WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
			name);
		return -ENOENT;
	}

	root = kernfs_root(parent);
	down_read(&root->kernfs_supers_rwsem);
	down_write(&root->kernfs_rwsem);

	kn = kernfs_find_ns(parent, name, ns);
	if (kn) {
		/* hold our own reference across the removal */
		kernfs_get(kn);
		__kernfs_remove(kn);
		kernfs_put(kn);
	}

	up_write(&root->kernfs_rwsem);
	up_read(&root->kernfs_supers_rwsem);

	/* @kn is only compared against NULL here, never dereferenced */
	if (kn)
		return 0;
	else
		return -ENOENT;
}

/**
 * kernfs_rename_ns - move and rename a kernfs_node
 * @kn: target node
 * @new_parent: new parent to put @kn under
 * @new_name: new name; %NULL keeps the current name
 * @new_ns: new namespace tag
 *
 * Return: %0 on success, -errno on failure.  The visible failure codes are
 * -EINVAL (@kn is the root, or the parent would change on a
 * KERNFS_ROOT_INVARIANT_PARENT root), -ENOENT (@kn or @new_parent is not
 * active, or @new_parent is flagged KERNFS_EMPTY_DIR), -EEXIST (the target
 * name is already taken) and -ENOMEM (the new name could not be duplicated).
 */
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
		     const char *new_name, const struct ns_common *new_ns)
{
	struct kernfs_node *old_parent;
	struct kernfs_root *root;
	const char *old_name;
	int error;

	/* can't move or rename root */
	if (!rcu_access_pointer(kn->__parent))
		return -EINVAL;

	root = kernfs_root(kn);
	down_write(&root->kernfs_rwsem);

	error = -ENOENT;
	if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
	    (new_parent->flags & KERNFS_EMPTY_DIR))
		goto out;

	old_parent = kernfs_parent(kn);
	if (root->flags & KERNFS_ROOT_INVARIANT_PARENT) {
		error = -EINVAL;
		if (WARN_ON_ONCE(old_parent != new_parent))
			goto out;
	}

	error = 0;
	old_name = kernfs_rcu_name(kn);
	if (!new_name)
		new_name = old_name;
	if ((old_parent == new_parent) &&
	    (kernfs_ns_id(kn->ns) == kernfs_ns_id(new_ns)) &&
	    (strcmp(old_name, new_name) == 0))
		goto out;	/* nothing to rename */

	error = -EEXIST;
	if (kernfs_find_ns(new_parent, new_name, new_ns))
		goto out;

	/* rename kernfs_node */
	if (strcmp(old_name, new_name) != 0) {
		error = -ENOMEM;
		new_name = kstrdup_const(new_name, GFP_KERNEL);
		if (!new_name)
			goto out;
	} else {
		/* from here on a NULL @new_name means "keep the old name" */
		new_name = NULL;
	}

	/*
	 * Move to the appropriate place in the appropriate directories rbtree.
	 */
	kernfs_unlink_sibling(kn);

	/* rename_lock protects ->parent accessors */
	if (old_parent != new_parent) {
		kernfs_get(new_parent);
		write_lock_irq(&root->kernfs_rename_lock);

		rcu_assign_pointer(kn->__parent, new_parent);

		kn->ns = new_ns;
		if (new_name)
			rcu_assign_pointer(kn->name, new_name);

		write_unlock_irq(&root->kernfs_rename_lock);
		/* the reference on the previous parent is no longer needed */
		kernfs_put(old_parent);
	} else {
		/* name assignment is RCU protected, parent is the same */
		kn->ns = new_ns;
		if (new_name)
			rcu_assign_pointer(kn->name, new_name);
	}

	/* recompute the hash from the effective name before re-linking */
	kn->hash = kernfs_name_hash(new_name ?: old_name, kn->ns);
	kernfs_link_sibling(kn);

	/* a replaced name is freed unless it lives in kernel rodata */
	if (new_name && !is_kernel_rodata((unsigned long)old_name))
		kfree_rcu_mightsleep(old_name);

	error = 0;
out:
	up_write(&root->kernfs_rwsem);
	return error;
}

/* Drop the node reference that readdir may have parked in ->private_data. */
static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
{
	kernfs_put(filp->private_data);
	return 0;
}

/*
 * Find the directory entry at which iteration should continue.
 *
 * A non-NULL @pos is reused only if it is still active, still a child of
 * @parent and its hash equals @hash; the reference passed in with @pos is
 * dropped either way.  Otherwise, for @hash strictly between 1 and INT_MAX,
 * @parent's children rbtree is searched by hash and then namespace id; if no
 * exact match exists, @pos is left at the last node the search visited.
 * Finally, entries that are inactive or in another namespace are skipped.
 *
 * Returns the entry found, or NULL if none is left.
 */
static struct kernfs_node *kernfs_dir_pos(const struct ns_common *ns,
	struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
{
	if (pos) {
		int valid = kernfs_active(pos) &&
			rcu_access_pointer(pos->__parent) == parent &&
			hash == pos->hash;
		kernfs_put(pos);
		if (!valid)
			pos = NULL;
	}
	if (!pos && (hash > 1) && (hash < INT_MAX)) {
		struct rb_node *node = parent->dir.children.rb_node;
		u64 ns_id = kernfs_ns_id(ns);
		while (node) {
			pos = rb_to_kn(node);

			if (hash < pos->hash)
				node = node->rb_left;
			else if (hash > pos->hash)
				node = node->rb_right;
			else if (ns_id < kernfs_ns_id(pos->ns))
				node = node->rb_left;
			else if (ns_id > kernfs_ns_id(pos->ns))
				node = node->rb_right;
			else
				break;
		}
	}
	/* Skip over entries which are dying/dead or in the wrong namespace */
	while (pos && (!kernfs_active(pos) ||
		       kernfs_ns_id(pos->ns) != kernfs_ns_id(ns))) {
		struct rb_node *node = rb_next(&pos->rb);
		if (!node)
			pos = NULL;
		else
			pos = rb_to_kn(node);
	}
	return pos;
}

/*
 * Advance to the entry after the one kernfs_dir_pos() resolves for @pos,
 * again skipping inactive entries and entries of another namespace.  Note
 * that @ino is forwarded as the hash argument of kernfs_dir_pos().
 */
static struct kernfs_node *kernfs_dir_next_pos(const struct ns_common *ns,
	struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
{
	pos = kernfs_dir_pos(ns, parent, ino, pos);
	if (pos) {
		do {
			struct rb_node *node = rb_next(&pos->rb);
			if (!node)
				pos = NULL;
			else
				pos = rb_to_kn(node);
		} while (pos && (!kernfs_active(pos) ||
				 kernfs_ns_id(pos->ns) != kernfs_ns_id(ns)));
	}
	return pos;
}

/*
 * ->iterate_shared handler: emit the dot entries, then walk @parent's
 * children under kernfs_rwsem held for reading.  Always returns 0.
 */
static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
{
	struct dentry *dentry = file->f_path.dentry;
	struct kernfs_node *parent = kernfs_dentry_node(dentry);
	struct kernfs_node *pos = file->private_data;
	struct kernfs_root *root;
	const struct ns_common *ns = NULL;

	if (!dir_emit_dots(file, ctx))
		return 0;

	root = kernfs_root(parent);
	down_read(&root->kernfs_rwsem);

	/* the namespace tag is only taken from the superblock when enabled */
	if (kernfs_ns_enabled(parent))
		ns = kernfs_info(dentry->d_sb)->ns;

	for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
	     pos;
	     pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
		const char *name = kernfs_rcu_name(pos);
		unsigned int type = fs_umode_to_dtype(pos->mode);
		int len = strlen(name);
		ino_t ino = kernfs_ino(pos);

		/*
		 * Remember the current entry both as an offset (its hash) and
		 * as a referenced pointer.  That reference is put again by
		 * kernfs_dir_pos() on the next step or by the release handler.
		 */
		ctx->pos = pos->hash;
		file->private_data = pos;
		kernfs_get(pos);

		/* emit failed: stop here, the cursor stays saved for next time */
		if (!dir_emit(ctx, name, len, ino, type)) {
			up_read(&root->kernfs_rwsem);
			return 0;
		}
	}
	up_read(&root->kernfs_rwsem);
	/* end of directory: clear the cursor and park the offset at INT_MAX */
	file->private_data = NULL;
	ctx->pos = INT_MAX;
	return 0;
}

/* File operations installed for kernfs directories. */
const struct file_operations kernfs_dir_fops = {
	.read		= generic_read_dir,
	.iterate_shared	= kernfs_fop_readdir,
	.release	= kernfs_dir_fop_release,
	.llseek		= generic_file_llseek,
};
47 43 43 3 3 4 4 7 7 11 11 6 2 4 1 4 1 4 10 3 5 3 5 2 7 4 4 1 3 2 2 5 4 4 3 8 8 8 3 5 5 4 4 4 4 2 2 4 4 4 4 30 29 38 91 82 29 56 51 12 24 24 63 63 90 91 64 1 26 1 89 68 8 61 60 61 56 5 59 6 5 56 7 69 10 59 45 139 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 
474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 
974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 
1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 // SPDX-License-Identifier: GPL-2.0-or-later /* * Support for AES-NI and VAES instructions. This file contains glue code. * The real AES implementations are in aesni-intel_asm.S and other .S files. * * Copyright (C) 2008, Intel Corp. * Author: Huang Ying <ying.huang@intel.com> * * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD * interface for 64-bit kernels. 
* Authors: Adrian Hoban <adrian.hoban@intel.com>
 * Gabriele Paoloni <gabriele.paoloni@intel.com>
 * Tadeusz Struk (tadeusz.struk@intel.com)
 * Aidan O'Mahony (aidan.o.mahony@intel.com)
 * Copyright (c) 2010, Intel Corporation.
 *
 * Copyright 2024 Google LLC
 */

#include <linux/hardirq.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/err.h>
#include <crypto/algapi.h>
#include <crypto/aes.h>
#include <crypto/b128ops.h>
#include <crypto/gcm.h>
#include <crypto/gf128mul.h>
#include <crypto/xts.h>
#include <asm/cpu_device_id.h>
#include <asm/simd.h>
#include <crypto/scatterwalk.h>
#include <crypto/internal/aead.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/jump_label.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/static_call.h>

/* Alignment, in bytes, that aes_align_addr() establishes for key contexts. */
#define AESNI_ALIGN 16
#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN)))
/* Masks a byte count down to a whole number of AES blocks. */
#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1))
/* Extra context bytes reserved so the context can be aligned by hand. */
#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)

/* XTS context: one expanded key for the tweak, one for the data. */
struct aesni_xts_ctx {
	struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR;
	struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR;
};

/*
 * Return @addr unchanged when the tfm context alignment is already at least
 * AESNI_ALIGN, otherwise @addr aligned with PTR_ALIGN() to AESNI_ALIGN.
 */
static inline void *aes_align_addr(void *addr)
{
	if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN)
		return addr;
	return PTR_ALIGN(addr, AESNI_ALIGN);
}

/*
 * Routines declared here but not defined in this file; per the file header
 * the real AES implementations live in aesni-intel_asm.S and other .S files.
 */
asmlinkage void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
			      unsigned int key_len);
asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in);
asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
			      const u8 *in, unsigned int len);
asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
			      const u8 *in, unsigned int len);
asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
			      const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
			      const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
				  const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
				  const u8 *in, unsigned int len, u8 *iv);

asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out,
			      const u8 *in, unsigned int len, u8 *iv);

asmlinkage void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *out,
			      const u8 *in, unsigned int len, u8 *iv);

#ifdef CONFIG_X86_64
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
			      const u8 *in, unsigned int len, u8 *iv);
#endif

/* Aligned view of a raw context buffer as a struct crypto_aes_ctx. */
static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
{
	return aes_align_addr(raw_ctx);
}

/* Aligned view of @tfm's context as a struct aesni_xts_ctx. */
static inline struct aesni_xts_ctx *aes_xts_ctx(struct crypto_skcipher *tfm)
{
	return aes_align_addr(crypto_skcipher_ctx(tfm));
}

/*
 * Expand @in_key into @ctx.
 *
 * When SIMD is not usable the work is delegated to aes_expandkey() and its
 * result is returned.  Otherwise the key length is validated first and
 * aesni_set_key() runs inside a kernel_fpu_begin()/kernel_fpu_end() section.
 *
 * Returns 0 on success or the error from aes_expandkey()/aes_check_keylen().
 */
static int aes_set_key_common(struct crypto_aes_ctx *ctx,
			      const u8 *in_key, unsigned int key_len)
{
	int err;

	if (!crypto_simd_usable())
		return aes_expandkey(ctx, in_key, key_len);

	err = aes_check_keylen(key_len);
	if (err)
		return err;

	kernel_fpu_begin();
	aesni_set_key(ctx, in_key, key_len);
	kernel_fpu_end();
	return 0;
}

/* skcipher ->setkey handler: expand @key into the tfm's aligned context. */
static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
				 unsigned int len)
{
	return aes_set_key_common(aes_ctx(crypto_skcipher_ctx(tfm)), key, len);
}

/*
 * ECB encryption.  Each walk step processes the whole blocks it contains and
 * reports the remainder (nbytes modulo the block size) to
 * skcipher_walk_done().  Returns the status of the last walk call.
 */
static int ecb_encrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
	struct skcipher_walk walk;
	unsigned int nbytes;
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	while ((nbytes = walk.nbytes)) {
		kernel_fpu_begin();
		aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
			      nbytes & AES_BLOCK_MASK);
		kernel_fpu_end();
		nbytes &= AES_BLOCK_SIZE - 1;
		err = skcipher_walk_done(&walk, nbytes);
	}

	return err;
}

/* ECB decryption; same walk structure as ecb_encrypt(). */
static int ecb_decrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
	struct skcipher_walk walk;
	unsigned int nbytes;
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	while ((nbytes = walk.nbytes)) {
		kernel_fpu_begin();
		aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
			      nbytes & AES_BLOCK_MASK);
		kernel_fpu_end();
		nbytes &= AES_BLOCK_SIZE - 1;
		err = skcipher_walk_done(&walk, nbytes);
	}

	return err;
}

/*
 * CBC encryption.  Same walk structure as the ECB handlers, with walk.iv
 * additionally passed to the assembly routine on every step.
 */
static int cbc_encrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
	struct skcipher_walk walk;
	unsigned int nbytes;
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	while ((nbytes = walk.nbytes)) {
		kernel_fpu_begin();
		aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
			      nbytes & AES_BLOCK_MASK, walk.iv);
		kernel_fpu_end();
		nbytes &= AES_BLOCK_SIZE - 1;
		err = skcipher_walk_done(&walk, nbytes);
	}

	return err;
}

/* CBC decryption; same walk structure as cbc_encrypt(). */
static int cbc_decrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
	struct skcipher_walk walk;
	unsigned int nbytes;
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	while ((nbytes = walk.nbytes)) {
		kernel_fpu_begin();
		aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
			      nbytes & AES_BLOCK_MASK, walk.iv);
		kernel_fpu_end();
		nbytes &= AES_BLOCK_SIZE - 1;
		err = skcipher_walk_done(&walk, nbytes);
	}

	return err;
}

/*
 * CBC encryption with ciphertext stealing.
 *
 * Requests shorter than one block fail with -EINVAL and a request of exactly
 * one block is handled as plain CBC.  For longer requests everything except
 * the tail (the last full block plus whatever follows it) goes through
 * cbc_encrypt() via an on-stack subrequest, and the tail is then handed to
 * aesni_cts_cbc_enc() in a single call.
 */
static int cts_cbc_encrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
	/* number of leading blocks that take the plain CBC path */
	int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
	struct scatterlist *src = req->src, *dst = req->dst;
	struct scatterlist sg_src[2], sg_dst[2];
	struct skcipher_request subreq;
	struct skcipher_walk walk;
	int err;

	skcipher_request_set_tfm(&subreq, tfm);
	skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
				      NULL, NULL);

	if (req->cryptlen <= AES_BLOCK_SIZE) {
		if (req->cryptlen < AES_BLOCK_SIZE)
			return -EINVAL;
		cbc_blocks = 1;
	}

	if (cbc_blocks > 0) {
		skcipher_request_set_crypt(&subreq, req->src, req->dst,
					   cbc_blocks * AES_BLOCK_SIZE,
					   req->iv);

		err = cbc_encrypt(&subreq);
		if (err)
			return err;

		/* a single block needs no stealing */
		if (req->cryptlen == AES_BLOCK_SIZE)
			return 0;

		/* skip the part that was just processed */
		dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
		if (req->dst != req->src)
			dst = scatterwalk_ffwd(sg_dst, req->dst,
					       subreq.cryptlen);
	}

	/* handle ciphertext stealing */
	skcipher_request_set_crypt(&subreq, src, dst,
				   req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
				   req->iv);

	err = skcipher_walk_virt(&walk, &subreq, false);
	if (err)
		return err;

	kernel_fpu_begin();
	aesni_cts_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
			  walk.nbytes, walk.iv);
	kernel_fpu_end();

	return skcipher_walk_done(&walk, 0);
}

/*
 * CBC decryption with ciphertext stealing; mirrors cts_cbc_encrypt() using
 * cbc_decrypt() and aesni_cts_cbc_dec().
 */
static int cts_cbc_decrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
	/* number of leading blocks that take the plain CBC path */
	int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
	struct scatterlist *src = req->src, *dst = req->dst;
	struct scatterlist sg_src[2], sg_dst[2];
	struct skcipher_request subreq;
	struct skcipher_walk walk;
	int err;

	skcipher_request_set_tfm(&subreq, tfm);
	skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
				      NULL, NULL);

	if (req->cryptlen <= AES_BLOCK_SIZE) {
		if (req->cryptlen < AES_BLOCK_SIZE)
			return -EINVAL;
		cbc_blocks = 1;
	}

	if (cbc_blocks > 0) {
		skcipher_request_set_crypt(&subreq, req->src, req->dst,
					   cbc_blocks * AES_BLOCK_SIZE,
					   req->iv);

		err = cbc_decrypt(&subreq);
		if (err)
			return err;

		/* a single block needs no stealing */
		if (req->cryptlen == AES_BLOCK_SIZE)
			return 0;

		/* skip the part that was just processed */
		dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
		if (req->dst != req->src)
			dst = scatterwalk_ffwd(sg_dst, req->dst,
					       subreq.cryptlen);
	}

	/* handle ciphertext stealing */
	skcipher_request_set_crypt(&subreq, src, dst,
				   req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
				   req->iv);

	err = skcipher_walk_virt(&walk, &subreq, false);
	if (err)
		return err;

	kernel_fpu_begin();
	aesni_cts_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
			  walk.nbytes, walk.iv);
	kernel_fpu_end();

	return skcipher_walk_done(&walk, 0);
}

#ifdef CONFIG_X86_64
/* This is the non-AVX version. */
static int ctr_crypt_aesni(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
	u8 keystream[AES_BLOCK_SIZE];
	struct skcipher_walk walk;
	unsigned int nbytes;
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	while ((nbytes = walk.nbytes) > 0) {
		kernel_fpu_begin();
		/* whole blocks of this step go to the assembly routine */
		if (nbytes & AES_BLOCK_MASK)
			aesni_ctr_enc(ctx, walk.dst.virt.addr,
				      walk.src.virt.addr,
				      nbytes & AES_BLOCK_MASK, walk.iv);
		/* what remains is the sub-block tail of this step */
		nbytes &= ~AES_BLOCK_MASK;

		/*
		 * On the final step a partial block is handled here: encrypt
		 * the counter into a keystream block, XOR only the remaining
		 * bytes, and advance the counter.
		 */
		if (walk.nbytes == walk.total && nbytes > 0) {
			aesni_enc(ctx, keystream, walk.iv);
			crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - nbytes,
				       walk.src.virt.addr + walk.nbytes - nbytes,
				       keystream, nbytes);
			crypto_inc(walk.iv, AES_BLOCK_SIZE);
			nbytes = 0;
		}
		kernel_fpu_end();
		err = skcipher_walk_done(&walk, nbytes);
	}
	return err;
}
#endif

/*
 * XTS ->setkey handler: verify the combined key, then expand its two halves
 * separately.  Returns 0 or the first error encountered.
 */
static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
			    unsigned int keylen)
{
	struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
	int err;

	err = xts_verify_key(tfm, key, keylen);
	if (err)
		return err;

	keylen /= 2;

	/* first half of xts-key is for crypt */
	err = aes_set_key_common(&ctx->crypt_ctx, key, keylen);
	if (err)
		return err;

	/* second half of xts-key is for tweak */
	return aes_set_key_common(&ctx->tweak_ctx, key + keylen, keylen);
}

/* Callback that encrypts the IV in place with the tweak key. */
typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
				    u8 iv[AES_BLOCK_SIZE]);
/* Callback that en/decrypts @len bytes from @src to @dst using @tweak. */
typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
			       const u8 *src, u8 *dst, int len,
			       u8 tweak[AES_BLOCK_SIZE]);

/* This handles cases where the source and/or destination span pages.
*/
/*
 * @req:        the request; req->iv is passed to @crypt_func as the tweak
 * @crypt_func: routine called on each chunk of the walk
 *
 * Walks the request with skcipher_walk_virt() and calls @crypt_func on the
 * whole-block part of every step, each call in its own kernel FPU section.
 * Returns 0 or the error from the walk functions.
 */
static noinline int
xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
	int tail = req->cryptlen % AES_BLOCK_SIZE;
	struct scatterlist sg_src[2], sg_dst[2];
	struct skcipher_request subreq;
	struct skcipher_walk walk;
	struct scatterlist *src, *dst;
	int err;

	/*
	 * If the message length isn't divisible by the AES block size, then
	 * separate off the last full block and the partial block.  This ensures
	 * that they are processed in the same call to the assembly function,
	 * which is required for ciphertext stealing.
	 */
	if (tail) {
		skcipher_request_set_tfm(&subreq, tfm);
		skcipher_request_set_callback(&subreq,
					      skcipher_request_flags(req),
					      NULL, NULL);
		skcipher_request_set_crypt(&subreq, req->src, req->dst,
					   req->cryptlen - tail - AES_BLOCK_SIZE,
					   req->iv);
		/* From here on, req refers to the shortened on-stack subrequest. */
		req = &subreq;
	}
	err = skcipher_walk_virt(&walk, req, false);

	while (walk.nbytes) {
		kernel_fpu_begin();
		(*crypt_func)(&ctx->crypt_ctx,
			      walk.src.virt.addr, walk.dst.virt.addr,
			      walk.nbytes & ~(AES_BLOCK_SIZE - 1), req->iv);
		kernel_fpu_end();
		/* Hand the sub-block remainder of this step back to the walk. */
		err = skcipher_walk_done(&walk,
					 walk.nbytes & (AES_BLOCK_SIZE - 1));
	}

	if (err || !tail)
		return err;

	/* Do ciphertext stealing with the last full block and partial block. */

	/* Skip past the req->cryptlen bytes that the loop above handled. */
	dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
	if (req->dst != req->src)
		dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);

	skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
				   req->iv);

	err = skcipher_walk_virt(&walk, req, false);
	if (err)
		return err;

	kernel_fpu_begin();
	(*crypt_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr,
		      walk.nbytes, req->iv);
	kernel_fpu_end();

	return skcipher_walk_done(&walk, 0);
}

/*
 * Common XTS entry point.
 *
 * @req:        the request; must be at least AES_BLOCK_SIZE bytes long,
 *              otherwise -EINVAL is returned
 * @encrypt_iv: called once on req->iv with the tweak key
 * @crypt_func: called on the data with the crypt key and req->iv as the tweak
 *
 * Returns 0 from the fast path, otherwise the result of
 * xts_crypt_slowpath().
 */
/* __always_inline to avoid indirect call in fastpath */
static __always_inline int
xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv,
	  xts_crypt_func crypt_func)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);

	if (unlikely(req->cryptlen < AES_BLOCK_SIZE))
		return -EINVAL;

	kernel_fpu_begin();
	(*encrypt_iv)(&ctx->tweak_ctx, req->iv);

	/*
	 * In practice, virtually all XTS plaintexts and ciphertexts are either
	 * 512 or 4096 bytes and do not use multiple scatterlist elements.  To
	 * optimize the performance of these cases, the below fast-path handles
	 * single-scatterlist-element messages as efficiently as possible.  The
	 * code is 64-bit specific, as it assumes no page mapping is needed.
	 */
	if (IS_ENABLED(CONFIG_X86_64) &&
	    likely(req->src->length >= req->cryptlen &&
		   req->dst->length >= req->cryptlen)) {
		(*crypt_func)(&ctx->crypt_ctx, sg_virt(req->src),
			      sg_virt(req->dst), req->cryptlen, req->iv);
		kernel_fpu_end();
		return 0;
	}
	/* The slow path opens its own FPU sections, so close this one first. */
	kernel_fpu_end();
	return xts_crypt_slowpath(req, crypt_func);
}

/* xts_encrypt_iv_func adapter: encrypt @iv in place with aesni_enc(). */
static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
				 u8 iv[AES_BLOCK_SIZE])
{
	aesni_enc(tweak_key, iv, iv);
}

/*
 * xts_crypt_func adapter for aesni_xts_enc().  Note the argument order:
 * the callback takes (src, dst) while aesni_xts_enc() is passed (dst, src).
 */
static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
			      const u8 *src, u8 *dst, int len,
			      u8 tweak[AES_BLOCK_SIZE])
{
	aesni_xts_enc(key, dst, src, len, tweak);
}

/*
 * xts_crypt_func adapter for aesni_xts_dec().  Note the argument order:
 * the callback takes (src, dst) while aesni_xts_dec() is passed (dst, src).
 */
static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
			      const u8 *src, u8 *dst, int len,
			      u8 tweak[AES_BLOCK_SIZE])
{
	aesni_xts_dec(key, dst, src, len, tweak);
}

/* .encrypt handler of "xts-aes-aesni". */
static int xts_encrypt_aesni(struct skcipher_request *req)
{
	return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_encrypt);
}

/* .decrypt handler of "xts-aes-aesni"; the tweak is still *encrypted*. */
static int xts_decrypt_aesni(struct skcipher_request *req)
{
	return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_decrypt);
}

/*
 * The skcipher algorithms registered by aesni_init().  The "ctr(aes)" entry
 * is only present when CONFIG_X86_64 is set.
 */
static struct skcipher_alg aesni_skciphers[] = {
	{
		.base = {
			.cra_name		= "ecb(aes)",
			.cra_driver_name	= "ecb-aes-aesni",
			.cra_priority		= 400,
			.cra_blocksize		= AES_BLOCK_SIZE,
			.cra_ctxsize		= CRYPTO_AES_CTX_SIZE,
			.cra_module		= THIS_MODULE,
		},
		.min_keysize	= AES_MIN_KEY_SIZE,
		.max_keysize	= AES_MAX_KEY_SIZE,
		.setkey		= aesni_skcipher_setkey,
		.encrypt	= ecb_encrypt,
		.decrypt	= ecb_decrypt,
	}, {
		.base = {
			.cra_name		= "cbc(aes)",
			.cra_driver_name	= "cbc-aes-aesni",
			.cra_priority		= 400,
			.cra_blocksize		= AES_BLOCK_SIZE,
			.cra_ctxsize		= CRYPTO_AES_CTX_SIZE,
			.cra_module		= THIS_MODULE,
		},
		.min_keysize	= AES_MIN_KEY_SIZE,
		.max_keysize	= AES_MAX_KEY_SIZE,
		.ivsize		= AES_BLOCK_SIZE,
		.setkey		= aesni_skcipher_setkey,
		.encrypt	= cbc_encrypt,
		.decrypt	= cbc_decrypt,
	}, {
		.base = {
			.cra_name		= "cts(cbc(aes))",
			.cra_driver_name	= "cts-cbc-aes-aesni",
			.cra_priority		= 400,
			.cra_blocksize		= AES_BLOCK_SIZE,
			.cra_ctxsize		= CRYPTO_AES_CTX_SIZE,
			.cra_module		= THIS_MODULE,
		},
		.min_keysize	= AES_MIN_KEY_SIZE,
		.max_keysize	= AES_MAX_KEY_SIZE,
		.ivsize		= AES_BLOCK_SIZE,
		.walksize	= 2 * AES_BLOCK_SIZE,
		.setkey		= aesni_skcipher_setkey,
		.encrypt	= cts_cbc_encrypt,
		.decrypt	= cts_cbc_decrypt,
#ifdef CONFIG_X86_64
	}, {
		.base = {
			.cra_name		= "ctr(aes)",
			.cra_driver_name	= "ctr-aes-aesni",
			.cra_priority		= 400,
			.cra_blocksize		= 1,
			.cra_ctxsize		= CRYPTO_AES_CTX_SIZE,
			.cra_module		= THIS_MODULE,
		},
		.min_keysize	= AES_MIN_KEY_SIZE,
		.max_keysize	= AES_MAX_KEY_SIZE,
		.ivsize		= AES_BLOCK_SIZE,
		.chunksize	= AES_BLOCK_SIZE,
		.setkey		= aesni_skcipher_setkey,
		.encrypt	= ctr_crypt_aesni,
		.decrypt	= ctr_crypt_aesni,
#endif
	}, {
		.base = {
			.cra_name		= "xts(aes)",
			.cra_driver_name	= "xts-aes-aesni",
			.cra_priority		= 401,
			.cra_blocksize		= AES_BLOCK_SIZE,
			.cra_ctxsize		= XTS_AES_CTX_SIZE,
			.cra_module		= THIS_MODULE,
		},
		.min_keysize	= 2 * AES_MIN_KEY_SIZE,
		.max_keysize	= 2 * AES_MAX_KEY_SIZE,
		.ivsize		= AES_BLOCK_SIZE,
		.walksize	= 2 * AES_BLOCK_SIZE,
		.setkey		= xts_setkey_aesni,
		.encrypt	= xts_encrypt_aesni,
		.decrypt	= xts_decrypt_aesni,
	}
};

#ifdef CONFIG_X86_64
/* Tweak-encryption routine used by the xts_*_##suffix handlers below. */
asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
				   u8 iv[AES_BLOCK_SIZE]);

/*
 * Common CTR handler.
 *
 * @req:        the request; req->iv holds the 128-bit big-endian counter and
 *              is updated on return
 * @ctr64_func: routine that processes @len bytes given the counter as two
 *              64-bit words in le_ctr[] (le_ctr[0] is the low word)
 *
 * ctr64 tracks the low counter word.  After nblocks has been added, an
 * unsigned wraparound is recognised by the sum being smaller than nblocks.
 *
 * Returns the status of the last walk call.
 */
/* __always_inline to avoid indirect call */
static __always_inline int
ctr_crypt(struct skcipher_request *req,
	  void (*ctr64_func)(const struct crypto_aes_ctx *key,
			     const u8 *src, u8 *dst, int len,
			     const u64 le_ctr[2]))
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm));
	unsigned int nbytes, p1_nbytes, nblocks;
	struct skcipher_walk walk;
	u64 le_ctr[2];
	u64 ctr64;
	int err;

	/* iv[8..15] is the low counter word, iv[0..7] the high one. */
	ctr64 = le_ctr[0] = get_unaligned_be64(&req->iv[8]);
	le_ctr[1] = get_unaligned_be64(&req->iv[0]);
	err = skcipher_walk_virt(&walk, req, false);

	while ((nbytes = walk.nbytes) != 0) {
		if (nbytes < walk.total) {
			/* Not the end yet, so keep the length block-aligned. */
			nbytes = round_down(nbytes, AES_BLOCK_SIZE);
			nblocks = nbytes / AES_BLOCK_SIZE;
		} else {
			/* It's the end, so include any final partial block. */
			nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE);
		}
		ctr64 += nblocks;

		kernel_fpu_begin();
		if (likely(ctr64 >= nblocks)) {
			/* The low 64 bits of the counter won't overflow. */
			(*ctr64_func)(key, walk.src.virt.addr,
				      walk.dst.virt.addr, nbytes, le_ctr);
		} else {
			/*
			 * The low 64 bits of the counter will overflow.  The
			 * assembly doesn't handle this case, so split the
			 * operation into two at the point where the overflow
			 * will occur.  After the first part, add the carry bit.
			 */
			/*
			 * ctr64 already holds the wrapped sum, so
			 * (nblocks - ctr64) is the number of blocks that fit
			 * before the low word wraps to zero.
			 */
			p1_nbytes = min(nbytes, (nblocks - ctr64) * AES_BLOCK_SIZE);
			(*ctr64_func)(key, walk.src.virt.addr,
				      walk.dst.virt.addr, p1_nbytes, le_ctr);
			le_ctr[0] = 0;
			le_ctr[1]++;
			(*ctr64_func)(key, walk.src.virt.addr + p1_nbytes,
				      walk.dst.virt.addr + p1_nbytes,
				      nbytes - p1_nbytes, le_ctr);
		}
		kernel_fpu_end();
		le_ctr[0] = ctr64;

		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
	}

	/* Store the advanced counter back into the request's IV. */
	put_unaligned_be64(ctr64, &req->iv[8]);
	put_unaligned_be64(le_ctr[1], &req->iv[0]);
	return err;
}

/*
 * Common XCTR handler.
 *
 * @req:       the request; req->iv is passed to @xctr_func unchanged
 * @xctr_func: routine that processes @len bytes given the IV and the block
 *             counter @ctr
 *
 * The block counter starts at 1 and advances by the number of blocks
 * (rounded up) handled in each step.  Non-final steps are rounded down to a
 * whole number of blocks.  Returns the status of the last walk call.
 */
/* __always_inline to avoid indirect call */
static __always_inline int
xctr_crypt(struct skcipher_request *req,
	   void (*xctr_func)(const struct crypto_aes_ctx *key,
			     const u8 *src, u8 *dst, int len,
			     const u8 iv[AES_BLOCK_SIZE], u64 ctr))
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm));
	struct skcipher_walk walk;
	unsigned int nbytes;
	u64 ctr = 1;
	int err;

	err = skcipher_walk_virt(&walk, req, false);
	while ((nbytes = walk.nbytes) != 0) {
		if (nbytes < walk.total)
			nbytes = round_down(nbytes, AES_BLOCK_SIZE);

		kernel_fpu_begin();
		(*xctr_func)(key, walk.src.virt.addr, walk.dst.virt.addr,
			     nbytes, req->iv, ctr);
		kernel_fpu_end();

		ctr += DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE);
		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
	}
	return err;
}

/*
 * For one assembly implementation named by @suffix, declare its assembly
 * entry points, define the xts/ctr/xctr request handlers that forward to
 * xts_crypt(), ctr_crypt() and xctr_crypt(), and define the array
 * skcipher_algs_<suffix>[] with "xts(aes)", "ctr(aes)" and "xctr(aes)"
 * entries whose driver names end in @driver_name_suffix and whose priority
 * is @priority.
 */
#define DEFINE_AVX_SKCIPHER_ALGS(suffix, driver_name_suffix, priority) \
	\
asmlinkage void \
aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
			 u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \
asmlinkage void \
aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
			 u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \
	\
static int xts_encrypt_##suffix(struct skcipher_request *req) \
{ \
	return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \
} \
	\
static int xts_decrypt_##suffix(struct skcipher_request *req) \
{ \
	return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix); \
} \
	\
asmlinkage void \
aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key, \
			 const u8 *src, u8 *dst, int len, const u64 le_ctr[2]);\
	\
static int ctr_crypt_##suffix(struct skcipher_request *req) \
{ \
	return ctr_crypt(req, aes_ctr64_crypt_##suffix); \
} \
	\
asmlinkage void \
aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key, \
			const u8 *src, u8 *dst, int len, \
			const u8 iv[AES_BLOCK_SIZE], u64 ctr); \
	\
static int xctr_crypt_##suffix(struct skcipher_request *req) \
{ \
	return xctr_crypt(req, aes_xctr_crypt_##suffix); \
} \
	\
static struct skcipher_alg skcipher_algs_##suffix[] = {{ \
	.base.cra_name		= "xts(aes)", \
	.base.cra_driver_name	= "xts-aes-" driver_name_suffix, \
	.base.cra_priority	= priority, \
	.base.cra_blocksize	= AES_BLOCK_SIZE, \
	.base.cra_ctxsize	= XTS_AES_CTX_SIZE, \
	.base.cra_module	= THIS_MODULE, \
	.min_keysize		= 2 * AES_MIN_KEY_SIZE, \
	.max_keysize		= 2 * AES_MAX_KEY_SIZE, \
	.ivsize			= AES_BLOCK_SIZE, \
	.walksize		= 2 * AES_BLOCK_SIZE, \
	.setkey			= xts_setkey_aesni, \
	.encrypt		= xts_encrypt_##suffix, \
	.decrypt		= xts_decrypt_##suffix, \
}, { \
	.base.cra_name		= "ctr(aes)", \
	.base.cra_driver_name	= "ctr-aes-" driver_name_suffix, \
	.base.cra_priority	= priority, \
	.base.cra_blocksize	= 1, \
	.base.cra_ctxsize	= CRYPTO_AES_CTX_SIZE, \
	.base.cra_module	= THIS_MODULE, \
	.min_keysize		= AES_MIN_KEY_SIZE, \
	.max_keysize		= AES_MAX_KEY_SIZE, \
	.ivsize			= AES_BLOCK_SIZE, \
	.chunksize		= AES_BLOCK_SIZE, \
	.setkey			= aesni_skcipher_setkey, \
	.encrypt		= ctr_crypt_##suffix, \
	.decrypt		= ctr_crypt_##suffix, \
}, { \
	.base.cra_name		= "xctr(aes)", \
	.base.cra_driver_name	= "xctr-aes-" driver_name_suffix, \
	.base.cra_priority	= priority, \
	.base.cra_blocksize	= 1, \
	.base.cra_ctxsize	= CRYPTO_AES_CTX_SIZE, \
	.base.cra_module	= THIS_MODULE, \
	.min_keysize		= AES_MIN_KEY_SIZE, \
	.max_keysize		= AES_MAX_KEY_SIZE, \
	.ivsize			= AES_BLOCK_SIZE, \
	.chunksize		= AES_BLOCK_SIZE, \
	.setkey			= aesni_skcipher_setkey, \
	.encrypt		= xctr_crypt_##suffix, \
	.decrypt		= xctr_crypt_##suffix, \
}}

DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500);
DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600);
DEFINE_AVX_SKCIPHER_ALGS(vaes_avx512, "vaes-avx512", 800);

/* The common part of the x86_64 AES-GCM key struct */
struct aes_gcm_key {
	/* Expanded AES key and the AES key length in bytes */
	struct aes_enckey aes_key;

	/* RFC4106 nonce (used only by the rfc4106 algorithms) */
	u32 rfc4106_nonce;
};

/* Key struct used by the AES-NI implementations of AES-GCM */
struct aes_gcm_key_aesni {
	/*
	 * Common part of the key.  16-byte alignment is required by the
	 * assembly code.
	 */
	struct aes_gcm_key base __aligned(16);

	/*
	 * Powers of the hash key H^8 through H^1.  These are 128-bit values.
	 * They all have an extra factor of x^-1 and are byte-reversed.  16-byte
	 * alignment is required by the assembly code.
	 */
	u64 h_powers[8][2] __aligned(16);

	/*
	 * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd
	 * together.  It's used for Karatsuba multiplication.  16-byte alignment
	 * is required by the assembly code.
	 */
	u64 h_powers_xored[8] __aligned(16);

	/*
	 * H^1 times x^64 (and also the usual extra factor of x^-1).  16-byte
	 * alignment is required by the assembly code.
	 */
	u64 h_times_x64[2] __aligned(16);
};
/* Convert a pointer to the embedded 'base' member back to the full struct. */
#define AES_GCM_KEY_AESNI(key) \
	container_of((key), struct aes_gcm_key_aesni, base)
/*
 * Context size to request for this key type: the struct itself plus slack,
 * presumably so that aes_gcm_key_get() can align the context pointer up to a
 * 16-byte boundary.
 */
#define AES_GCM_KEY_AESNI_SIZE \
	(sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1)))

/* Key struct used by the VAES + AVX2 implementation of AES-GCM */
struct aes_gcm_key_vaes_avx2 {
	/*
	 * Common part of the key.  The assembly code prefers 16-byte alignment
	 * for this.
	 */
	struct aes_gcm_key base __aligned(16);

	/*
	 * Powers of the hash key H^8 through H^1.  These are 128-bit values.
	 * They all have an extra factor of x^-1 and are byte-reversed.
	 * The assembly code prefers 32-byte alignment for this.
	 */
	u64 h_powers[8][2] __aligned(32);

	/*
	 * Each entry in this array contains the two halves of an entry of
	 * h_powers XOR'd together, in the following order:
	 * H^8,H^6,H^7,H^5,H^4,H^2,H^3,H^1 i.e. indices 0,2,1,3,4,6,5,7.
	 * This is used for Karatsuba multiplication.
	 */
	u64 h_powers_xored[8];
};

/* Convert a pointer to the embedded 'base' member back to the full struct. */
#define AES_GCM_KEY_VAES_AVX2(key) \
	container_of((key), struct aes_gcm_key_vaes_avx2, base)
/*
 * Context size to request for this key type: the struct itself plus slack,
 * presumably so that aes_gcm_key_get() can align the context pointer up to a
 * 32-byte boundary.
 */
#define AES_GCM_KEY_VAES_AVX2_SIZE \
	(sizeof(struct aes_gcm_key_vaes_avx2) + (31 & ~(CRYPTO_MINALIGN - 1)))

/* Key struct used by the VAES + AVX512 implementation of AES-GCM */
struct aes_gcm_key_vaes_avx512 {
	/*
	 * Common part of the key.  The assembly code prefers 16-byte alignment
	 * for this.
	 */
	struct aes_gcm_key base __aligned(16);

	/*
	 * Powers of the hash key H^16 through H^1.  These are 128-bit values.
	 * They all have an extra factor of x^-1 and are byte-reversed.  This
	 * array is aligned to a 64-byte boundary to make it naturally aligned
	 * for 512-bit loads, which can improve performance.  (The assembly code
	 * doesn't *need* the alignment; this is just an optimization.)
	 */
	u64 h_powers[16][2] __aligned(64);

	/* Three padding blocks required by the assembly code */
	u64 padding[3][2];
};
/* Convert a pointer to the embedded 'base' member back to the full struct. */
#define AES_GCM_KEY_VAES_AVX512(key) \
	container_of((key), struct aes_gcm_key_vaes_avx512, base)
/*
 * Context size to request for this key type: the struct itself plus slack,
 * presumably so that aes_gcm_key_get() can align the context pointer up to a
 * 64-byte boundary.
 */
#define AES_GCM_KEY_VAES_AVX512_SIZE \
	(sizeof(struct aes_gcm_key_vaes_avx512) + (63 & ~(CRYPTO_MINALIGN - 1)))

/*
 * These flags are passed to the AES-GCM helper functions to specify the
 * specific version of AES-GCM (RFC4106 or not), whether it's encryption or
 * decryption, and which assembly functions should be called.  Assembly
 * functions are selected using flags instead of function pointers to avoid
 * indirect calls (which are very expensive on x86) regardless of inlining.
 */
#define FLAG_RFC4106		BIT(0)
#define FLAG_ENC		BIT(1)
#define FLAG_AVX		BIT(2)
#define FLAG_VAES_AVX2		BIT(3)
#define FLAG_VAES_AVX512	BIT(4)

/*
 * Return the key stored in @tfm's context, with the context pointer aligned
 * up to the boundary that matches the implementation selected by @flags:
 * 64 bytes for VAES+AVX512, 32 bytes for VAES+AVX2, 16 bytes otherwise.
 */
static inline struct aes_gcm_key *
aes_gcm_key_get(struct crypto_aead *tfm, int flags)
{
	if (flags & FLAG_VAES_AVX512)
		return PTR_ALIGN(crypto_aead_ctx(tfm), 64);
	else if (flags & FLAG_VAES_AVX2)
		return PTR_ALIGN(crypto_aead_ctx(tfm), 32);
	else
		return PTR_ALIGN(crypto_aead_ctx(tfm), 16);
}

asmlinkage void
aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key);
asmlinkage void
aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key);
asmlinkage void
aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key);
asmlinkage void
aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key);

/*
 * Call the precompute assembly routine selected by @flags on the key struct
 * that contains @key.  The only caller in this file, gcm_setkey(), invokes it
 * between kernel_fpu_begin() and kernel_fpu_end().
 */
static void aes_gcm_precompute(struct aes_gcm_key *key, int flags)
{
	if (flags & FLAG_VAES_AVX512)
		aes_gcm_precompute_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key));
	else if (flags & FLAG_VAES_AVX2)
		aes_gcm_precompute_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key));
	else if (flags & FLAG_AVX)
		aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key));
	else
		aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key));
}

asmlinkage void
aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
			 u8 ghash_acc[16], const u8 *aad, int aadlen);
asmlinkage void
aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key,
			     u8 ghash_acc[16], const u8 *aad, int aadlen);
asmlinkage void
aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
			     u8 ghash_acc[16], const u8 *aad, int aadlen);
asmlinkage void
aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
			       u8 ghash_acc[16], const u8 *aad, int aadlen);

/*
 * Pass @aadlen bytes of associated data at @aad, together with @ghash_acc,
 * to the AAD-update assembly routine selected by @flags.
 */
static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16],
			       const u8 *aad, int aadlen, int flags)
{
	if (flags & FLAG_VAES_AVX512)
		aes_gcm_aad_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
					       ghash_acc, aad, aadlen);
	else if (flags & FLAG_VAES_AVX2)
		aes_gcm_aad_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
					     ghash_acc, aad, aadlen);
	else if (flags & FLAG_AVX)
		aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc,
					     aad, aadlen);
	else
		aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key), ghash_acc,
					 aad, aadlen);
}

asmlinkage void
aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key,
			 const u32 le_ctr[4], u8 ghash_acc[16],
			 const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key,
			     const u32 le_ctr[4], u8 ghash_acc[16],
			     const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_enc_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
			     const u32 le_ctr[4], u8 ghash_acc[16],
			     const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_enc_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
			       const u32 le_ctr[4], u8 ghash_acc[16],
			       const u8 *src, u8 *dst, int datalen);

asmlinkage void
aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key,
			 const u32 le_ctr[4], u8 ghash_acc[16],
			 const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key,
			     const u32 le_ctr[4], u8 ghash_acc[16],
			     const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_dec_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
			     const u32 le_ctr[4], u8 ghash_acc[16],
			     const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_dec_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
			       const u32 le_ctr[4], u8 ghash_acc[16],
			       const u8 *src, u8 *dst, int datalen);

/*
 * Process @datalen bytes from @src to @dst with the update assembly routine
 * selected by @flags: an encryption routine when FLAG_ENC is set, otherwise
 * a decryption routine.
 */
/* __always_inline to optimize out the branches based on @flags */
static __always_inline void
aes_gcm_update(const struct aes_gcm_key *key,
	       const u32 le_ctr[4], u8 ghash_acc[16],
	       const u8 *src, u8 *dst, int datalen, int flags)
{
	if (flags & FLAG_ENC) {
		if (flags & FLAG_VAES_AVX512)
			aes_gcm_enc_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
						       le_ctr, ghash_acc,
						       src, dst, datalen);
		else if (flags & FLAG_VAES_AVX2)
			aes_gcm_enc_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
						     le_ctr, ghash_acc,
						     src, dst, datalen);
		else if (flags & FLAG_AVX)
			aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key),
						     le_ctr, ghash_acc,
						     src, dst, datalen);
		else
			aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr,
						 ghash_acc, src, dst, datalen);
	} else {
		if (flags & FLAG_VAES_AVX512)
			aes_gcm_dec_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
						       le_ctr, ghash_acc,
						       src, dst, datalen);
		else if (flags & FLAG_VAES_AVX2)
			aes_gcm_dec_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
						     le_ctr, ghash_acc,
						     src, dst, datalen);
		else if (flags & FLAG_AVX)
			aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key),
						     le_ctr, ghash_acc,
						     src, dst, datalen);
		else
			aes_gcm_dec_update_aesni(AES_GCM_KEY_AESNI(key),
						 le_ctr, ghash_acc,
						 src, dst, datalen);
	}
}

asmlinkage void
aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key,
			const u32 le_ctr[4], u8 ghash_acc[16],
			u64 total_aadlen, u64 total_datalen);
asmlinkage void
aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key,
			    const u32 le_ctr[4], u8 ghash_acc[16],
			    u64 total_aadlen, u64 total_datalen);
asmlinkage void
aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
			    const u32 le_ctr[4], u8 ghash_acc[16],
			    u64 total_aadlen, u64 total_datalen);
asmlinkage void
aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
			      const u32 le_ctr[4], u8 ghash_acc[16],
			      u64 total_aadlen, u64 total_datalen);

/*
 * Call the encryption-finalization assembly routine selected by @flags.
 * gcm_crypt() copies @ghash_acc to the destination as the authentication tag
 * after this returns.
 */
/* __always_inline to optimize out the branches based on @flags */
static __always_inline void
aes_gcm_enc_final(const struct aes_gcm_key *key,
		  const u32 le_ctr[4], u8 ghash_acc[16],
		  u64 total_aadlen, u64 total_datalen, int flags)
{
	if (flags & FLAG_VAES_AVX512)
		aes_gcm_enc_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
					      le_ctr, ghash_acc,
					      total_aadlen, total_datalen);
	else if (flags & FLAG_VAES_AVX2)
		aes_gcm_enc_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
					    le_ctr, ghash_acc,
					    total_aadlen, total_datalen);
	else if (flags & FLAG_AVX)
		aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key),
					    le_ctr, ghash_acc,
					    total_aadlen, total_datalen);
	else
		aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key),
					le_ctr, ghash_acc,
					total_aadlen, total_datalen);
}

asmlinkage bool __must_check
aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key,
			const u32 le_ctr[4], const u8 ghash_acc[16],
			u64 total_aadlen, u64 total_datalen,
			const u8 tag[16], int taglen);
asmlinkage bool __must_check
aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key,
			    const u32 le_ctr[4], const u8 ghash_acc[16],
			    u64 total_aadlen, u64 total_datalen,
			    const u8 tag[16], int taglen);
asmlinkage bool __must_check
aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
			    const u32 le_ctr[4], const u8 ghash_acc[16],
			    u64 total_aadlen, u64 total_datalen,
			    const u8 tag[16], int taglen);
asmlinkage bool __must_check
aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
			      const u32 le_ctr[4], const u8 ghash_acc[16],
			      u64 total_aadlen, u64 total_datalen,
			      const u8 tag[16], int taglen);

/*
 * Call the decryption-finalization assembly routine selected by @flags and
 * return its boolean result.  gcm_crypt() treats a false result as an
 * authentication failure (-EBADMSG).
 */
/* __always_inline to optimize out the branches based on @flags */
static __always_inline bool __must_check
aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4],
		  u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen,
		  u8 tag[16], int taglen, int flags)
{
	if (flags & FLAG_VAES_AVX512)
		return aes_gcm_dec_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
						     le_ctr, ghash_acc,
						     total_aadlen, total_datalen,
						     tag, taglen);
	else if (flags & FLAG_VAES_AVX2)
		return aes_gcm_dec_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
						   le_ctr, ghash_acc,
						   total_aadlen, total_datalen,
						   tag, taglen);
	else if (flags & FLAG_AVX)
		return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key),
						   le_ctr, ghash_acc,
						   total_aadlen, total_datalen,
						   tag, taglen);
	else
		return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key),
					       le_ctr, ghash_acc,
					       total_aadlen, total_datalen,
					       tag, taglen);
}

/*
 * This is the Integrity Check Value (aka the authentication tag) length and can
 * be 8, 12 or 16 bytes long.
 *
 * Returns 0 for an accepted @authsize and -EINVAL otherwise.  @aead is unused.
 */
static int common_rfc4106_set_authsize(struct crypto_aead *aead,
				       unsigned int authsize)
{
	switch (authsize) {
	case 8:
	case 12:
	case 16:
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

/*
 * Authentication tag length check for the "gcm(aes)" algorithms: accepts 4, 8
 * and 12 through 16 bytes.  Returns 0 for an accepted @authsize and -EINVAL
 * otherwise.  @tfm is unused.
 */
static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
				       unsigned int authsize)
{
	switch (authsize) {
	case 4:
	case 8:
	case 12:
	case 13:
	case 14:
	case 15:
	case 16:
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

/*
 * This is the setkey function for the x86_64 implementations of AES-GCM.  It
 * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes
 * powers of the hash key.
 *
 * To comply with the crypto_aead API, this has to be usable in no-SIMD context.
 * For that reason, this function includes a portable C implementation of the
 * needed logic.  However, the portable C implementation is very slow, taking
 * about the same time as encrypting 37 KB of data.  To be ready for users that
 * may set a key even somewhat frequently, we therefore also include a SIMD
 * assembly implementation, expanding the AES key using AES-NI and precomputing
 * the hash key powers using PCLMULQDQ or VPCLMULQDQ.
*/ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, unsigned int keylen, int flags) { struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); int err; if (flags & FLAG_RFC4106) { if (keylen < 4) return -EINVAL; keylen -= 4; key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen); } /* The assembly code assumes the following offsets. */ static_assert(offsetof(struct aes_gcm_key_aesni, base.aes_key.len) == 0); static_assert(offsetof(struct aes_gcm_key_aesni, base.aes_key.k.rndkeys) == 16); static_assert(offsetof(struct aes_gcm_key_aesni, h_powers) == 272); static_assert(offsetof(struct aes_gcm_key_aesni, h_powers_xored) == 400); static_assert(offsetof(struct aes_gcm_key_aesni, h_times_x64) == 464); static_assert(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.len) == 0); static_assert(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.k.rndkeys) == 16); static_assert(offsetof(struct aes_gcm_key_vaes_avx2, h_powers) == 288); static_assert(offsetof(struct aes_gcm_key_vaes_avx2, h_powers_xored) == 416); static_assert(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.len) == 0); static_assert(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.k.rndkeys) == 16); static_assert(offsetof(struct aes_gcm_key_vaes_avx512, h_powers) == 320); static_assert(offsetof(struct aes_gcm_key_vaes_avx512, padding) == 576); err = aes_prepareenckey(&key->aes_key, raw_key, keylen); if (err) return err; if (likely(crypto_simd_usable())) { kernel_fpu_begin(); aes_gcm_precompute(key, flags); kernel_fpu_end(); } else { static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = { [0] = 0xc2, [15] = 1 }; static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = { [7] = 1, }; be128 h1 = {}; be128 h; int i; /* Encrypt the all-zeroes block to get the hash key H^1 */ aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1); /* Compute H^1 * x^-1 */ h = h1; gf128mul_lle(&h, (const be128 *)x_to_the_minus1); /* Compute the needed key powers */ if (flags & 
FLAG_VAES_AVX512) { struct aes_gcm_key_vaes_avx512 *k = AES_GCM_KEY_VAES_AVX512(key); for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { k->h_powers[i][0] = be64_to_cpu(h.b); k->h_powers[i][1] = be64_to_cpu(h.a); gf128mul_lle(&h, &h1); } memset(k->padding, 0, sizeof(k->padding)); } else if (flags & FLAG_VAES_AVX2) { struct aes_gcm_key_vaes_avx2 *k = AES_GCM_KEY_VAES_AVX2(key); static const u8 indices[8] = { 0, 2, 1, 3, 4, 6, 5, 7 }; for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { k->h_powers[i][0] = be64_to_cpu(h.b); k->h_powers[i][1] = be64_to_cpu(h.a); gf128mul_lle(&h, &h1); } for (i = 0; i < ARRAY_SIZE(k->h_powers_xored); i++) { int j = indices[i]; k->h_powers_xored[i] = k->h_powers[j][0] ^ k->h_powers[j][1]; } } else { struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { k->h_powers[i][0] = be64_to_cpu(h.b); k->h_powers[i][1] = be64_to_cpu(h.a); k->h_powers_xored[i] = k->h_powers[i][0] ^ k->h_powers[i][1]; gf128mul_lle(&h, &h1); } gf128mul_lle(&h1, (const be128 *)x_to_the_63); k->h_times_x64[0] = be64_to_cpu(h1.b); k->h_times_x64[1] = be64_to_cpu(h1.a); } } return 0; } /* * Initialize @ghash_acc, then pass all @assoclen bytes of associated data * (a.k.a. additional authenticated data) from @sg_src through the GHASH update * assembly function. kernel_fpu_begin() must have already been called. */ static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16], struct scatterlist *sg_src, unsigned int assoclen, int flags) { struct scatter_walk walk; /* * The assembly function requires that the length of any non-last * segment of associated data be a multiple of 16 bytes, so this * function does the buffering needed to achieve that. 
*/ unsigned int pos = 0; u8 buf[16]; memset(ghash_acc, 0, 16); scatterwalk_start(&walk, sg_src); while (assoclen) { unsigned int orig_len_this_step = scatterwalk_next( &walk, assoclen); unsigned int len_this_step = orig_len_this_step; unsigned int len; const u8 *src = walk.addr; if (unlikely(pos)) { len = min(len_this_step, 16 - pos); memcpy(&buf[pos], src, len); pos += len; src += len; len_this_step -= len; if (pos < 16) goto next; aes_gcm_aad_update(key, ghash_acc, buf, 16, flags); pos = 0; } len = len_this_step; if (unlikely(assoclen)) /* Not the last segment yet? */ len = round_down(len, 16); aes_gcm_aad_update(key, ghash_acc, src, len, flags); src += len; len_this_step -= len; if (unlikely(len_this_step)) { memcpy(buf, src, len_this_step); pos = len_this_step; } next: scatterwalk_done_src(&walk, orig_len_this_step); if (need_resched()) { kernel_fpu_end(); kernel_fpu_begin(); } assoclen -= orig_len_this_step; } if (unlikely(pos)) aes_gcm_aad_update(key, ghash_acc, buf, pos, flags); } /* __always_inline to optimize out the branches based on @flags */ static __always_inline int gcm_crypt(struct aead_request *req, int flags) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); unsigned int assoclen = req->assoclen; struct skcipher_walk walk; unsigned int nbytes; u8 ghash_acc[16]; /* GHASH accumulator */ u32 le_ctr[4]; /* Counter in little-endian format */ int taglen; int err; /* Initialize the counter and determine the associated data length. 
*/ le_ctr[0] = 2; if (flags & FLAG_RFC4106) { if (unlikely(assoclen != 16 && assoclen != 20)) return -EINVAL; assoclen -= 8; le_ctr[1] = get_unaligned_be32(req->iv + 4); le_ctr[2] = get_unaligned_be32(req->iv + 0); le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */ } else { le_ctr[1] = get_unaligned_be32(req->iv + 8); le_ctr[2] = get_unaligned_be32(req->iv + 4); le_ctr[3] = get_unaligned_be32(req->iv + 0); } /* Begin walking through the plaintext or ciphertext. */ if (flags & FLAG_ENC) err = skcipher_walk_aead_encrypt(&walk, req, false); else err = skcipher_walk_aead_decrypt(&walk, req, false); if (err) return err; /* * Since the AES-GCM assembly code requires that at least three assembly * functions be called to process any message (this is needed to support * incremental updates cleanly), to reduce overhead we try to do all * three calls in the same kernel FPU section if possible. We close the * section and start a new one if there are multiple data segments or if * rescheduling is needed while processing the associated data. */ kernel_fpu_begin(); /* Pass the associated data through GHASH. */ gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags); /* En/decrypt the data and pass the ciphertext through GHASH. */ while (unlikely((nbytes = walk.nbytes) < walk.total)) { /* * Non-last segment. In this case, the assembly function * requires that the length be a multiple of 16 (AES_BLOCK_SIZE) * bytes. The needed buffering of up to 16 bytes is handled by * the skcipher_walk. Here we just need to round down to a * multiple of 16. */ nbytes = round_down(nbytes, AES_BLOCK_SIZE); aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr, walk.dst.virt.addr, nbytes, flags); le_ctr[0] += nbytes / AES_BLOCK_SIZE; kernel_fpu_end(); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); if (err) return err; kernel_fpu_begin(); } /* Last segment: process all remaining data. 
*/ aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr, walk.dst.virt.addr, nbytes, flags); /* * The low word of the counter isn't used by the finalize, so there's no * need to increment it here. */ /* Finalize */ taglen = crypto_aead_authsize(tfm); if (flags & FLAG_ENC) { /* Finish computing the auth tag. */ aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen, req->cryptlen, flags); /* Store the computed auth tag in the dst scatterlist. */ scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen + req->cryptlen, taglen, 1); } else { unsigned int datalen = req->cryptlen - taglen; u8 tag[16]; /* Get the transmitted auth tag from the src scatterlist. */ scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen, taglen, 0); /* * Finish computing the auth tag and compare it to the * transmitted one. The assembly function does the actual tag * comparison. Here, just check the boolean result. */ if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen, datalen, tag, taglen, flags)) err = -EBADMSG; } kernel_fpu_end(); if (nbytes) skcipher_walk_done(&walk, 0); return err; } #define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name, \ ctxsize, priority) \ \ static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ unsigned int keylen) \ { \ return gcm_setkey(tfm, raw_key, keylen, (flags)); \ } \ \ static int gcm_encrypt_##suffix(struct aead_request *req) \ { \ return gcm_crypt(req, (flags) | FLAG_ENC); \ } \ \ static int gcm_decrypt_##suffix(struct aead_request *req) \ { \ return gcm_crypt(req, (flags)); \ } \ \ static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ unsigned int keylen) \ { \ return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106); \ } \ \ static int rfc4106_encrypt_##suffix(struct aead_request *req) \ { \ return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC); \ } \ \ static int rfc4106_decrypt_##suffix(struct aead_request *req) \ { \ return gcm_crypt(req, (flags) | 
FLAG_RFC4106); \ } \ \ static struct aead_alg aes_gcm_algs_##suffix[] = { { \ .setkey = gcm_setkey_##suffix, \ .setauthsize = generic_gcmaes_set_authsize, \ .encrypt = gcm_encrypt_##suffix, \ .decrypt = gcm_decrypt_##suffix, \ .ivsize = GCM_AES_IV_SIZE, \ .chunksize = AES_BLOCK_SIZE, \ .maxauthsize = 16, \ .base = { \ .cra_name = "gcm(aes)", \ .cra_driver_name = generic_driver_name, \ .cra_priority = (priority), \ .cra_blocksize = 1, \ .cra_ctxsize = (ctxsize), \ .cra_module = THIS_MODULE, \ }, \ }, { \ .setkey = rfc4106_setkey_##suffix, \ .setauthsize = common_rfc4106_set_authsize, \ .encrypt = rfc4106_encrypt_##suffix, \ .decrypt = rfc4106_decrypt_##suffix, \ .ivsize = GCM_RFC4106_IV_SIZE, \ .chunksize = AES_BLOCK_SIZE, \ .maxauthsize = 16, \ .base = { \ .cra_name = "rfc4106(gcm(aes))", \ .cra_driver_name = rfc_driver_name, \ .cra_priority = (priority), \ .cra_blocksize = 1, \ .cra_ctxsize = (ctxsize), \ .cra_module = THIS_MODULE, \ }, \ } } /* aes_gcm_algs_aesni */ DEFINE_GCM_ALGS(aesni, /* no flags */ 0, "generic-gcm-aesni", "rfc4106-gcm-aesni", AES_GCM_KEY_AESNI_SIZE, 400); /* aes_gcm_algs_aesni_avx */ DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX, "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", AES_GCM_KEY_AESNI_SIZE, 500); /* aes_gcm_algs_vaes_avx2 */ DEFINE_GCM_ALGS(vaes_avx2, FLAG_VAES_AVX2, "generic-gcm-vaes-avx2", "rfc4106-gcm-vaes-avx2", AES_GCM_KEY_VAES_AVX2_SIZE, 600); /* aes_gcm_algs_vaes_avx512 */ DEFINE_GCM_ALGS(vaes_avx512, FLAG_VAES_AVX512, "generic-gcm-vaes-avx512", "rfc4106-gcm-vaes-avx512", AES_GCM_KEY_VAES_AVX512_SIZE, 800); static int __init register_avx_algs(void) { int err; if (!boot_cpu_has(X86_FEATURE_AVX)) return 0; err = crypto_register_skciphers(skcipher_algs_aesni_avx, ARRAY_SIZE(skcipher_algs_aesni_avx)); if (err) return err; err = crypto_register_aeads(aes_gcm_algs_aesni_avx, ARRAY_SIZE(aes_gcm_algs_aesni_avx)); if (err) return err; /* * Note: not all the algorithms registered below actually require * VPCLMULQDQ. 
But in practice every CPU with VAES also has VPCLMULQDQ. * Similarly, the assembler support was added at about the same time. * For simplicity, just always check for VAES and VPCLMULQDQ together. */ if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_VAES) || !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) || !boot_cpu_has(X86_FEATURE_PCLMULQDQ) || !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) return 0; err = crypto_register_skciphers(skcipher_algs_vaes_avx2, ARRAY_SIZE(skcipher_algs_vaes_avx2)); if (err) return err; err = crypto_register_aeads(aes_gcm_algs_vaes_avx2, ARRAY_SIZE(aes_gcm_algs_vaes_avx2)); if (err) return err; if (!boot_cpu_has(X86_FEATURE_AVX512BW) || !boot_cpu_has(X86_FEATURE_AVX512VL) || !boot_cpu_has(X86_FEATURE_BMI2) || !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL)) return 0; if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { int i; for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++) skcipher_algs_vaes_avx512[i].base.cra_priority = 1; for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx512); i++) aes_gcm_algs_vaes_avx512[i].base.cra_priority = 1; } err = crypto_register_skciphers(skcipher_algs_vaes_avx512, ARRAY_SIZE(skcipher_algs_vaes_avx512)); if (err) return err; err = crypto_register_aeads(aes_gcm_algs_vaes_avx512, ARRAY_SIZE(aes_gcm_algs_vaes_avx512)); if (err) return err; return 0; } #define unregister_skciphers(A) \ if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \ crypto_unregister_skciphers((A), ARRAY_SIZE(A)) #define unregister_aeads(A) \ if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \ crypto_unregister_aeads((A), ARRAY_SIZE(A)) static void unregister_avx_algs(void) { unregister_skciphers(skcipher_algs_aesni_avx); unregister_aeads(aes_gcm_algs_aesni_avx); unregister_skciphers(skcipher_algs_vaes_avx2); unregister_skciphers(skcipher_algs_vaes_avx512); unregister_aeads(aes_gcm_algs_vaes_avx2); unregister_aeads(aes_gcm_algs_vaes_avx512); } #else /* CONFIG_X86_64 */ static 
struct aead_alg aes_gcm_algs_aesni[0]; static int __init register_avx_algs(void) { return 0; } static void unregister_avx_algs(void) { } #endif /* !CONFIG_X86_64 */ static const struct x86_cpu_id aesni_cpu_id[] = { X86_MATCH_FEATURE(X86_FEATURE_AES, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); static int __init aesni_init(void) { int err; if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; err = crypto_register_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers)); if (err) return err; err = crypto_register_aeads(aes_gcm_algs_aesni, ARRAY_SIZE(aes_gcm_algs_aesni)); if (err) goto unregister_skciphers; err = register_avx_algs(); if (err) goto unregister_avx; return 0; unregister_avx: unregister_avx_algs(); crypto_unregister_aeads(aes_gcm_algs_aesni, ARRAY_SIZE(aes_gcm_algs_aesni)); unregister_skciphers: crypto_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers)); return err; } static void __exit aesni_exit(void) { crypto_unregister_aeads(aes_gcm_algs_aesni, ARRAY_SIZE(aes_gcm_algs_aesni)); crypto_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers)); unregister_avx_algs(); } module_init(aesni_init); module_exit(aesni_exit); MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("aes");
3 8167 8167 8175 819 12685 9892 2643 1816 86 17 10 9 2 7 346 2 6 5081 5081 253 4 152 695 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 
505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 
1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 
1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 
1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 
2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PGTABLE_H #define _LINUX_PGTABLE_H #include <linux/pfn.h> #include <asm/pgtable.h> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) #define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT) #ifndef __ASSEMBLY__ #ifdef CONFIG_MMU #include <linux/mm_types.h> #include <linux/bug.h> #include <linux/errno.h> #include <asm-generic/pgtable_uffd.h> #include <linux/page_table_check.h> #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED #endif /* * This defines the generic helper for accessing PMD page * table page. Although platforms can still override this * via their respective <asm/pgtable.h>. 
*/ #ifndef pmd_pgtable #define pmd_pgtable(pmd) pmd_page(pmd) #endif #define pmd_folio(pmd) page_folio(pmd_page(pmd)) /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * * The pXx_index() functions return the index of the entry in the page * table page which would control the given virtual address * * As these functions may be used by the same code for different levels of * the page table folding, they are always available, regardless of * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0 * because in such cases PTRS_PER_PxD equals 1. */ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } #define pmd_index pmd_index #endif #ifndef pud_index static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } #define pud_index pud_index #endif #ifndef pgd_index /* Must be a compile-time constant, so implement it as a macro */ #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif #ifndef kernel_pte_init static inline void kernel_pte_init(void *addr) { } #define kernel_pte_init kernel_pte_init #endif #ifndef pmd_init static inline void pmd_init(void *addr) { } #define pmd_init pmd_init #endif #ifndef pud_init static inline void pud_init(void *addr) { } #define pud_init pud_init #endif #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); } #define pte_offset_kernel pte_offset_kernel #endif #ifdef CONFIG_HIGHPTE #define __pte_map(pmd, address) \ ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) #define pte_unmap(pte) do { \ kunmap_local((pte)); \ rcu_read_unlock(); \ } while (0) #else static inline pte_t *__pte_map(pmd_t *pmd, 
unsigned long address) { return pte_offset_kernel(pmd, address); } static inline void pte_unmap(pte_t *pte) { rcu_read_unlock(); } #endif void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); /* Find an entry in the second-level page table.. */ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return pud_pgtable(*pud) + pmd_index(address); } #define pmd_offset pmd_offset #endif #ifndef pud_offset static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { return p4d_pgtable(*p4d) + pud_index(address); } #define pud_offset pud_offset #endif static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address) { return (pgd + pgd_index(address)); }; /* * a shortcut to get a pgd_t in a given mm */ #ifndef pgd_offset #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) #endif /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ #define pgd_offset_k(address) pgd_offset(&init_mm, (address)) /* * In many cases it is known that a virtual address is mapped at PMD or PTE * level, so instead of traversing all the page table levels, we can get a * pointer to the PMD entry in user or kernel page table or translate a virtual * address to the pointer in the PTE in the kernel page tables with simple * helpers. */ static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_off_k(unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) { pmd_t *pmd = pmd_off_k(vaddr); return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); } #ifndef pmd_young static inline int pmd_young(pmd_t pmd) { return 0; } #endif #ifndef pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return 0; } #endif /* * A facility to provide lazy MMU batching. 
This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode * is issued. Some architectures may benefit from doing this, and it is * beneficial for both shadow and direct mode hypervisors, which may batch * the PTE updates which happen during this window. Note that using this * interface requires that read hazards be removed from the code. A read * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads through * a raw PTE pointer after it has been modified are not guaranteed to be * up to date. * * In the general case, no lock is guaranteed to be held between entry and exit * of the lazy mode. (In practice, for user PTE updates, the appropriate page * table lock(s) are held, but for kernel PTE updates, no lock is held). * The implementation must therefore assume preemption may be enabled upon * entry to the mode and cpu migration is possible; it must take steps to be * robust against this. An implementation may handle this by disabling * preemption; as a consequence, generic code may not sleep while the lazy MMU * mode is active. * * The mode is disabled in interrupt context and calls to the lazy_mmu API have * no effect. * * The lazy MMU mode is enabled for a given block of code using: * * lazy_mmu_mode_enable(); * <code> * lazy_mmu_mode_disable(); * * Nesting is permitted: <code> may itself use an enable()/disable() pair. * A nested call to enable() has no functional effect; however disable() causes * any batched architectural state to be flushed regardless of nesting. After a * call to disable(), the caller can therefore rely on all previous page table * modifications to have taken effect, but the lazy MMU mode may still be * enabled. * * In certain cases, it may be desirable to temporarily pause the lazy MMU mode. 
* This can be done using: * * lazy_mmu_mode_pause(); * <code> * lazy_mmu_mode_resume(); * * pause() ensures that the mode is exited regardless of the nesting level; * resume() re-enters the mode at the same nesting level. Any call to the * lazy_mmu_mode_* API between those two calls has no effect. In particular, * this means that pause()/resume() pairs may nest. * * is_lazy_mmu_mode_active() can be used to check whether the lazy MMU mode is * currently enabled. */ #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE /** * lazy_mmu_mode_enable() - Enable the lazy MMU mode. * * Enters a new lazy MMU mode section; if the mode was not already enabled, * enables it and calls arch_enter_lazy_mmu_mode(). * * Must be paired with a call to lazy_mmu_mode_disable(). * * Has no effect if called: * - While paused - see lazy_mmu_mode_pause() * - In interrupt context */ static inline void lazy_mmu_mode_enable(void) { struct lazy_mmu_state *state = &current->lazy_mmu_state; if (in_interrupt() || state->pause_count > 0) return; VM_WARN_ON_ONCE(state->enable_count == U8_MAX); if (state->enable_count++ == 0) arch_enter_lazy_mmu_mode(); } /** * lazy_mmu_mode_disable() - Disable the lazy MMU mode. * * Exits the current lazy MMU mode section. If it is the outermost section, * disables the mode and calls arch_leave_lazy_mmu_mode(). Otherwise (nested * section), calls arch_flush_lazy_mmu_mode(). * * Must match a call to lazy_mmu_mode_enable(). * * Has no effect if called: * - While paused - see lazy_mmu_mode_pause() * - In interrupt context */ static inline void lazy_mmu_mode_disable(void) { struct lazy_mmu_state *state = &current->lazy_mmu_state; if (in_interrupt() || state->pause_count > 0) return; VM_WARN_ON_ONCE(state->enable_count == 0); if (--state->enable_count == 0) arch_leave_lazy_mmu_mode(); else /* Exiting a nested section */ arch_flush_lazy_mmu_mode(); } /** * lazy_mmu_mode_pause() - Pause the lazy MMU mode. 
* * Pauses the lazy MMU mode; if it is currently active, disables it and calls * arch_leave_lazy_mmu_mode(). * * Must be paired with a call to lazy_mmu_mode_resume(). Calls to the * lazy_mmu_mode_* API have no effect until the matching resume() call. * * Has no effect if called: * - While paused (inside another pause()/resume() pair) * - In interrupt context */ static inline void lazy_mmu_mode_pause(void) { struct lazy_mmu_state *state = &current->lazy_mmu_state; if (in_interrupt()) return; VM_WARN_ON_ONCE(state->pause_count == U8_MAX); if (state->pause_count++ == 0 && state->enable_count > 0) arch_leave_lazy_mmu_mode(); } /** * lazy_mmu_mode_resume() - Resume the lazy MMU mode. * * Resumes the lazy MMU mode; if it was active at the point where the matching * call to lazy_mmu_mode_pause() was made, re-enables it and calls * arch_enter_lazy_mmu_mode(). * * Must match a call to lazy_mmu_mode_pause(). * * Has no effect if called: * - While paused (inside another pause()/resume() pair) * - In interrupt context */ static inline void lazy_mmu_mode_resume(void) { struct lazy_mmu_state *state = &current->lazy_mmu_state; if (in_interrupt()) return; VM_WARN_ON_ONCE(state->pause_count == 0); if (--state->pause_count == 0 && state->enable_count > 0) arch_enter_lazy_mmu_mode(); } #else static inline void lazy_mmu_mode_enable(void) {} static inline void lazy_mmu_mode_disable(void) {} static inline void lazy_mmu_mode_pause(void) {} static inline void lazy_mmu_mode_resume(void) {} #endif #ifndef pte_batch_hint /** * pte_batch_hint - Number of pages that can be added to batch without scanning. * @ptep: Page table pointer for the entry. * @pte: Page table entry. * * Some architectures know that a set of contiguous ptes all map the same * contiguous memory with the same permissions. In this case, it can provide a * hint to aid pte batching without the core code needing to scan every pte. * * An architecture implementation may ignore the PTE accessed state. 
Further, the dirty state must apply atomically to all the PTEs described by
 * the hint.
 *
 * May be overridden by the architecture, else pte_batch_hint is always 1.
 */
static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
{
	return 1;
}
#endif

#ifndef pte_advance_pfn
/*
 * Generic fallback: return @pte with (@nr << PFN_PTE_SHIFT) added to its raw
 * value, i.e. with the PFN field advanced by @nr (assuming the PFN field
 * starts at bit PFN_PTE_SHIFT - architectures where that does not hold are
 * expected to provide their own pte_advance_pfn()).
 */
static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
	return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
}
#endif

/* Advance @pte by exactly one page frame. */
#define pte_next_pfn(pte) pte_advance_pfn(pte, 1)

#ifndef set_ptes
/**
 * set_ptes - Map consecutive pages to a contiguous range of addresses.
 * @mm: Address space to map the pages into.
 * @addr: Address to map the first page at.
 * @ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @nr: Number of pages to map.
 *
 * When nr==1, initial state of pte may be present or not present, and new state
 * may be present or not present. When nr>1, initial state of all ptes must be
 * not present, and new state must be present.
 *
 * May be overridden by the architecture, or the architecture can define
 * set_pte() and PFN_PTE_SHIFT.
 *
 * Context: The caller holds the page table lock. The pages all belong
 * to the same folio. The PTEs are all in the same PMD.
*/ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, addr, ptep, pte, nr); for (;;) { set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_next_pfn(pte); } } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { BUILD_BUG(); return 0; } static inline int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef ptep_get static inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif #ifndef pmdp_get static inline pmd_t pmdp_get(pmd_t *pmdp) { return READ_ONCE(*pmdp); } #endif #ifndef pudp_get static inline pud_t pudp_get(pud_t *pudp) { return READ_ONCE(*pudp); } #endif #ifndef p4dp_get static inline p4d_t p4dp_get(p4d_t *p4dp) { return READ_ONCE(*p4dp); } #endif #ifndef pgdp_get static inline pgd_t pgdp_get(pgd_t *pgdp) { return READ_ONCE(*pgdp); } #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); bool young = true; if (!pte_young(pte)) young = false; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); return young; } #endif #ifndef 
__HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; bool young = true; if (!pmd_young(pmd)) young = false; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); return young; } #else static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH bool ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ static inline bool pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef arch_has_hw_nonleaf_pmd_young /* * Return whether the accessed bit in non-leaf PMD entries is supported on the * local CPU. */ static inline bool arch_has_hw_nonleaf_pmd_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG); } #endif #ifndef arch_has_hw_pte_young /* * Return whether the accessed bit is supported on the local CPU. * * This stub assumes accessing through an old PTE triggers a page fault. * Architectures that automatically set the access bit should overwrite it. */ static inline bool arch_has_hw_pte_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG); } #endif #ifndef exec_folio_order /* * Returns preferred minimum folio order for executable file-backed memory. 
Must be in range [0, PMD_ORDER). Default to order-0.
 */
static inline unsigned int exec_folio_order(void)
{
	return 0;
}
#endif

/*
 * Default no-op hooks invoked on zapped page table entries; each can be
 * replaced by an architecture that defines the corresponding name.
 */
#ifndef arch_check_zapped_pte
static inline void arch_check_zapped_pte(struct vm_area_struct *vma,
					 pte_t pte)
{
}
#endif

#ifndef arch_check_zapped_pmd
static inline void arch_check_zapped_pmd(struct vm_area_struct *vma,
					 pmd_t pmd)
{
}
#endif

#ifndef arch_check_zapped_pud
static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
}
#endif

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * Generic fallback: read the PTE, clear it, report the old value to the page
 * table checker and return it. The read and the clear are two separate
 * steps here.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
				       unsigned long address,
				       pte_t *ptep)
{
	pte_t pte = ptep_get(ptep);
	pte_clear(mm, address, ptep);
	page_table_check_pte_clear(mm, address, pte);
	return pte;
}
#endif

#ifndef clear_young_dirty_ptes
/**
 * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the
 *		same folio as old/clean.
 * @vma: VMA the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to mark old/clean.
 * @flags: Flags to modify the PTE batch semantics.
 *
 * May be overridden by the architecture; otherwise, implemented by
 * get_and_clear/modify/set for each pte in the range.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
*/ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags) { pte_t pte; for (;;) { if (flags == CYDP_CLEAR_YOUNG) ptep_test_and_clear_young(vma, addr, ptep); else { pte = ptep_get_and_clear(vma->vm_mm, addr, ptep); if (flags & CYDP_CLEAR_YOUNG) pte = pte_mkold(pte); if (flags & CYDP_CLEAR_DIRTY) pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, addr, ptep, pte); } if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, addr, ptep); /* * No need for ptep_get_and_clear(): page table check doesn't care about * any bits that could have been set by HW concurrently. */ page_table_check_pte_clear(mm, addr, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures * (eg x86-32 PAE) cannot load the entries atomically without using expensive * instructions. We are guaranteed that a PTE will only either go from not * present to present, or present to not present -- it will not switch to a * completely different present page without a TLB flush inbetween; which we * are blocking by holding interrupts off. * * Setting ptes from not present to present goes: * * ptep->pte_high = h; * smp_wmb(); * ptep->pte_low = l; * * And present to not present goes: * * ptep->pte_low = 0; * smp_wmb(); * ptep->pte_high = 0; * * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. * We load pte_high *after* loading pte_low, which ensures we don't see an older * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't * picked up a changed pte high. We might have gotten rubbish values from * pte_low and pte_high, but we are guaranteed that pte_low will not have the * present bit set *unless* it is 'l'. Because get_user_pages_fast() only * operates on present ptes we're safe. 
*/
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
	pte_t pte;

	/*
	 * Read low half, then high half, with a read barrier after each;
	 * retry whenever the low half changed while the pair was being read.
	 */
	do {
		pte.pte_low = ptep->pte_low;
		smp_rmb();
		pte.pte_high = ptep->pte_high;
		smp_rmb();
	} while (unlikely(pte.pte_low != ptep->pte_low));

	return pte;
}
#define ptep_get_lockless ptep_get_lockless

#if CONFIG_PGTABLE_LEVELS > 2
/* Same low/high/recheck sequence as ptep_get_lockless(), for a PMD entry. */
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
	pmd_t pmd;

	do {
		pmd.pmd_low = pmdp->pmd_low;
		smp_rmb();
		pmd.pmd_high = pmdp->pmd_high;
		smp_rmb();
	} while (unlikely(pmd.pmd_low != pmdp->pmd_low));

	return pmd;
}
#define pmdp_get_lockless pmdp_get_lockless
#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */

/*
 * We require that the PTE can be read atomically.
 */
#ifndef ptep_get_lockless
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
	return ptep_get(ptep);
}
#endif

#ifndef pmdp_get_lockless
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
	return pmdp_get(pmdp);
}
/* Nothing to synchronize against when the plain pmdp_get() is used. */
static inline void pmdp_get_lockless_sync(void)
{
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
/*
 * Generic fallback: read the PMD entry, clear it, report the old value to the
 * page table checker and return it.
 */
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
					    unsigned long address,
					    pmd_t *pmdp)
{
	pmd_t pmd = *pmdp;

	pmd_clear(pmdp);
	page_table_check_pmd_clear(mm, address, pmd);

	return pmd;
}
#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
/* PUD-level counterpart of pmdp_huge_get_and_clear(). */
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
					    unsigned long address,
					    pud_t *pudp)
{
	pud_t pud = *pudp;

	pud_clear(pudp);
	page_table_check_pud_clear(mm, address, pud);

	return pud;
}
#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
/* The generic version ignores @full and defers to pmdp_huge_get_and_clear(). */
static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
					    unsigned long address, pmd_t *pmdp,
					    int full)
{
	return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
}
#endif

#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
/* The generic version ignores @full and defers to pudp_huge_get_and_clear(). */
static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
					    unsigned long address, pud_t *pudp,
					    int full)
{
	return pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
/* The generic version ignores @full and defers to ptep_get_and_clear(). */
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
					    unsigned long address, pte_t *ptep,
					    int full)
{
	return ptep_get_and_clear(mm, address, ptep);
}
#endif

#ifndef get_and_clear_full_ptes
/**
 * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of
 *			     the same folio, collecting dirty/accessed bits.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the
 * returned PTE.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
		unsigned long addr, pte_t *ptep, unsigned int nr, int full)
{
	pte_t pte, tmp_pte;

	/* The first entry's old value is the base of the returned PTE. */
	pte = ptep_get_and_clear_full(mm, addr, ptep, full);
	while (--nr) {
		ptep++;
		addr += PAGE_SIZE;
		tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full);
		/* Fold the dirty/young bits of later entries into the result. */
		if (pte_dirty(tmp_pte))
			pte = pte_mkdirty(pte);
		if (pte_young(tmp_pte))
			pte = pte_mkyoung(pte);
	}
	return pte;
}
#endif

/**
 * get_and_clear_ptes - Clear present PTEs that map consecutive pages of
 *			the same folio, collecting dirty/accessed bits.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
* @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * * Use this instead of get_and_clear_full_ptes() if it is known that we don't * need to clear the full mm, which is mostly the case. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { return get_and_clear_full_ptes(mm, addr, ptep, nr, 0); } #ifndef clear_full_ptes /** * clear_full_ptes - Clear present PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { ptep_get_and_clear_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif /** * clear_ptes - Clear present PTEs that map consecutive pages of the same folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. 
*
 * Use this instead of clear_full_ptes() if it is known that we don't need to
 * clear the full mm, which is mostly the case.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void clear_ptes(struct mm_struct *mm, unsigned long addr,
		pte_t *ptep, unsigned int nr)
{
	/* full == 0: not clearing the whole mm. */
	clear_full_ptes(mm, addr, ptep, nr, 0);
}

/*
 * If two threads concurrently fault at the same page, the thread that
 * won the race updates the PTE and its local TLB/Cache. The other thread
 * gives up, simply does nothing, and continues; on architectures where
 * software can update the TLB, the local TLB can be updated here to avoid
 * the next page fault. This function only updates the TLB; it does nothing
 * with the cache or anything else. That is the difference from
 * update_mmu_cache().
 */
#ifndef update_mmu_tlb_range
static inline void update_mmu_tlb_range(struct vm_area_struct *vma,
				unsigned long address, pte_t *ptep, unsigned int nr)
{
}
#endif

/* Single-entry form of update_mmu_tlb_range(). */
static inline void update_mmu_tlb(struct vm_area_struct *vma,
				unsigned long address, pte_t *ptep)
{
	update_mmu_tlb_range(vma, address, ptep, 1);
}

/*
 * Some architectures may be able to avoid expensive synchronization
 * primitives when modifications are made to PTE's which are already
 * not present, or in the process of an address space destruction.
 */
#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
/* Generic fallback: @full is ignored and a plain pte_clear() is done. */
static inline void pte_clear_not_present_full(struct mm_struct *mm,
					      unsigned long address,
					      pte_t *ptep,
					      int full)
{
	pte_clear(mm, address, ptep);
}
#endif

#ifndef clear_not_present_full_ptes
/**
 * clear_not_present_full_ptes - Clear multiple not present PTEs which are
 *				 consecutive in the pgtable.
 * @mm: Address space the ptes represent.
 * @addr: Address of the first pte.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
* @full: Whether we are clearing a full mm.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over pte_clear_not_present_full().
 *
 * Context: The caller holds the page table lock.  The PTEs are all not present.
 * The PTEs are all in the same PMD.
 */
static inline void clear_not_present_full_ptes(struct mm_struct *mm,
		unsigned long addr, pte_t *ptep, unsigned int nr, int full)
{
	/* The count is tested after each clear, so nr must be at least 1. */
	for (;;) {
		pte_clear_not_present_full(mm, addr, ptep, full);
		if (--nr == 0)
			break;
		ptep++;
		addr += PAGE_SIZE;
	}
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
			      unsigned long address,
			      pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
			      unsigned long address,
			      pmd_t *pmdp);
extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
			      unsigned long address,
			      pud_t *pudp);
#endif

#ifndef pte_mkwrite
/* Generic fallback: @vma is not consulted. */
static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	return pte_mkwrite_novma(pte);
}
#endif

#if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite)
/* Generic fallback: @vma is not consulted. */
static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	return pmd_mkwrite_novma(pmd);
}
#endif

#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
struct mm_struct;
/* Generic fallback: read the entry and write back its write-protected form. */
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
	pte_t old_pte = ptep_get(ptep);
	set_pte_at(mm, address, ptep, pte_wrprotect(old_pte));
}
#endif

#ifndef wrprotect_ptes
/**
 * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
 *		    folio.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to write-protect.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over ptep_set_wrprotect().
 *
 * Note that PTE bits in the PTE range besides the PFN can differ.
For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { for (;;) { ptep_set_wrprotect(mm, addr, ptep); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif #ifndef clear_flush_young_ptes /** * clear_flush_young_ptes - Mark PTEs that map consecutive pages of the same * folio as old and flush the TLB. * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear access bit. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_clear_flush_young(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline bool clear_flush_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { bool young = false; for (;;) { young |= ptep_clear_flush_young(vma, addr, ptep); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } return young; } #endif #ifndef test_and_clear_young_ptes /** * test_and_clear_young_ptes - Mark PTEs that map consecutive pages of the same * folio as old * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear access bit. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_test_and_clear_young(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. 
*
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 *
 * Returns: whether any PTE was young.
 */
static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma,
		unsigned long addr, pte_t *ptep, unsigned int nr)
{
	bool young = false;

	/* The count is tested after each entry, so nr must be at least 1. */
	for (;;) {
		young |= ptep_test_and_clear_young(vma, addr, ptep);
		if (--nr == 0)
			break;
		ptep++;
		addr += PAGE_SIZE;
	}

	return young;
}
#endif

/*
 * On some architectures the hardware does not set the page access bit when a
 * memory page is accessed; setting it is the responsibility of software.
 * Tracking the access bit this way costs an extra page fault. As an
 * optimization, the access bit can be set in all page fault flows on these
 * arches. To differentiate it from pte_mkyoung(), this macro is used on
 * platforms where software maintains the page access bit.
 */
#ifndef pte_sw_mkyoung
/* Generic fallback: returns the PTE unchanged. */
static inline pte_t pte_sw_mkyoung(pte_t pte)
{
	return pte;
}
#define pte_sw_mkyoung	pte_sw_mkyoung
#endif

#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Generic fallback: read the PMD and write back its write-protected form. */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
				      unsigned long address, pmd_t *pmdp)
{
	pmd_t old_pmd = *pmdp;
	set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd));
}
#else
/* Without THP this body is a BUILD_BUG(). */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
				      unsigned long address, pmd_t *pmdp)
{
	BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Generic fallback: read the PUD and write back its write-protected form. */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
				      unsigned long address, pud_t *pudp)
{
	pud_t old_pud = *pudp;

	set_pud_at(mm, address, pudp, pud_wrprotect(old_pud));
}
#else
/* Without THP this body is a BUILD_BUG(). */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
				      unsigned long address, pud_t *pudp)
{
	BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#endif

#ifndef pmdp_collapse_flush
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
				 unsigned long address, pmd_t *pmdp);
#else
/* Without THP this body is a BUILD_BUG(); the return only satisfies the type. */
static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
					unsigned long address,
					pmd_t *pmdp)
{
	BUILD_BUG();
	return *pmdp;
}
#define pmdp_collapse_flush pmdp_collapse_flush
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				       pgtable_t pgtable);
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
#endif

#ifndef arch_needs_pgtable_deposit
#define arch_needs_pgtable_deposit() (false)
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * This is an implementation of pmdp_establish() that is only suitable for an
 * architecture that doesn't have hardware dirty/accessed bits. In this case we
 * can't race with a CPU which sets these bits, so a non-atomic approach is
 * fine.
 */
static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
	/* Plain read followed by a separate write: not atomic, see above. */
	pmd_t old_pmd = *pmdp;
	set_pmd_at(vma->vm_mm, address, pmdp, pmd);
	return old_pmd;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
			    pmd_t *pmdp);
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD

/*
 * pmdp_invalidate_ad() invalidates the PMD while changing a transparent
 * hugepage mapping in the page tables. This function is similar to
 * pmdp_invalidate(), but should only be used if the access and dirty bits would
 * not be cleared by the software in the new PMD value. The function ensures
 * that hardware updates of the access and dirty bits are not lost.
 *
 * Doing so can allow certain architectures to avoid a TLB flush in most
 * cases.
Yet, another TLB flush might be necessary later if the PMD update
 * itself requires such flush (e.g., if protection was set to be stricter). Yet,
 * even when a TLB flush is needed because of the update, the caller may be able
 * to batch these TLB flushing operations, so fewer TLB flush operations are
 * needed.
 */
extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
				unsigned long address, pmd_t *pmdp);
#endif

#ifndef __HAVE_ARCH_PTE_SAME
/* Generic fallback: two PTEs are the same when their raw values are equal. */
static inline int pte_same(pte_t pte_a, pte_t pte_b)
{
	return pte_val(pte_a) == pte_val(pte_b);
}
#endif

#ifndef __HAVE_ARCH_PTE_UNUSED
/*
 * Some architectures provide facilities to virtualization guests
 * so that they can flag allocated pages as unused. This allows the
 * host to transparently reclaim unused pages. This function returns
 * whether the pte's page is unused.
 */
static inline int pte_unused(pte_t pte)
{
	return 0;
}
#endif

/*
 * Generic p?d_access_permitted() fallbacks: the entry must be present and,
 * when @write is non-zero, also writable.
 */
#ifndef pte_access_permitted
#define pte_access_permitted(pte, write) \
	(pte_present(pte) && (!(write) || pte_write(pte)))
#endif

#ifndef pmd_access_permitted
#define pmd_access_permitted(pmd, write) \
	(pmd_present(pmd) && (!(write) || pmd_write(pmd)))
#endif

#ifndef pud_access_permitted
#define pud_access_permitted(pud, write) \
	(pud_present(pud) && (!(write) || pud_write(pud)))
#endif

#ifndef p4d_access_permitted
#define p4d_access_permitted(p4d, write) \
	(p4d_present(p4d) && (!(write) || p4d_write(p4d)))
#endif

#ifndef pgd_access_permitted
#define pgd_access_permitted(pgd, write) \
	(pgd_present(pgd) && (!(write) || pgd_write(pgd)))
#endif

/* Generic p?d_same() fallbacks: compare the raw entry values. */
#ifndef __HAVE_ARCH_PMD_SAME
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
	return pmd_val(pmd_a) == pmd_val(pmd_b);
}
#endif

#ifndef pud_same
static inline int pud_same(pud_t pud_a, pud_t pud_b)
{
	return pud_val(pud_a) == pud_val(pud_b);
}
#define pud_same pud_same
#endif

#ifndef __HAVE_ARCH_P4D_SAME
static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b)
{
	return p4d_val(p4d_a) == p4d_val(p4d_b);
}
#endif

#ifndef __HAVE_ARCH_PGD_SAME
static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
{
	return pgd_val(pgd_a) == pgd_val(pgd_b);
}
#endif

#ifndef __HAVE_ARCH_DO_SWAP_PAGE
/* No architecture hook: nothing to do. */
static inline void arch_do_swap_page_nr(struct mm_struct *mm,
				     struct vm_area_struct *vma,
				     unsigned long addr,
				     pte_t pte, pte_t oldpte,
				     int nr)
{

}
#else
/*
 * Some architectures support metadata associated with a page. When a
 * page is being swapped out, this metadata must be saved so it can be
 * restored when the page is swapped back in. SPARC M7 and newer
 * processors support an ADI (Application Data Integrity) tag for the
 * page as metadata for the page. arch_do_swap_page() can restore this
 * metadata when a page is swapped back in.
 */
static inline void arch_do_swap_page_nr(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long addr,
					pte_t pte, pte_t oldpte,
					int nr)
{
	/*
	 * One arch_do_swap_page() call per page, with the address and both
	 * PTEs advanced by the page index. Note that vma->vm_mm is passed
	 * rather than @mm.
	 */
	for (int i = 0; i < nr; i++) {
		arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE,
				pte_advance_pfn(pte, i),
				pte_advance_pfn(oldpte, i));
	}
}
#endif

#ifndef __HAVE_ARCH_UNMAP_ONE
/*
 * Some architectures support metadata associated with a page. When a
 * page is being swapped out, this metadata must be saved so it can be
 * restored when the page is swapped back in. SPARC M7 and newer
 * processors support an ADI (Application Data Integrity) tag for the
 * page as metadata for the page. arch_unmap_one() can save this
 * metadata on a swap-out of a page.
 */
static inline int arch_unmap_one(struct mm_struct *mm,
				  struct vm_area_struct *vma,
				  unsigned long addr,
				  pte_t orig_pte)
{
	return 0;
}
#endif

/*
 * Allow architectures to preserve additional metadata associated with
 * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function
 * prototypes must be defined in the arch-specific asm/pgtable.h file.
*/
#ifndef __HAVE_ARCH_PREPARE_TO_SWAP
/* Generic stub: nothing to prepare, always returns 0. */
static inline int arch_prepare_to_swap(struct folio *folio)
{
	return 0;
}
#endif

#ifndef __HAVE_ARCH_SWAP_INVALIDATE
/* Generic stubs: nothing to invalidate. */
static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
{
}

static inline void arch_swap_invalidate_area(int type)
{
}
#endif

#ifndef __HAVE_ARCH_SWAP_RESTORE
/* Generic stub: nothing to restore. */
static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
{
}
#endif

#ifndef __HAVE_ARCH_MOVE_PTE
/* Generic fallback: the PTE value does not depend on the address. */
#define move_pte(pte, old_addr, new_addr)	(pte)
#endif

#ifndef pte_accessible
/* Generic fallback: evaluates @pte for side effects only and yields 1. */
# define pte_accessible(mm, pte)	((void)(pte), 1)
#endif

#ifndef flush_tlb_fix_spurious_fault
/* Generic fallback: flush the single page; @ptep is unused. */
#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
#endif

#ifndef flush_tlb_fix_spurious_fault_pmd
/* Generic fallback: no-op. */
#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0)
#endif

/*
 * When walking page tables, get the address of the next boundary,
 * or the end address of the range if that comes earlier.  Although no
 * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
 *
 * Both sides of the comparison have 1 subtracted, so a __boundary that
 * wrapped to 0 becomes the largest unsigned long value and (end) is chosen.
 * Note that (end) is evaluated twice.
 */

#define pgd_addr_end(addr, end)						\
({	unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;	\
	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
})

#ifndef p4d_addr_end
#define p4d_addr_end(addr, end)						\
({	unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK;	\
	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
})
#endif

#ifndef pud_addr_end
#define pud_addr_end(addr, end)						\
({	unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK;	\
	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
})
#endif

#ifndef pmd_addr_end
#define pmd_addr_end(addr, end)						\
({	unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;	\
	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
})
#endif

/*
 * When walking page tables, we usually want to skip any p?d_none entries;
 * and any p?d_bad entries - reporting the error before resetting to none.
 * Do the tests inline, but report and clear the bad entry in mm/memory.c.
*/ void pgd_clear_bad(pgd_t *); #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *); #else #define p4d_clear_bad(p4d) do { } while (0) #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *); #else #define pud_clear_bad(p4d) do { } while (0) #endif void pmd_clear_bad(pmd_t *); static inline int pgd_none_or_clear_bad(pgd_t *pgd) { if (pgd_none(*pgd)) return 1; if (unlikely(pgd_bad(*pgd))) { pgd_clear_bad(pgd); return 1; } return 0; } static inline int p4d_none_or_clear_bad(p4d_t *p4d) { if (p4d_none(*p4d)) return 1; if (unlikely(p4d_bad(*p4d))) { p4d_clear_bad(p4d); return 1; } return 0; } static inline int pud_none_or_clear_bad(pud_t *pud) { if (pud_none(*pud)) return 1; if (unlikely(pud_bad(*pud))) { pud_clear_bad(pud); return 1; } return 0; } static inline int pmd_none_or_clear_bad(pmd_t *pmd) { if (pmd_none(*pmd)) return 1; if (unlikely(pmd_bad(*pmd))) { pmd_clear_bad(pmd); return 1; } return 0; } static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* * Get the current pte state, but zero it out to make it * non-present, preventing the hardware from asynchronously * updating it. */ return ptep_get_and_clear(vma->vm_mm, addr, ptep); } static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { /* * The pte is non-present, so there's no hardware state to * preserve. */ set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION /* * Start a pte protection read-modify-write transaction, which * protects against asynchronous hardware modifications to the pte. * The intention is not to prevent the hardware from making pte * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the * pte; the appropriate pte lock must be held over the transaction. 
*
 * Note that this interface is intended to be batchable, meaning that
 * ptep_modify_prot_commit may not actually update the pte, but merely
 * queue the update to be done at some later time.  The update must be
 * actually committed before the pte lock is released, however.
 */
static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
					   unsigned long addr,
					   pte_t *ptep)
{
	return __ptep_modify_prot_start(vma, addr, ptep);
}

/*
 * Commit an update to a pte, leaving any hardware-controlled bits in
 * the PTE unmodified. The pte returned from ptep_modify_prot_start() may
 * additionally have young and/or dirty bits set where previously they were not,
 * so the updated pte may have these additional changes.
 */
static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
					   unsigned long addr,
					   pte_t *ptep, pte_t old_pte, pte_t pte)
{
	/* The generic commit does not use @old_pte. */
	__ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */

/**
 * modify_prot_start_ptes - Start a pte protection read-modify-write transaction
 * over a batch of ptes, which protects against asynchronous hardware
 * modifications to the ptes. The intention is not to prevent the hardware from
 * making pte updates, but to prevent any updates it may make from being lost.
 * Please see the comment above ptep_modify_prot_start() for full description.
 *
 * @vma: The virtual memory area the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over ptep_modify_prot_start(), collecting the a/d bits from each pte
 * in the batch.
 *
 * Note that PTE bits in the PTE batch besides the PFN can differ.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio. All other PTE bits must be identical for
 * all PTEs in the batch except for young and dirty bits.
The PTEs are all in * the same PMD. */ #ifndef modify_prot_start_ptes static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { pte_t pte, tmp_pte; pte = ptep_modify_prot_start(vma, addr, ptep); while (--nr) { ptep++; addr += PAGE_SIZE; tmp_pte = ptep_modify_prot_start(vma, addr, ptep); if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } return pte; } #endif /** * modify_prot_commit_ptes - Commit an update to a batch of ptes, leaving any * hardware-controlled bits in the PTE unmodified. * * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @old_pte: Old page table entry (for the first entry) which is now cleared. * @pte: New page table entry to be set. * @nr: Number of entries. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_modify_prot_commit(). * * Context: The caller holds the page table lock. The PTEs are all in the same * PMD. On exit, the set ptes in the batch map the same folio. The ptes set by * ptep_modify_prot_start() may additionally have young and/or dirty bits set * where previously they were not, so the updated ptes may have these * additional changes. */ #ifndef modify_prot_commit_ptes static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte, unsigned int nr) { int i; for (i = 0; i < nr; ++i, ++ptep, addr += PAGE_SIZE) { ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); /* Advance PFN only, set same prot */ old_pte = pte_next_pfn(old_pte); pte = pte_next_pfn(pte); } } #endif /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values * and let generic vmalloc, ioremap and page table update code know when * arch_sync_kernel_mappings() needs to be called. 
*/
#ifndef ARCH_PAGE_TABLE_SYNC_MASK
#define ARCH_PAGE_TABLE_SYNC_MASK 0
#endif

/*
 * There is no default implementation for arch_sync_kernel_mappings(). The
 * compiler is relied upon to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK
 * is 0.
 */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end);

#endif /* CONFIG_MMU */

/*
 * On almost all architectures and configurations, 0 can be used as the
 * upper ceiling to free_pgtables(): on many architectures it has the same
 * effect as using TASK_SIZE.  However, there is one configuration which
 * must impose a more careful limit, to avoid freeing kernel pgtables.
 */
#ifndef USER_PGTABLES_CEILING
#define USER_PGTABLES_CEILING	0UL
#endif

/*
 * This defines the first usable user address. Platforms
 * can override its value with custom FIRST_USER_ADDRESS
 * defined in their respective <asm/pgtable.h>.
 */
#ifndef FIRST_USER_ADDRESS
#define FIRST_USER_ADDRESS	0UL
#endif

/*
 * No-op macros that just return the current protection value. Defined here
 * because these macros can be used even if CONFIG_MMU is not defined.
*/

#ifndef pgprot_nx
#define pgprot_nx(prot)	(prot)
#endif

#ifndef pgprot_noncached
#define pgprot_noncached(prot)	(prot)
#endif

/* The next three fall back to pgprot_noncached when not provided. */
#ifndef pgprot_writecombine
#define pgprot_writecombine pgprot_noncached
#endif

#ifndef pgprot_writethrough
#define pgprot_writethrough pgprot_noncached
#endif

#ifndef pgprot_device
#define pgprot_device pgprot_noncached
#endif

#ifndef pgprot_mhp
#define pgprot_mhp(prot)	(prot)
#endif

#ifdef CONFIG_MMU
#ifndef pgprot_modify
#define pgprot_modify pgprot_modify
/*
 * For each of noncached, writecombine and device: if @oldprot is left
 * unchanged by that conversion, apply the same conversion to @newprot.
 * The three checks run in sequence, so more than one may apply. Returns
 * the resulting @newprot.
 */
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
	if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot)))
		newprot = pgprot_noncached(newprot);
	if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot)))
		newprot = pgprot_writecombine(newprot);
	if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot)))
		newprot = pgprot_device(newprot);
	return newprot;
}
#endif
#endif /* CONFIG_MMU */

#ifndef pgprot_encrypted
#define pgprot_encrypted(prot)	(prot)
#endif

#ifndef pgprot_decrypted
#define pgprot_decrypted(prot)	(prot)
#endif

/*
 * A facility to provide batching of the reload of page tables and
 * other process state with the actual context switch code for
 * paravirtualized guests.  By convention, only one of the batched
 * update (lazy) modes (CPU, MMU) should be active at any given time,
 * entry should never be nested, and entry and exits should always be
 * paired.  This is for sanity of maintaining and reasoning about the
 * kernel code.  In this case, the exit (end of the context switch) is
 * in architecture-specific code, and so doesn't need a generic
 * definition.
 */
#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
#define arch_start_context_switch(prev)	do {} while (0)
#endif

/*
 * Some platforms can customize the PTE soft-dirty bit making it unavailable
 * even if the architecture provides the resource.
 * Adding this API allows architectures to add their own checks for the
 * devices on which the kernel is running.
* Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY
 * is part of this macro.
 */
#ifndef pgtable_supports_soft_dirty
#define pgtable_supports_soft_dirty()	IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)
#endif

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* No THP migration: the PMD swap soft-dirty helpers are no-ops. */
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
	return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
	return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
	return pmd;
}
#endif
#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
/*
 * No arch soft-dirty support: the test helpers return 0 and the mk/clear
 * helpers return their argument unchanged.
 */
static inline int pte_soft_dirty(pte_t pte)
{
	return 0;
}

static inline int pmd_soft_dirty(pmd_t pmd)
{
	return 0;
}

static inline pte_t pte_mksoft_dirty(pte_t pte)
{
	return pte;
}

static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
	return pmd;
}

static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
	return pte;
}

static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
	return pmd;
}

static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
	return pte;
}

static inline int pte_swp_soft_dirty(pte_t pte)
{
	return 0;
}

static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
	return pte;
}

static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
	return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
	return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
	return pmd;
}
#endif

#ifndef __HAVE_PFNMAP_TRACKING
/*
 * Interfaces that can be used by architecture code to keep track of
 * memory type of pfn mappings specified by the remap_pfn_range,
 * vmf_insert_pfn.
*/

/* Stubs used without pfnmap tracking: report success and leave @prot alone. */
static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size,
		pgprot_t *prot)
{
	return 0;
}

static inline int pfnmap_track(unsigned long pfn, unsigned long size,
		pgprot_t *prot)
{
	return 0;
}

static inline void pfnmap_untrack(unsigned long pfn, unsigned long size)
{
}
#else
/**
 * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range
 * @pfn: the start of the pfn range
 * @size: the size of the pfn range in bytes
 * @prot: the pgprot to modify
 *
 * Lookup the cachemode for the pfn range starting at @pfn with the size
 * @size and store it in @prot, leaving other data in @prot unchanged.
 *
 * This allows for a hardware implementation to have fine-grained control of
 * memory cache behavior at page level granularity. Without a hardware
 * implementation, this function does nothing.
 *
 * Currently there is only one implementation for this - x86 Page Attribute
 * Table (PAT). See Documentation/arch/x86/pat.rst for more details.
 *
 * This function can fail if the pfn range spans pfns that require differing
 * cachemodes. If the pfn range was previously verified to have a single
 * cachemode, it is sufficient to query only a single pfn. The assumption is
 * that this is the case for drivers using the vmf_insert_pfn*() interface.
 *
 * Returns 0 on success and -EINVAL on error.
 */
int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size,
		pgprot_t *prot);

/**
 * pfnmap_track - track a pfn range
 * @pfn: the start of the pfn range
 * @size: the size of the pfn range in bytes
 * @prot: the pgprot to track
 *
 * Request the pfn range to be 'tracked' by a hardware implementation and
 * setup the cachemode in @prot similar to pfnmap_setup_cachemode().
 *
 * This allows for fine-grained control of memory cache behaviour at page
 * level granularity. Tracking memory this way is persisted across VMA splits
 * (VMA merging does not apply for VM_PFNMAP).
 *
 * Currently, there is only one implementation for this - x86 Page Attribute
 * Table (PAT).
See Documentation/arch/x86/pat.rst for more details.
 *
 * Returns 0 on success and -EINVAL on error.
 */
int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot);

/**
 * pfnmap_untrack - untrack a pfn range
 * @pfn: the start of the pfn range
 * @size: the size of the pfn range in bytes
 *
 * Untrack a pfn range previously tracked through pfnmap_track().
 */
void pfnmap_untrack(unsigned long pfn, unsigned long size);
#endif

/**
 * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn
 * @pfn: the pfn
 * @prot: the pgprot to modify
 *
 * Lookup the cachemode for @pfn and store it in @prot, leaving other
 * data in @prot unchanged.
 *
 * See pfnmap_setup_cachemode() for details.
 */
static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot)
{
	/* Single-page range; the return value is deliberately not propagated. */
	pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot);
}

/*
 * ZERO_PAGE() is global shared page(s) that is always zero. It is used for
 * zero-mapped memory areas, CoW etc.
 *
 * On architectures that __HAVE_COLOR_ZERO_PAGE there are several such pages
 * for different ranges in the virtual address space.
 *
 * zero_page_pfn identifies the first (or the only) pfn for these pages.
 *
 * For architectures that don't __HAVE_COLOR_ZERO_PAGE the zero page lives in
 * empty_zero_page in BSS.
*/ void arch_setup_zero_pages(void); #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_page_pfn; unsigned long offset_from_zero_pfn = pfn - zero_page_pfn; return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } #define zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_page_pfn; return pfn == zero_page_pfn; } static inline unsigned long zero_pfn(unsigned long addr) { extern unsigned long zero_page_pfn; return zero_page_pfn; } extern uint8_t empty_zero_page[PAGE_SIZE]; extern struct page *__zero_page; static inline struct page *_zero_page(unsigned long addr) { return __zero_page; } #define ZERO_PAGE(vaddr) _zero_page(vaddr) #endif /* __HAVE_COLOR_ZERO_PAGE */ #ifdef CONFIG_MMU #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { return 0; } #ifndef pmd_write static inline int pmd_write(pmd_t pmd) { BUG(); return 0; } #endif /* pmd_write */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef pud_write static inline int pud_write(pud_t pud) { BUG(); return 0; } #endif /* pud_write */ #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; } #endif static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudval = pudp_get(pud); if (pud_none(pudval) || pud_trans_huge(pudval)) return 1; if (unlikely(pud_bad(pudval))) { pud_clear_bad(pud); return 1; } #endif return 0; } #ifndef CONFIG_NUMA_BALANCING /* * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is * perfectly valid to indicate "no" in that case, which is why our default * implementation defaults to "always no". * * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE * page protection due to NUMA hinting. 
NUMA hinting faults only apply in * accessible VMAs. * * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault, * looking at the VMA accessibility is sufficient. */ static inline int pte_protnone(pte_t pte) { return 0; } static inline int pmd_protnone(pmd_t pmd) { return 0; } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifndef __PAGETABLE_P4D_FOLDED int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot); void p4d_clear_huge(p4d_t *p4d); #else static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } #endif /* !__PAGETABLE_P4D_FOLDED */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } static inline int pud_clear_huge(pud_t *pud) { return 0; } static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) { return 0; } static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr) { return 0; } static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * ARCHes with special requirements for evicting THP backing TLB entries can * implement this. 
Otherwise also, it can help optimize normal TLB flush in * THP regime. Stock flush_tlb_range() typically has optimization to nuke the * entire TLB if flush span is greater than a threshold, which will * likely be true for a single huge page. Thus a single THP flush will * invalidate the entire TLB which is not desirable. * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() #define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif struct file; int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); #ifndef CONFIG_X86_ESPFIX64 static inline void init_espfix_bsp(void) { } #endif extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { return true; } static inline bool arch_has_pfn_modify_check(void) { return false; } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ /* * Architecture PAGE_KERNEL_* fallbacks * * Some architectures don't define certain PAGE_KERNEL_* flags. This is either * because they really don't support them, or the port needs to be updated to * reflect the required functionality. Below are a set of relatively safe * fallbacks, as best effort, which we can count on in lieu of the architectures * not defining them on their own yet. */ #ifndef PAGE_KERNEL_RO # define PAGE_KERNEL_RO PAGE_KERNEL #endif #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif /* * Page Table Modification bits for pgtbl_mod_mask. * * These are used by the p?d_alloc_track*() and p*d_populate_kernel() * functions in the generic vmalloc, ioremap and page table update code * to track at which page-table levels entries have been modified. 
* Based on that the code can better decide when page table changes need * to be synchronized to other page-tables in the system. */ #define __PGTBL_PGD_MODIFIED 0 #define __PGTBL_P4D_MODIFIED 1 #define __PGTBL_PUD_MODIFIED 2 #define __PGTBL_PMD_MODIFIED 3 #define __PGTBL_PTE_MODIFIED 4 #define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) #define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) #define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) #define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) #define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; enum pgtable_level { PGTABLE_LEVEL_PTE = 0, PGTABLE_LEVEL_PMD, PGTABLE_LEVEL_PUD, PGTABLE_LEVEL_P4D, PGTABLE_LEVEL_PGD, }; static inline const char *pgtable_level_to_str(enum pgtable_level level) { switch (level) { case PGTABLE_LEVEL_PTE: return "pte"; case PGTABLE_LEVEL_PMD: return "pmd"; case PGTABLE_LEVEL_PUD: return "pud"; case PGTABLE_LEVEL_P4D: return "p4d"; case PGTABLE_LEVEL_PGD: return "pgd"; default: return "unknown"; } } #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) #ifdef CONFIG_PHYS_ADDR_T_64BIT /* * ZSMALLOC needs to know the highest PFN on 32-bit architectures * with physical address space extension, but falls back to * BITS_PER_LONG otherwise. */ #error Missing MAX_POSSIBLE_PHYSMEM_BITS definition #else #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif #endif #ifndef has_transparent_hugepage #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif #ifndef has_transparent_pud_hugepage #define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) #endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. 
*/ #ifndef mm_p4d_folded #define mm_p4d_folded(mm) __is_defined(__PAGETABLE_P4D_FOLDED) #endif #ifndef mm_pud_folded #define mm_pud_folded(mm) __is_defined(__PAGETABLE_PUD_FOLDED) #endif #ifndef mm_pmd_folded #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED) #endif #ifndef p4d_offset_lockless #define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address) #endif #ifndef pud_offset_lockless #define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address) #endif #ifndef pmd_offset_lockless #define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address) #endif /* * pXd_leaf() is the API to check whether a pgtable entry is a huge page * mapping. It should work globally across all archs, without any * dependency on CONFIG_* options. For architectures that do not support * huge mappings on specific levels, below fallbacks will be used. * * A leaf pgtable entry should always imply the following: * * - It is a "present" entry. IOW, before using this API, please check it * with pXd_present() first. NOTE: it may not always mean the "present * bit" is set. For example, PROT_NONE entries are always "present". * * - It should _never_ be a swap entry of any type. Above "present" check * should have guarded this, but let's be crystal clear on this. * * - It should contain a huge PFN, which points to a huge page larger than * PAGE_SIZE of the platform. The PFN format isn't important here. * * - It should cover all kinds of huge mappings (i.e. pXd_trans_huge() * or hugetlb mappings). 
*/ #ifndef pgd_leaf #define pgd_leaf(x) false #endif #ifndef p4d_leaf #define p4d_leaf(x) false #endif #ifndef pud_leaf #define pud_leaf(x) false #endif #ifndef pmd_leaf #define pmd_leaf(x) false #endif #ifndef pgd_leaf_size #define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT) #endif #ifndef p4d_leaf_size #define p4d_leaf_size(x) P4D_SIZE #endif #ifndef pud_leaf_size #define pud_leaf_size(x) PUD_SIZE #endif #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif #ifndef __pte_leaf_size #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif #define __pte_leaf_size(x,y) pte_leaf_size(y) #endif /* * We always define pmd_pfn for all archs as it's used in lots of generic * code. Now it happens too for pud_pfn (and can happen for larger * mappings too in the future; we're not there yet). Instead of defining * it for all archs (like pmd_pfn), provide a fallback. * * Note that returning 0 here means any arch that didn't define this can * get severely wrong when it hits a real pud leaf. It's arch's * responsibility to properly define it when a huge pud is possible. */ #ifndef pud_pfn #define pud_pfn(x) 0 #endif /* * Some architectures have MMUs that are configurable or selectable at boot * time. These lead to variable PTRS_PER_x. For statically allocated arrays it * helps to have a static maximum value. */ #ifndef MAX_PTRS_PER_PTE #define MAX_PTRS_PER_PTE PTRS_PER_PTE #endif #ifndef MAX_PTRS_PER_PMD #define MAX_PTRS_PER_PMD PTRS_PER_PMD #endif #ifndef MAX_PTRS_PER_PUD #define MAX_PTRS_PER_PUD PTRS_PER_PUD #endif #ifndef MAX_PTRS_PER_P4D #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif #ifndef pte_pgprot #define pte_pgprot(x) ((pgprot_t) {0}) #endif #ifndef pmd_pgprot #define pmd_pgprot(x) ((pgprot_t) {0}) #endif #ifndef pud_pgprot #define pud_pgprot(x) ((pgprot_t) {0}) #endif /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. 
The expected * behavior is in parens: * * map_type prot * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (yes) yes w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and * MAP_PRIVATE (with Enhanced PAN supported): * r: (no) no * w: (no) no * x: (yes) yes */ #define DECLARE_VM_GET_PAGE_PROT \ pgprot_t vm_get_page_prot(vm_flags_t vm_flags) \ { \ return protection_map[vm_flags & \ (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]; \ } \ EXPORT_SYMBOL(vm_get_page_prot); #endif /* _LINUX_PGTABLE_H */
58 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright 2020 NXP */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <net/act_api.h> #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_gate.h> #include <net/tc_wrapper.h> static struct tc_action_ops act_gate_ops; static ktime_t gate_get_time(struct tcf_gate *gact) { ktime_t mono = ktime_get(); switch (gact->tk_offset) { case TK_OFFS_MAX: return mono; default: return ktime_mono_to_any(mono, gact->tk_offset); } return KTIME_MAX; } static void tcf_gate_params_free_rcu(struct rcu_head *head); static void gate_get_start_time(struct tcf_gate *gact, const struct tcf_gate_params *param, ktime_t *start) { ktime_t now, base, cycle; u64 n; base = ns_to_ktime(param->tcfg_basetime); now = 
gate_get_time(gact); if (ktime_after(base, now)) { *start = base; return; } cycle = param->tcfg_cycletime; n = div64_u64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); } static void gate_start_timer(struct tcf_gate *gact, ktime_t start) { ktime_t expires; expires = hrtimer_get_expires(&gact->hitimer); if (expires == 0) expires = KTIME_MAX; start = min_t(ktime_t, start, expires); hrtimer_start(&gact->hitimer, start, HRTIMER_MODE_ABS_SOFT); } static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) { struct tcf_gate *gact = container_of(timer, struct tcf_gate, hitimer); struct tcfg_gate_entry *next; struct tcf_gate_params *p; ktime_t close_time, now; spin_lock(&gact->tcf_lock); p = rcu_dereference_protected(gact->param, lockdep_is_held(&gact->tcf_lock)); next = gact->next_entry; /* cycle start, clear pending bit, clear total octets */ gact->current_gate_status = next->gate_state ? GATE_ACT_GATE_OPEN : 0; gact->current_entry_octets = 0; gact->current_max_octets = next->maxoctets; gact->current_close_time = ktime_add_ns(gact->current_close_time, next->interval); close_time = gact->current_close_time; if (list_is_last(&next->list, &p->entries)) next = list_first_entry(&p->entries, struct tcfg_gate_entry, list); else next = list_next_entry(next, list); now = gate_get_time(gact); if (ktime_after(now, close_time)) { ktime_t cycle, base; u64 n; cycle = p->tcfg_cycletime; base = ns_to_ktime(p->tcfg_basetime); n = div64_u64(ktime_sub_ns(now, base), cycle); close_time = ktime_add_ns(base, (n + 1) * cycle); } gact->next_entry = next; hrtimer_set_expires(&gact->hitimer, close_time); spin_unlock(&gact->tcf_lock); return HRTIMER_RESTART; } TC_INDIRECT_SCOPE int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_gate *gact = to_gate(a); int action = READ_ONCE(gact->tcf_action); tcf_lastuse_update(&gact->tcf_tm); tcf_action_update_bstats(&gact->common, skb); spin_lock(&gact->tcf_lock); if 
(unlikely(gact->current_gate_status & GATE_ACT_PENDING)) { spin_unlock(&gact->tcf_lock); return action; } if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) { spin_unlock(&gact->tcf_lock); goto drop; } if (gact->current_max_octets >= 0) { gact->current_entry_octets += qdisc_pkt_len(skb); if (gact->current_entry_octets > gact->current_max_octets) { spin_unlock(&gact->tcf_lock); goto overlimit; } } spin_unlock(&gact->tcf_lock); return action; overlimit: tcf_action_inc_overlimit_qstats(&gact->common); drop: tcf_action_inc_drop_qstats(&gact->common); return TC_ACT_SHOT; } static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = { [TCA_GATE_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_GATE_ENTRY_GATE] = { .type = NLA_FLAG }, [TCA_GATE_ENTRY_INTERVAL] = { .type = NLA_U32 }, [TCA_GATE_ENTRY_IPV] = { .type = NLA_S32 }, [TCA_GATE_ENTRY_MAX_OCTETS] = { .type = NLA_S32 }, }; static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = { [TCA_GATE_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_gate)), [TCA_GATE_PRIORITY] = { .type = NLA_S32 }, [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_GATE_BASE_TIME] = { .type = NLA_U64 }, [TCA_GATE_CYCLE_TIME] = { .type = NLA_U64 }, [TCA_GATE_CYCLE_TIME_EXT] = { .type = NLA_U64 }, [TCA_GATE_FLAGS] = { .type = NLA_U32 }, [TCA_GATE_CLOCKID] = { .type = NLA_S32 }, }; static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry, struct netlink_ext_ack *extack) { u32 interval = 0; entry->gate_state = nla_get_flag(tb[TCA_GATE_ENTRY_GATE]); if (tb[TCA_GATE_ENTRY_INTERVAL]) interval = nla_get_u32(tb[TCA_GATE_ENTRY_INTERVAL]); if (interval == 0) { NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL; } entry->interval = interval; entry->ipv = nla_get_s32_default(tb[TCA_GATE_ENTRY_IPV], -1); entry->maxoctets = nla_get_s32_default(tb[TCA_GATE_ENTRY_MAX_OCTETS], -1); return 0; } static int parse_gate_entry(struct nlattr *n, struct tcfg_gate_entry *entry, int index, struct netlink_ext_ack 
*extack) { struct nlattr *tb[TCA_GATE_ENTRY_MAX + 1] = { }; int err; err = nla_parse_nested(tb, TCA_GATE_ENTRY_MAX, n, entry_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; } entry->index = index; return fill_gate_entry(tb, entry, extack); } static void release_entry_list(struct list_head *entries) { struct tcfg_gate_entry *entry, *e; list_for_each_entry_safe(entry, e, entries, list) { list_del(&entry->list); kfree(entry); } } static int tcf_gate_copy_entries(struct tcf_gate_params *dst, const struct tcf_gate_params *src, struct netlink_ext_ack *extack) { struct tcfg_gate_entry *entry; int i = 0; list_for_each_entry(entry, &src->entries, list) { struct tcfg_gate_entry *new; new = kzalloc(sizeof(*new), GFP_ATOMIC); if (!new) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); return -ENOMEM; } new->index = entry->index; new->gate_state = entry->gate_state; new->interval = entry->interval; new->ipv = entry->ipv; new->maxoctets = entry->maxoctets; list_add_tail(&new->list, &dst->entries); i++; } dst->num_entries = i; return 0; } static int parse_gate_list(struct nlattr *list_attr, struct tcf_gate_params *sched, struct netlink_ext_ack *extack) { struct tcfg_gate_entry *entry; struct nlattr *n; int err, rem; int i = 0; if (!list_attr) return -EINVAL; nla_for_each_nested(n, list_attr, rem) { if (nla_type(n) != TCA_GATE_ONE_ENTRY) { NL_SET_ERR_MSG(extack, "Attribute isn't type 'entry'"); continue; } entry = kzalloc_obj(*entry, GFP_ATOMIC); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); err = -ENOMEM; goto release_list; } err = parse_gate_entry(n, entry, i, extack); if (err < 0) { kfree(entry); goto release_list; } list_add_tail(&entry->list, &sched->entries); i++; } sched->num_entries = i; return i; release_list: release_entry_list(&sched->entries); return err; } static bool gate_timer_needs_cancel(u64 basetime, u64 old_basetime, enum tk_offsets tko, enum tk_offsets old_tko, s32 clockid, 
s32 old_clockid) { return basetime != old_basetime || clockid != old_clockid || tko != old_tko; } static int gate_clock_resolve(s32 clockid, enum tk_offsets *tko, struct netlink_ext_ack *extack) { switch (clockid) { case CLOCK_REALTIME: *tko = TK_OFFS_REAL; return 0; case CLOCK_MONOTONIC: *tko = TK_OFFS_MAX; return 0; case CLOCK_BOOTTIME: *tko = TK_OFFS_BOOT; return 0; case CLOCK_TAI: *tko = TK_OFFS_TAI; return 0; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); return -EINVAL; } } static void gate_setup_timer(struct tcf_gate *gact, s32 clockid, enum tk_offsets tko) { WRITE_ONCE(gact->tk_offset, tko); hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, HRTIMER_MODE_ABS_SOFT); } static int tcf_gate_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); u64 cycletime = 0, basetime = 0, cycletime_ext = 0; struct tcf_gate_params *p = NULL, *old_p = NULL; enum tk_offsets old_tk_offset = TK_OFFS_TAI; const struct tcf_gate_params *cur_p = NULL; bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GATE_MAX + 1]; enum tk_offsets tko = TK_OFFS_TAI; struct tcf_chain *goto_ch = NULL; s32 timer_clockid = CLOCK_TAI; bool use_old_entries = false; s32 old_clockid = CLOCK_TAI; bool need_cancel = false; s32 clockid = CLOCK_TAI; struct tcf_gate *gact; struct tc_gate *parm; u64 old_basetime = 0; int ret = 0, err; u32 gflags = 0; s32 prio = -1; ktime_t start; u32 index; if (!nla) return -EINVAL; err = nla_parse_nested(tb, TCA_GATE_MAX, nla, gate_policy, extack); if (err < 0) return err; if (!tb[TCA_GATE_PARMS]) return -EINVAL; if (tb[TCA_GATE_CLOCKID]) clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); parm = nla_data(tb[TCA_GATE_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; if (err && bind) return ACT_P_BOUND; if (!err) { ret = tcf_idr_create_from_flags(tn, 
index, est, a, &act_gate_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } gact = to_gate(*a); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { err = -ENOMEM; goto chain_put; } INIT_LIST_HEAD(&p->entries); use_old_entries = !tb[TCA_GATE_ENTRY_LIST]; if (!use_old_entries) { err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); if (err < 0) goto err_free; use_old_entries = !err; } if (ret == ACT_P_CREATED && use_old_entries) { NL_SET_ERR_MSG(extack, "The entry list is empty"); err = -EINVAL; goto err_free; } if (ret != ACT_P_CREATED) { rcu_read_lock(); cur_p = rcu_dereference(gact->param); old_basetime = cur_p->tcfg_basetime; old_clockid = cur_p->tcfg_clockid; old_tk_offset = READ_ONCE(gact->tk_offset); basetime = old_basetime; cycletime_ext = cur_p->tcfg_cycletime_ext; prio = cur_p->tcfg_priority; gflags = cur_p->tcfg_flags; if (!tb[TCA_GATE_CLOCKID]) clockid = old_clockid; err = 0; if (use_old_entries) { err = tcf_gate_copy_entries(p, cur_p, extack); if (!err && !tb[TCA_GATE_CYCLE_TIME]) cycletime = cur_p->tcfg_cycletime; } rcu_read_unlock(); if (err) goto err_free; } if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); if (tb[TCA_GATE_BASE_TIME]) basetime = nla_get_u64(tb[TCA_GATE_BASE_TIME]); if (tb[TCA_GATE_FLAGS]) gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); if (tb[TCA_GATE_CYCLE_TIME]) cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); if (tb[TCA_GATE_CYCLE_TIME_EXT]) cycletime_ext = nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); err = gate_clock_resolve(clockid, &tko, extack); if (err) goto err_free; timer_clockid = clockid; need_cancel = ret != ACT_P_CREATED && gate_timer_needs_cancel(basetime, old_basetime, tko, old_tk_offset, timer_clockid, old_clockid); if (need_cancel) hrtimer_cancel(&gact->hitimer); 
spin_lock_bh(&gact->tcf_lock); if (!cycletime) { struct tcfg_gate_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &p->entries, list) cycle = ktime_add_ns(cycle, entry->interval); cycletime = cycle; } p->tcfg_cycletime = cycletime; p->tcfg_cycletime_ext = cycletime_ext; if (need_cancel || ret == ACT_P_CREATED) gate_setup_timer(gact, timer_clockid, tko); p->tcfg_priority = prio; p->tcfg_flags = gflags; p->tcfg_basetime = basetime; p->tcfg_clockid = timer_clockid; gate_get_start_time(gact, p, &start); old_p = rcu_replace_pointer(gact->param, p, lockdep_is_held(&gact->tcf_lock)); gact->current_close_time = start; gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; gact->next_entry = list_first_entry(&p->entries, struct tcfg_gate_entry, list); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); gate_start_timer(gact, start); spin_unlock_bh(&gact->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (old_p) call_rcu(&old_p->rcu, tcf_gate_params_free_rcu); return ret; err_free: release_entry_list(&p->entries); kfree(p); chain_put: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: /* action is not inserted in any list: it's safe to init hitimer * without taking tcf_lock. 
*/ if (ret == ACT_P_CREATED) gate_setup_timer(gact, timer_clockid, tko); tcf_idr_release(*a, bind); return err; } static void tcf_gate_params_free_rcu(struct rcu_head *head) { struct tcf_gate_params *p = container_of(head, struct tcf_gate_params, rcu); release_entry_list(&p->entries); kfree(p); } static void tcf_gate_cleanup(struct tc_action *a) { struct tcf_gate *gact = to_gate(a); struct tcf_gate_params *p; hrtimer_cancel(&gact->hitimer); p = rcu_dereference_protected(gact->param, 1); if (p) call_rcu(&p->rcu, tcf_gate_params_free_rcu); } static int dumping_entry(struct sk_buff *skb, struct tcfg_gate_entry *entry) { struct nlattr *item; item = nla_nest_start_noflag(skb, TCA_GATE_ONE_ENTRY); if (!item) return -ENOSPC; if (nla_put_u32(skb, TCA_GATE_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (entry->gate_state && nla_put_flag(skb, TCA_GATE_ENTRY_GATE)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GATE_ENTRY_INTERVAL, entry->interval)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_ENTRY_MAX_OCTETS, entry->maxoctets)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_ENTRY_IPV, entry->ipv)) goto nla_put_failure; return nla_nest_end(skb, item); nla_put_failure: nla_nest_cancel(skb, item); return -1; } static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_gate *gact = to_gate(a); struct tc_gate opt = { .index = gact->tcf_index, .refcnt = refcount_read(&gact->tcf_refcnt) - ref, .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, }; struct tcfg_gate_entry *entry; struct tcf_gate_params *p; struct nlattr *entry_list; struct tcf_t t; rcu_read_lock(); opt.action = READ_ONCE(gact->tcf_action); p = rcu_dereference(gact->param); if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_BASE_TIME, p->tcfg_basetime, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME, p->tcfg_cycletime, 
TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME_EXT, p->tcfg_cycletime_ext, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_CLOCKID, p->tcfg_clockid)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GATE_FLAGS, p->tcfg_flags)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_PRIORITY, p->tcfg_priority)) goto nla_put_failure; entry_list = nla_nest_start_noflag(skb, TCA_GATE_ENTRY_LIST); if (!entry_list) goto nla_put_failure; list_for_each_entry(entry, &p->entries, list) { if (dumping_entry(skb, entry) < 0) goto nla_put_failure; } nla_nest_end(skb, entry_list); tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) goto nla_put_failure; rcu_read_unlock(); return skb->len; nla_put_failure: rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_gate *gact = to_gate(a); struct tcf_t *tm = &gact->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static size_t tcf_gate_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_gate)); } static void tcf_gate_entry_destructor(void *priv) { struct action_gate_entry *oe = priv; kfree(oe); } static int tcf_gate_get_entries(struct flow_action_entry *entry, const struct tc_action *act) { entry->gate.entries = tcf_gate_get_list(act); if (!entry->gate.entries) return -EINVAL; entry->destructor = tcf_gate_entry_destructor; entry->destructor_priv = entry->gate.entries; return 0; } static int tcf_gate_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { int err; if (bind) { struct flow_action_entry *entry = entry_data; entry->id = FLOW_ACTION_GATE; entry->gate.prio = tcf_gate_prio(act); entry->gate.basetime = tcf_gate_basetime(act); entry->gate.cycletime = 
tcf_gate_cycletime(act); entry->gate.cycletimeext = tcf_gate_cycletimeext(act); entry->gate.num_entries = tcf_gate_num_entries(act); err = tcf_gate_get_entries(entry, act); if (err) return err; *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; fl_action->id = FLOW_ACTION_GATE; } return 0; } static struct tc_action_ops act_gate_ops = { .kind = "gate", .id = TCA_ID_GATE, .owner = THIS_MODULE, .act = tcf_gate_act, .dump = tcf_gate_dump, .init = tcf_gate_init, .cleanup = tcf_gate_cleanup, .stats_update = tcf_gate_stats_update, .get_fill_size = tcf_gate_get_fill_size, .offload_act_setup = tcf_gate_offload_act_setup, .size = sizeof(struct tcf_gate), }; MODULE_ALIAS_NET_ACT("gate"); static __net_init int gate_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); return tc_action_net_init(net, tn, &act_gate_ops); } static void __net_exit gate_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_gate_ops.net_id); } static struct pernet_operations gate_net_ops = { .init = gate_init_net, .exit_batch = gate_exit_net, .id = &act_gate_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init gate_init_module(void) { return tcf_register_action(&act_gate_ops, &gate_net_ops); } static void __exit gate_cleanup_module(void) { tcf_unregister_action(&act_gate_ops, &gate_net_ops); } module_init(gate_init_module); module_exit(gate_cleanup_module); MODULE_DESCRIPTION("TC gate action"); MODULE_LICENSE("GPL v2");
3 3 3 2 1 3 3 3 3 3 1 1 1 1 1 9 9 9 8 9 9 8 1 1 3 2 6 5 6 6 1 6 6 3 7 7 3 7 7 3 3 4 4 4 3 8 2 1 6 1 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/net_tstamp.h>
#include <linux/phy.h>
#include <linux/phy_link_topology.h>
#include <linux/ptp_clock_kernel.h>
#include <net/netdev_lock.h>

#include "bitset.h"
#include "common.h"
#include "netlink.h"
#include "ts.h"

/* Request state: the common ethnl request plus an optional provider filter. */
struct tsinfo_req_info {
	struct ethnl_req_info		base;
	struct hwtstamp_provider_desc	hwprov_desc;
};

/* Reply state: the common ethnl reply plus timestamping info and stats. */
struct tsinfo_reply_data {
	struct ethnl_reply_data		base;
	struct kernel_ethtool_ts_info	ts_info;
	struct ethtool_ts_stats		stats;
};

/* Recover the tsinfo-specific request from its embedded base member. */
#define TSINFO_REQINFO(__req_base) \
	container_of(__req_base, struct tsinfo_req_info, base)

/* Recover the tsinfo-specific reply from its embedded base member. */
#define TSINFO_REPDATA(__reply_base) \
	container_of(__reply_base, struct tsinfo_reply_data, base)

/* Number of stat attribute IDs that follow ETHTOOL_A_TS_STAT_UNSPEC. */
#define ETHTOOL_TS_STAT_CNT \
	(__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1))

/* Attribute policy for TSINFO_GET: a header and an optional provider nest. */
const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = {
	[ETHTOOL_A_TSINFO_HEADER]		=
		NLA_POLICY_NESTED(ethnl_header_policy_stats),
	[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER] =
		NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy),
};

/*
 * Parse a nested hwtstamp provider attribute into @hwprov_desc.
 *
 * Both the INDEX and the QUALIFIER sub-attributes are required; a missing
 * one yields -EINVAL.  The values are stored through ethnl_update_u32(),
 * which is also handed @mod.
 *
 * Returns 0 on success or a negative error code.
 */
int ts_parse_hwtst_provider(const struct nlattr *nest,
			    struct hwtstamp_provider_desc *hwprov_desc,
			    struct netlink_ext_ack *extack,
			    bool *mod)
{
	struct nlattr *tb[ARRAY_SIZE(ethnl_ts_hwtst_prov_policy)];
	int ret;

	ret = nla_parse_nested(tb,
			       ARRAY_SIZE(ethnl_ts_hwtst_prov_policy) - 1,
			       nest,
			       ethnl_ts_hwtst_prov_policy, extack);
	if (ret < 0)
		return ret;

	if (NL_REQ_ATTR_CHECK(extack, nest, tb,
			      ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX) ||
	    NL_REQ_ATTR_CHECK(extack, nest, tb,
			      ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER))
		return -EINVAL;

	ethnl_update_u32(&hwprov_desc->index,
			 tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX],
			 mod);
	ethnl_update_u32(&hwprov_desc->qualifier,
			 tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER],
			 mod);

	return 0;
}

/*
 * Parse the tsinfo-specific part of a request.
 *
 * The provider index starts out as -1; tsinfo_prepare_data() treats that
 * value as "no provider requested".  Asking for statistics together with
 * a provider is rejected with -EOPNOTSUPP.
 */
static int
tsinfo_parse_request(struct ethnl_req_info *req_base,
		     const struct genl_info *info,
		     struct nlattr **tb,
		     struct netlink_ext_ack *extack)
{
	struct tsinfo_req_info *req = TSINFO_REQINFO(req_base);
	bool mod = false;

	req->hwprov_desc.index = -1;

	if (!tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER])
		return 0;

	if (req_base->flags & ETHTOOL_FLAG_STATS) {
		NL_SET_ERR_MSG(extack,
			       "can't query statistics for a provider");
		return -EOPNOTSUPP;
	}

	return ts_parse_hwtst_provider(tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER],
				       &req->hwprov_desc, extack, &mod);
}

/*
 * Collect the data for a reply.
 *
 * Everything runs between ethnl_ops_begin() and ethnl_ops_complete().
 * When a provider was requested, only that provider is queried and the
 * function returns right away.  Otherwise the stats are initialized and
 * filled if requested and the driver implements get_ts_stats, and the
 * device's timestamping info is fetched.
 */
static int tsinfo_prepare_data(const struct ethnl_req_info *req_base,
			       struct ethnl_reply_data *reply_base,
			       const struct genl_info *info)
{
	struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
	struct tsinfo_req_info *req = TSINFO_REQINFO(req_base);
	struct net_device *dev = reply_base->dev;
	int ret;

	ret = ethnl_ops_begin(dev);
	if (ret < 0)
		return ret;

	/* A provider was requested: answer for that provider only. */
	if (req->hwprov_desc.index != -1) {
		ret = ethtool_get_ts_info_by_phc(dev, &data->ts_info,
						 &req->hwprov_desc);
		ethnl_ops_complete(dev);
		return ret;
	}

	if (req_base->flags & ETHTOOL_FLAG_STATS) {
		ethtool_stats_init((u64 *)&data->stats,
				   sizeof(data->stats) / sizeof(u64));
		if (dev->ethtool_ops->get_ts_stats)
			dev->ethtool_ops->get_ts_stats(dev, &data->stats);
	}

	ret = __ethtool_get_ts_info(dev, &data->ts_info);
	ethnl_ops_complete(dev);

	return ret;
}

/*
 * Compute the attribute space needed by tsinfo_fill_reply().
 *
 * Each term is added under the same condition that tsinfo_fill_reply()
 * uses to emit the matching attribute, except that the stats term always
 * reserves room for all ETHTOOL_TS_STAT_CNT counters.
 *
 * Returns the size in bytes or a negative error from the bitset helpers.
 */
static int tsinfo_reply_size(const struct ethnl_req_info *req_base,
			     const struct ethnl_reply_data *reply_base)
{
	const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
	const struct kernel_ethtool_ts_info *ts_info = &data->ts_info;
	int len = 0;
	int ret;

	/* The 32-bit bitset helpers used below cover at most 32 bits. */
	BUILD_BUG_ON(__SOF_TIMESTAMPING_CNT > 32);
	BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32);
	BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32);

	if (ts_info->so_timestamping) {
		ret = ethnl_bitset32_size(&ts_info->so_timestamping, NULL,
					  __SOF_TIMESTAMPING_CNT,
					  sof_timestamping_names, compact);
		if (ret < 0)
			return ret;
		len += ret;	/* _TSINFO_TIMESTAMPING */
	}
	if (ts_info->tx_types) {
		ret = ethnl_bitset32_size(&ts_info->tx_types, NULL,
					  __HWTSTAMP_TX_CNT,
					  ts_tx_type_names, compact);
		if (ret < 0)
			return ret;
		len += ret;	/* _TSINFO_TX_TYPES */
	}
	if (ts_info->rx_filters) {
		ret = ethnl_bitset32_size(&ts_info->rx_filters, NULL,
					  __HWTSTAMP_FILTER_CNT,
					  ts_rx_filter_names, compact);
		if (ret < 0)
			return ret;
		len += ret;	/* _TSINFO_RX_FILTERS */
	}
	if (ts_info->phc_index >= 0) {
		len += nla_total_size(sizeof(u32));	/* _TSINFO_PHC_INDEX */
		/* _TSINFO_HWTSTAMP_PROVIDER */
		len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32));
	}
	if (ts_info->phc_source) {
		len += nla_total_size(sizeof(u32));	/* _TSINFO_HWTSTAMP_SOURCE */
		if (ts_info->phc_phyindex)
			/* _TSINFO_HWTSTAMP_PHYINDEX */
			len += nla_total_size(sizeof(u32));
	}
	if (req_base->flags & ETHTOOL_FLAG_STATS)
		len += nla_total_size(0) + /* _TSINFO_STATS */
		       nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT;

	return len;
}

/*
 * Emit one statistic as attribute @attrtype.  A value equal to
 * ETHTOOL_STAT_NOT_SET is skipped without error.
 *
 * Returns 0 on success or -EMSGSIZE if the attribute does not fit.
 */
static int tsinfo_put_stat(struct sk_buff *skb, u64 val, u16 attrtype)
{
	if (val == ETHTOOL_STAT_NOT_SET)
		return 0;
	if (nla_put_uint(skb, attrtype, val))
		return -EMSGSIZE;
	return 0;
}

/*
 * Emit the ETHTOOL_A_TSINFO_STATS nest with the TX counters.  The nest is
 * cancelled if any counter fails to fit.
 *
 * Returns 0 on success or -EMSGSIZE.
 */
static int tsinfo_put_stats(struct sk_buff *skb,
			    const struct ethtool_ts_stats *stats)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_STATS);
	if (!nest)
		return -EMSGSIZE;

	if (tsinfo_put_stat(skb, stats->tx_stats.pkts,
			    ETHTOOL_A_TS_STAT_TX_PKTS) ||
	    tsinfo_put_stat(skb, stats->tx_stats.onestep_pkts_unconfirmed,
			    ETHTOOL_A_TS_STAT_TX_ONESTEP_PKTS_UNCONFIRMED) ||
	    tsinfo_put_stat(skb, stats->tx_stats.lost,
			    ETHTOOL_A_TS_STAT_TX_LOST) ||
	    tsinfo_put_stat(skb, stats->tx_stats.err,
			    ETHTOOL_A_TS_STAT_TX_ERR))
		goto err_cancel;

	nla_nest_end(skb, nest);
	return 0;

err_cancel:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}

/*
 * Write the reply attributes into @skb.
 *
 * The three bitsets are emitted only when non-zero.  The PHC index and
 * the provider nest are emitted only when phc_index >= 0.  The source is
 * emitted when phc_source is set, followed by the PHY index when that is
 * set too.  Stats are emitted only when the request asked for them.
 *
 * Returns 0 on success or a negative error code.
 */
static int tsinfo_fill_reply(struct sk_buff *skb,
			     const struct ethnl_req_info *req_base,
			     const struct ethnl_reply_data *reply_base)
{
	const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
	const struct kernel_ethtool_ts_info *ts_info = &data->ts_info;
	int ret;

	if (ts_info->so_timestamping) {
		ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TIMESTAMPING,
					 &ts_info->so_timestamping, NULL,
					 __SOF_TIMESTAMPING_CNT,
					 sof_timestamping_names, compact);
		if (ret < 0)
			return ret;
	}
	if (ts_info->tx_types) {
		ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TX_TYPES,
					 &ts_info->tx_types, NULL,
					 __HWTSTAMP_TX_CNT,
					 ts_tx_type_names, compact);
		if (ret < 0)
			return ret;
	}
	if (ts_info->rx_filters) {
		ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_RX_FILTERS,
					 &ts_info->rx_filters, NULL,
					 __HWTSTAMP_FILTER_CNT,
					 ts_rx_filter_names, compact);
		if (ret < 0)
			return ret;
	}
	if (ts_info->phc_index >= 0) {
		struct nlattr *nest;

		ret = nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX,
				  ts_info->phc_index);
		if (ret)
			return -EMSGSIZE;

		nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER);
		if (!nest)
			return -EMSGSIZE;

		if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX,
				ts_info->phc_index) ||
		    nla_put_u32(skb,
				ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER,
				ts_info->phc_qualifier)) {
			nla_nest_cancel(skb, nest);
			return -EMSGSIZE;
		}

		nla_nest_end(skb, nest);
	}
	if (ts_info->phc_source) {
		if (nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_SOURCE,
				ts_info->phc_source))
			return -EMSGSIZE;

		if (ts_info->phc_phyindex &&
		    nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PHYINDEX,
				ts_info->phc_phyindex))
			return -EMSGSIZE;
	}
	if (req_base->flags & ETHTOOL_FLAG_STATS &&
	    tsinfo_put_stats(skb, &data->stats))
		return -EMSGSIZE;

	return 0;
}

/*
 * Dump cursor overlaid on cb->ctx (ethnl_tsinfo_start() checks that it
 * fits).  The pos_* members and netdev_dump_done record how far the dump
 * has progressed, so a later dumpit call resumes instead of restarting.
 */
struct ethnl_tsinfo_dump_ctx {
	struct tsinfo_req_info		*req_info;
	struct tsinfo_reply_data	*reply_data;
	unsigned long			pos_ifindex;
	bool				netdev_dump_done;
	unsigned long			pos_phyindex;
	enum hwtstamp_provider_qualifier pos_phcqualifier;
};

/*
 * Start one dump message and reset the shared reply buffer for @dev.
 *
 * Note that the @reply_data argument is immediately replaced by
 * ctx->reply_data, so the caller's value is not used.
 *
 * Returns the message header, or ERR_PTR(-EMSGSIZE) if @skb is full.
 */
static void *ethnl_tsinfo_prepare_dump(struct sk_buff *skb,
				       struct net_device *dev,
				       struct tsinfo_reply_data *reply_data,
				       struct netlink_callback *cb)
{
	struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
	void *ehdr = NULL;

	ehdr = ethnl_dump_put(skb, cb,
			      ETHTOOL_MSG_TSINFO_GET_REPLY);
	if (!ehdr)
		return ERR_PTR(-EMSGSIZE);

	reply_data = ctx->reply_data;
	memset(reply_data, 0, sizeof(*reply_data));
	reply_data->base.dev = dev;
	reply_data->ts_info.cmd = ETHTOOL_GET_TS_INFO;
	/* -1 marks "no PHC"; the fill path only emits PHC data for >= 0. */
	reply_data->ts_info.phc_index = -1;

	return ehdr;
}

/*
 * Finish one dump message: always advertise the software timestamping
 * bits, write the reply header and attributes, then close the message.
 *
 * On failure the message is left open; the callers cancel it.
 */
static int ethnl_tsinfo_end_dump(struct sk_buff *skb,
				 struct net_device *dev,
				 struct tsinfo_req_info *req_info,
				 struct tsinfo_reply_data *reply_data,
				 void *ehdr)
{
	int ret;

	reply_data->ts_info.so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE |
					       SOF_TIMESTAMPING_SOFTWARE;

	ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TSINFO_HEADER);
	if (ret < 0)
		return ret;

	ret = tsinfo_fill_reply(skb, &req_info->base, &reply_data->base);
	if (ret < 0)
		return ret;

	reply_data->base.dev = NULL;
	genlmsg_end(skb, ehdr);

	return ret;
}

/*
 * Dump the timestamping info of one PHY as a single message.
 *
 * Returns -EOPNOTSUPP when the PHY has no tsinfo support.  On any later
 * failure the partially built message is cancelled.
 */
static int ethnl_tsinfo_dump_one_phydev(struct sk_buff *skb,
					struct net_device *dev,
					struct phy_device *phydev,
					struct netlink_callback *cb)
{
	struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
	struct tsinfo_reply_data *reply_data;
	struct tsinfo_req_info *req_info;
	void *ehdr = NULL;
	int ret = 0;

	if (!phy_has_tsinfo(phydev))
		return -EOPNOTSUPP;

	reply_data = ctx->reply_data;
	req_info = ctx->req_info;
	ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb);
	if (IS_ERR(ehdr))
		return PTR_ERR(ehdr);

	ret = phy_ts_info(phydev, &reply_data->ts_info);
	if (ret < 0)
		goto err;

	if (reply_data->ts_info.phc_index >= 0) {
		reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_PHYLIB;
		reply_data->ts_info.phc_phyindex = phydev->phyindex;
	}

	ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr);
	if (ret < 0)
		goto err;

	return ret;
err:
	genlmsg_cancel(skb, ehdr);
	return ret;
}

/*
 * Dump the netdev's own timestamping info, one message per supported
 * provider qualifier.
 *
 * The loop resumes from ctx->pos_phcqualifier and advances it in place,
 * so an interrupted dump continues at the qualifier that did not fit.
 *
 * Returns -EOPNOTSUPP when the driver has no get_ts_info callback.
 */
static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb,
					struct net_device *dev,
					struct netlink_callback *cb)
{
	struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
	const struct ethtool_ops *ops = dev->ethtool_ops;
	struct tsinfo_reply_data *reply_data;
	struct tsinfo_req_info *req_info;
	void *ehdr = NULL;
	int ret = 0;

	if (!ops->get_ts_info)
		return -EOPNOTSUPP;

	reply_data = ctx->reply_data;
	req_info = ctx->req_info;
	for (; ctx->pos_phcqualifier < HWTSTAMP_PROVIDER_QUALIFIER_CNT;
	     ctx->pos_phcqualifier++) {
		if (!net_support_hwtstamp_qualifier(dev,
						    ctx->pos_phcqualifier))
			continue;

		ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb);
		if (IS_ERR(ehdr))
			return PTR_ERR(ehdr);

		reply_data->ts_info.phc_qualifier = ctx->pos_phcqualifier;
		ret = ops->get_ts_info(dev, &reply_data->ts_info);
		if (ret < 0)
			goto err;

		if (reply_data->ts_info.phc_index >= 0)
			reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_NETDEV;
		ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data,
					    ehdr);
		if (ret < 0)
			goto err;
	}

	return ret;

err:
	genlmsg_cancel(skb, ehdr);
	return ret;
}

/*
 * Dump everything attached to one netdev: first the netdev itself (once,
 * tracked by ctx->netdev_dump_done), then its PHYs.
 *
 * Without a link topology only dev->phydev is considered.  Otherwise the
 * topology's PHYs are walked starting at ctx->pos_phyindex, which the
 * iterator also updates.  -EOPNOTSUPP from a sub-dump does not abort the
 * walk; any other negative value is returned immediately.
 */
static int ethnl_tsinfo_dump_one_net_topo(struct sk_buff *skb,
					  struct net_device *dev,
					  struct netlink_callback *cb)
{
	struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
	struct phy_device_node *pdn;
	int ret = 0;

	if (!ctx->netdev_dump_done) {
		ret = ethnl_tsinfo_dump_one_netdev(skb, dev, cb);
		if (ret < 0 && ret != -EOPNOTSUPP)
			return ret;
		ctx->netdev_dump_done = true;
	}

	if (!dev->link_topo) {
		if (phy_has_tsinfo(dev->phydev)) {
			ret = ethnl_tsinfo_dump_one_phydev(skb, dev,
							   dev->phydev, cb);
			if (ret < 0 && ret != -EOPNOTSUPP)
				return ret;
		}

		return 0;
	}

	xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn,
			  ctx->pos_phyindex) {
		if (phy_has_tsinfo(pdn->phy)) {
			ret = ethnl_tsinfo_dump_one_phydev(skb, dev,
							   pdn->phy, cb);
			if (ret < 0 && ret != -EOPNOTSUPP)
				return ret;
		}
	}

	return ret;
}

/*
 * Dump callback.  Runs under rtnl_lock() and takes the per-device ops
 * lock around each device.
 *
 * When the request named a device, only that device is dumped.  Otherwise
 * all devices of the namespace are walked from ctx->pos_ifindex; after a
 * device completes, the per-device cursors are reset for the next one.
 * The walk stops on any error other than -EOPNOTSUPP.
 */
int ethnl_tsinfo_dumpit(struct sk_buff *skb,
			struct netlink_callback *cb)
{
	struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
	struct net *net = sock_net(skb->sk);
	struct net_device *dev;
	int ret = 0;

	rtnl_lock();
	if (ctx->req_info->base.dev) {
		dev = ctx->req_info->base.dev;
		netdev_lock_ops(dev);
		ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb);
		netdev_unlock_ops(dev);
	} else {
		for_each_netdev_dump(net, dev, ctx->pos_ifindex) {
			netdev_lock_ops(dev);
			ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb);
			netdev_unlock_ops(dev);
			if (ret < 0 && ret != -EOPNOTSUPP)
				break;
			ctx->pos_phyindex = 0;
			ctx->netdev_dump_done = false;
			ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE;
		}
	}
	rtnl_unlock();

	return ret;
}

/*
 * Dump start callback: allocate the request and reply buffers, parse the
 * request header and initialize the cursor in cb->ctx.
 *
 * Requests with ETHTOOL_FLAG_STATS are rejected with -EOPNOTSUPP.  On
 * failure everything acquired so far is released through the labels at
 * the end, in reverse order of acquisition.
 */
int ethnl_tsinfo_start(struct netlink_callback *cb)
{
	const struct genl_dumpit_info *info = genl_dumpit_info(cb);
	struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
	struct nlattr **tb = info->info.attrs;
	struct tsinfo_reply_data *reply_data;
	struct tsinfo_req_info *req_info;
	int ret;

	/* The cursor lives inside cb->ctx, so it must fit there. */
	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));

	req_info = kzalloc_obj(*req_info);
	if (!req_info)
		return -ENOMEM;
	reply_data = kzalloc_obj(*reply_data);
	if (!reply_data) {
		ret = -ENOMEM;
		goto free_req_info;
	}

	ret = ethnl_parse_header_dev_get(&req_info->base,
					 tb[ETHTOOL_A_TSINFO_HEADER],
					 sock_net(cb->skb->sk), cb->extack,
					 false);
	if (ret < 0)
		goto free_reply_data;

	if (req_info->base.flags & ETHTOOL_FLAG_STATS) {
		NL_SET_ERR_MSG(cb->extack, "stats not supported in dump");
		ret = -EOPNOTSUPP;
		goto err_dev_put;
	}

	ctx->req_info = req_info;
	ctx->reply_data = reply_data;
	ctx->pos_ifindex = 0;
	ctx->pos_phyindex = 0;
	ctx->netdev_dump_done = false;
	ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE;

	return 0;

err_dev_put:
	ethnl_parse_header_dev_put(&req_info->base);
free_reply_data:
	kfree(reply_data);
free_req_info:
	kfree(req_info);

	return ret;
}

/*
 * Dump done callback: release what ethnl_tsinfo_start() stored in the
 * cursor (the header's device reference and both buffers).
 */
int ethnl_tsinfo_done(struct netlink_callback *cb)
{
	struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
	struct tsinfo_req_info *req_info = ctx->req_info;

	ethnl_parse_header_dev_put(&req_info->base);
	kfree(ctx->reply_data);
	kfree(ctx->req_info);

	return 0;
}

/* Request ops wiring the handlers above into the TSINFO_GET command. */
const struct ethnl_request_ops ethnl_tsinfo_request_ops = {
	.request_cmd		= ETHTOOL_MSG_TSINFO_GET,
	.reply_cmd		= ETHTOOL_MSG_TSINFO_GET_REPLY,
	.hdr_attr		= ETHTOOL_A_TSINFO_HEADER,
	.req_info_size		= sizeof(struct tsinfo_req_info),
	.reply_data_size	= sizeof(struct tsinfo_reply_data),

	.parse_request		= tsinfo_parse_request,
	.prepare_data		= tsinfo_prepare_data,
	.reply_size		= tsinfo_reply_size,
	.fill_reply		= tsinfo_fill_reply,
};
15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __PSP_PSP_H #define __PSP_PSP_H #include <linux/list.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <net/netns/generic.h> #include <net/psp.h> #include <net/sock.h> extern struct xarray psp_devs; extern struct mutex psp_devs_lock; void psp_dev_free(struct psp_dev *psd); int psp_dev_check_access(struct psp_dev *psd, struct net *net); void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); struct psp_assoc *psp_assoc_create(struct psp_dev *psd); struct psp_dev *psp_dev_get_for_sock(struct sock *sk); void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas); int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, struct psp_key_parsed *key, struct netlink_ext_ack *extack); int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, u32 version, struct psp_key_parsed *key, struct netlink_ext_ack *extack); void psp_assocs_key_rotated(struct psp_dev *psd); static inline void psp_dev_get(struct psp_dev *psd) { refcount_inc(&psd->refcnt); } static inline bool psp_dev_tryget(struct psp_dev *psd) { return refcount_inc_not_zero(&psd->refcnt); } static inline void psp_dev_put(struct psp_dev *psd) { if (refcount_dec_and_test(&psd->refcnt)) psp_dev_free(psd); } static inline bool psp_dev_is_registered(struct psp_dev *psd) { lockdep_assert_held(&psd->lock); return !!psd->ops; } #endif /* __PSP_PSP_H */
8 7 1 58 58 29 30 30 25 21 30 30 30 30 30 30 30 30 30 251 17 236 37 37 37 37 37 15 15 15 15 15 37 37 37 37 33 33 33 8 30 33 25 25 5 20 4 22 17 31 31 31 73 72 73 71 73 73 33 42 23 42 9 17 24 24 50 50 50 2 49 1 3 15 15 13 1 15 116 115 2 9 109 85 27 6 80 18 66 66 66 61 15 31 6 25 16 14 3 4 4 261 4 252 6 121 55 13 42 42 954 2 958 62 626 955 70 815 245 37 651 1 684 673 49 55 687 638 587 27 2 25 25 2 2 99 100 8 95 76 24 6 71 71 4 4 1 31 1 27 2 29 10 60 59 60 60 25 4 21 15 11 1 4 4 947 17 950 857 856 54 624 195 62 738 206 636 639 436 432 639 35 37 1 36 5 35 6 36 3 3 3 2 1 27 27 19 58 58 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 
380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 
880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 
1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 
1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 
2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 
2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the Netfilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others.
 *
 * Changes:
 *	Paul `Rusty' Russell		properly handle non-linear skbs
 *	Harald Welte			don't use nfcache
 */

#define pr_fmt(fmt) "IPVS: " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/sctp.h>
#include <linux/icmp.h>
#include <linux/slab.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/gue.h>
#include <net/gre.h>
#include <net/route.h>
#include <net/ip6_checksum.h>
#include <net/netns/generic.h>		/* net_generic() */

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <linux/netfilter_ipv6.h>
#include <net/ip6_route.h>
#endif

#include <net/ip_vs.h>
#include <linux/indirect_call_wrapper.h>


EXPORT_SYMBOL(register_ip_vs_scheduler);
EXPORT_SYMBOL(unregister_ip_vs_scheduler);
EXPORT_SYMBOL(ip_vs_proto_name);
EXPORT_SYMBOL(ip_vs_conn_new);
EXPORT_SYMBOL(ip_vs_conn_in_get);
EXPORT_SYMBOL(ip_vs_conn_out_get);
#ifdef CONFIG_IP_VS_PROTO_TCP
EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
#endif
EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
EXPORT_SYMBOL(ip_vs_new_conn_out);

/*
 * SNAT_CALL(f, ...): call handler @f with the given arguments.  When TCP
 * and/or UDP support is configured, the call goes through the
 * INDIRECT_CALL_* wrappers with the matching snat handlers as candidates;
 * with neither configured it is a plain call.
 */
#if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP)
#define SNAT_CALL(f, ...) \
	INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__)
#elif defined(CONFIG_IP_VS_PROTO_TCP)
#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__)
#elif defined(CONFIG_IP_VS_PROTO_UDP)
#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__)
#else
#define SNAT_CALL(f, ...) f(__VA_ARGS__)
#endif

static unsigned int ip_vs_net_id __read_mostly;
/* netns cnt used for uniqueness */
static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);

/* ID used in ICMP lookups */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
#define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier)

/*
 * Return a printable name for IP protocol number @proto.
 *
 * Known protocols map to string literals.  Any other value is formatted
 * as "IP_<number>" into a single static buffer, so that result is
 * overwritten by the next call that takes the default branch.
 */
const char *ip_vs_proto_name(unsigned int proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_SCTP:
		return "SCTP";
	case IPPROTO_ICMP:
		return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
	case IPPROTO_ICMPV6:
		return "ICMPv6";
#endif
	default:
		sprintf(buf, "IP_%u", proto);
		return buf;
	}
}

/* Initialize the first @rows list heads of @table, from last to first. */
void ip_vs_init_hash_table(struct list_head *table, int rows)
{
	while (--rows >= 0)
		INIT_LIST_HEAD(&table[rows]);
}

/* IPVS Resizable Hash Tables:
 * - list_bl buckets with bit lock
 *
 * Goals:
 * - RCU lookup for an entry can run in parallel with add/del/move operations
 * - hash keys can be in non-contiguous memory
 * - support entries with duplicate keys
 * - unlink entries without lookup, using the saved table and bucket id
 * - resizing can trigger on load change or depending on key refresh period
 * - customizable load factor to balance between speed and memory usage
 * - add/del/move operations should be allowed from any context
 *
 * Resizing:
 * - a new table is attached to the current table and all entries are moved
 * with a new hash key. Finally, the new table is installed as the current
 * one and the old table is released after an RCU grace period.
 * - RCU read-side critical sections will walk two tables while resizing is
 * in progress
 * - new entries are added to the new table
 * - entries will be deleted from the old or from the new table; the table_id
 * can be saved into the entry as part of the hash key to know where the
 * entry is hashed
 * - move operations may delay readers or cause them to retry on the modified
 * bucket. As a result, a searched entry will be found, but walkers that
 * operate on multiple entries may see the same entry twice if bucket walking
 * is retried.
 * - for the fast path, the number of entries (load) can be compared to
 * u_thresh and l_thresh to decide when to trigger table growing/shrinking.
 * They are calculated based on the load factor (shift count): a negative
 * value allows the load to be below 100% to reduce collisions by maintaining
 * a larger table, while a positive value tolerates collisions by using a
 * smaller table and a load above 100%: u_thresh(load) = size * (2 ^ lfactor)
 *
 * Locking:
 * - lock: protects seqc if contexts other than the resizer can move entries
 * - seqc: seqcount_t, delays/retries readers while entries are moved to
 * the new table on resizing
 * - bit lock: serializes bucket modifications
 * - writers may use other locking mechanisms to serialize operations for
 * resizing, moving and installing new tables
 */

/* Free a table right away: its bucket, seqcount and lock arrays, then @t. */
void ip_vs_rht_free(struct ip_vs_rht *t)
{
	kvfree(t->buckets);
	kvfree(t->seqc);
	kvfree(t->lock);
	kfree(t);
}

/* RCU callback form of ip_vs_rht_free(): @head is the table's rcu_head. */
void ip_vs_rht_rcu_free(struct rcu_head *head)
{
	struct ip_vs_rht *t;

	t = container_of(head, struct ip_vs_rht, rcu_head);
	ip_vs_rht_free(t);
}

struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks)
{
	struct ip_vs_rht *t = kzalloc(sizeof(*t), GFP_KERNEL);
	int i;

	if (!t)
		return NULL;
	if (scounts) {
		int ml = roundup_pow_of_two(nr_cpu_ids);

		scounts = min(scounts, buckets);
		scounts = min(scounts, ml);
		t->seqc = kvmalloc_array(scounts, sizeof(*t->seqc), GFP_KERNEL);
		if (!t->seqc)
			goto err;
		for (i = 0; i < scounts; i++)
			seqcount_init(&t->seqc[i]);

		if (locks) {
			locks = min(locks, scounts);
			t->lock = kvmalloc_array(locks, sizeof(*t->lock),
						 GFP_KERNEL);
			if (!t->lock)
				goto err;
			for (i = 0; i < locks; i++)
				spin_lock_init(&t->lock[i].l);
		}
	}

	t->buckets = kvmalloc_array(buckets, sizeof(*t->buckets), GFP_KERNEL);
	if (!t->buckets)
		goto err;
	for (i = 0; i < buckets; i++)
		INIT_HLIST_BL_HEAD(&t->buckets[i]);
	t->mask = buckets - 1;
	t->size = buckets;
	t->seqc_mask =
scounts - 1; t->lock_mask = locks - 1; t->u_thresh = buckets; t->l_thresh = buckets >> 4; t->bits = order_base_2(buckets); /* new_tbl points to self if no new table is filled */ RCU_INIT_POINTER(t->new_tbl, t); get_random_bytes(&t->hash_key, sizeof(t->hash_key)); return t; err: ip_vs_rht_free(t); return NULL; } /* Get the desired table size for n entries based on current table size and * by using the formula size = n / (2^lfactor) * lfactor: shift value for the load factor: * - >0: u_thresh=size << lfactor, for load factor above 100% * - <0: u_thresh=size >> -lfactor, for load factor below 100% * - 0: for load factor of 100% */ int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, int lfactor, int min_bits, int max_bits) { if (!t) return 1 << min_bits; n = n > 0 ? roundup_pow_of_two(n) : 1; if (lfactor < 0) { int factor = min(-lfactor, max_bits); n = min(n, 1 << (max_bits - factor)); n <<= factor; } else { n = min(n >> lfactor, 1 << max_bits); } if (lfactor != t->lfactor) return clamp(n, 1 << min_bits, 1 << max_bits); if (n > t->size) return n; if (n > t->size >> 4) return t->size; /* Shrink but keep it n * 2 to prevent frequent resizing */ return clamp(n << 1, 1 << min_bits, 1 << max_bits); } /* Set thresholds based on table size and load factor: * u_thresh = size * (2^lfactor) * l_thresh = u_thresh / 16 * u_thresh/l_thresh can be used to check if load triggers a table grow/shrink */ void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, int min_bits, int max_bits) { if (size >= 1 << max_bits) t->u_thresh = INT_MAX; /* stop growing */ else if (lfactor <= 0) t->u_thresh = size >> min(-lfactor, max_bits); else t->u_thresh = min(size, 1 << (30 - lfactor)) << lfactor; /* l_thresh: shrink when load is 16 times lower, can be 0 */ if (size >= 1 << max_bits) t->l_thresh = (1 << max_bits) >> 4; else if (size > 1 << min_bits) t->l_thresh = t->u_thresh >> 4; else t->l_thresh = 0; /* stop shrinking */ } /* Return hash value for 
local info (fast, insecure) */ u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, const union nf_inet_addr *addr, u32 v1, u32 v2) { u32 v3; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) v3 = ipv6_addr_hash(&addr->in6); else #endif v3 = addr->all[0]; return jhash_3words(v1, v2, v3, (u32)t->hash_key.key[0]); } static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; struct netns_ipvs *ipvs = cp->ipvs; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; struct ip_vs_service *svc; local_bh_disable(); s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.inpkts); u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.inpkts); u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.inpkts); u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); } } static inline void ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; struct netns_ipvs *ipvs = cp->ipvs; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; struct ip_vs_service *svc; local_bh_disable(); s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.outpkts); u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.outpkts); u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.outpkts); 
u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); } } static inline void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) { struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_cpu_stats *s; local_bh_disable(); s = this_cpu_ptr(cp->dest->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); local_bh_enable(); } static inline void ip_vs_set_state(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { if (likely(pd->pp->state_transition)) pd->pp->state_transition(cp, direction, skb, pd); } static inline int ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, struct sk_buff *skb, int protocol, const union nf_inet_addr *caddr, __be16 cport, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr, vport, p); p->pe = rcu_dereference(svc->pe); if (p->pe && p->pe->fill_param) return p->pe->fill_param(p, skb); return 0; } /* * IPVS persistent scheduling function * It creates a connection entry according to its template if exists, * or selects a server and creates a connection entry plus a template. 
* Locking: we are svc user (svc->refcnt), so we hold all dests too * Protocols supported: TCP, UDP */ static struct ip_vs_conn * ip_vs_sched_persist(struct ip_vs_service *svc, struct sk_buff *skb, __be16 src_port, __be16 dst_port, int *ignored, struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp = NULL; struct ip_vs_dest *dest; struct ip_vs_conn *ct; __be16 dport = 0; /* destination port to forward */ unsigned int flags; struct ip_vs_conn_param param; const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; union nf_inet_addr snet; /* source network of the client, after masking */ const union nf_inet_addr *src_addr, *dst_addr; if (likely(!ip_vs_iph_inverse(iph))) { src_addr = &iph->saddr; dst_addr = &iph->daddr; } else { src_addr = &iph->daddr; dst_addr = &iph->saddr; } /* Mask saddr with the netmask to adjust template granularity */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) ipv6_addr_prefix(&snet.in6, &src_addr->in6, (__force __u32) svc->netmask); else #endif snet.ip = src_addr->ip & svc->netmask; IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " "mnet %s\n", IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port), IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port), IP_VS_DBG_ADDR(svc->af, &snet)); /* * As far as we know, FTP is a very complicated network protocol, and * it uses control connection and data connections. For active FTP, * FTP server initialize data connection to the client, its source port * is often 20. For passive FTP, FTP server tells the clients the port * that it passively listens to, and the client issues the data * connection. In the tunneling or direct routing mode, the load * balancer is on the client-to-server half of connection, the port * number is unknown to the load balancer. So, a conn template like * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> * is created for other persistent services. 
*/ { int protocol = iph->protocol; const union nf_inet_addr *vaddr = dst_addr; __be16 vport = 0; if (dst_port == svc->port) { /* non-FTP template: * <protocol, caddr, 0, vaddr, vport, daddr, dport> * FTP template: * <protocol, caddr, 0, vaddr, 0, daddr, 0> */ if (svc->port != FTPPORT) vport = dst_port; } else { /* Note: persistent fwmark-based services and * persistent port zero service are handled here. * fwmark template: * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> * port zero template: * <protocol,caddr,0,vaddr,0,daddr,0> */ if (svc->fwmark) { protocol = IPPROTO_IP; vaddr = &fwmark; } } /* return *ignored = -1 so NF_DROP can be used */ if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, vaddr, vport, &param) < 0) { *ignored = -1; return NULL; } } /* Check if a template already exists */ ct = ip_vs_ct_in_get(&param); if (!ct || !ip_vs_check_template(ct, NULL)) { struct ip_vs_scheduler *sched; /* * No template found or the dest of the connection * template is not available. * return *ignored=0 i.e. ICMP and NF_DROP */ sched = rcu_dereference(svc->scheduler); if (sched) { /* read svc->sched_data after svc->scheduler */ smp_rmb(); dest = sched->schedule(svc, skb, iph); } else { dest = NULL; } if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); *ignored = 0; return NULL; } if (dst_port == svc->port && svc->port != FTPPORT) dport = dest->port; /* Create a template * This adds param.pe_data to the template, * and thus param.pe_data will be destroyed * when the template expires */ ct = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, IP_VS_CONN_F_TEMPLATE, dest, skb->mark); if (ct == NULL) { kfree(param.pe_data); *ignored = -1; return NULL; } ct->timeout = svc->timeout; } else { /* set destination with the found template */ dest = ct->dest; kfree(param.pe_data); } dport = dst_port; if (dport == svc->port && dest->port) dport = dest->port; flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? 
IP_VS_CONN_F_ONE_PACKET : 0; /* * Create a new connection according to the template */ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr, src_port, dst_addr, dst_port, &param); cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest, skb->mark); if (cp == NULL) { ip_vs_conn_put(ct); *ignored = -1; return NULL; } /* * Add its control */ ip_vs_control_add(cp, ct); ip_vs_conn_put(ct); ip_vs_conn_stats(cp, svc); return cp; } /* * IPVS main scheduling function * It selects a server according to the virtual service, and * creates a connection entry. * Protocols supported: TCP, UDP * * Usage of *ignored * * 1 : protocol tried to schedule (eg. on SYN), found svc but the * svc/scheduler decides that this packet should be accepted with * NF_ACCEPT because it must not be scheduled. * * 0 : scheduler can not find destination, so try bypass or * return ICMP and then NF_DROP (ip_vs_leave). * * -1 : scheduler tried to schedule but fatal error occurred, eg. * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param * failure such as missing Call-ID, ENOMEM on skb_linearize * or pe_data. In this case we should return NF_DROP without * any attempts to send ICMP with ip_vs_leave. */ struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *ignored, struct ip_vs_iphdr *iph) { struct ip_vs_protocol *pp = pd->pp; struct ip_vs_conn *cp = NULL; struct ip_vs_scheduler *sched; struct ip_vs_dest *dest; __be16 _ports[2], *pptr, cport, vport; const void *caddr, *vaddr; unsigned int flags; *ignored = 1; /* * IPv6 frags, only the first hit here. */ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); if (pptr == NULL) return NULL; if (likely(!ip_vs_iph_inverse(iph))) { cport = pptr[0]; caddr = &iph->saddr; vport = pptr[1]; vaddr = &iph->daddr; } else { cport = pptr[1]; caddr = &iph->daddr; vport = pptr[0]; vaddr = &iph->saddr; } /* * FTPDATA needs this check when using local real server. 
* Never schedule Active FTPDATA connections from real server. * For LVS-NAT they must be already created. For other methods * with persistence the connection is created on SYN+ACK. */ if (cport == FTPDATA) { IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, "Not scheduling FTPDATA"); return NULL; } /* * Do not schedule replies from local real server. */ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) { iph->hdr_flags ^= IP_VS_HDR_INVERSE; cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, svc->ipvs, svc->af, skb, iph); iph->hdr_flags ^= IP_VS_HDR_INVERSE; if (cp) { IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, "Not scheduling reply for existing" " connection"); __ip_vs_conn_put(cp); return NULL; } } /* * Persistent service */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) return ip_vs_sched_persist(svc, skb, cport, vport, ignored, iph); *ignored = 0; /* * Non-persistent service */ if (!svc->fwmark && vport != svc->port) { if (!svc->port) pr_err("Schedule: port zero only supported " "in persistent services, " "check your ipvs configuration\n"); return NULL; } sched = rcu_dereference(svc->scheduler); if (sched) { /* read svc->sched_data after svc->scheduler */ smp_rmb(); dest = sched->schedule(svc, skb, iph); } else { dest = NULL; } if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; } flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; /* * Create a connection entry. */ { struct ip_vs_conn_param p; ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, caddr, cport, vaddr, vport, &p); cp = ip_vs_conn_new(&p, dest->af, &dest->addr, dest->port ? 
dest->port : vport, flags, dest, skb->mark); if (!cp) { *ignored = -1; return NULL; } } IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " "d:%s:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), cp->flags, refcount_read(&cp->refcnt)); ip_vs_conn_stats(cp, svc); return cp; } static inline int ip_vs_addr_is_unicast(struct net *net, int af, union nf_inet_addr *addr) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST; #endif return (inet_addr_type(net, addr->ip) == RTN_UNICAST); } /* * Pass or drop the packet. * Called by ip_vs_in, when the virtual service is available but * no destination is available for a new connection. */ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) { __be16 _ports[2], *pptr, dport; struct netns_ipvs *ipvs = svc->ipvs; struct net *net = ipvs->net; pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); if (!pptr) return NF_DROP; dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0]; /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create a cache_bypass connection entry */ if (sysctl_cache_bypass(ipvs) && svc->fwmark && !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) && ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) { int ret; struct ip_vs_conn *cp; unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? 
IP_VS_CONN_F_ONE_PACKET : 0; union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; /* create a new connection entry */ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); { struct ip_vs_conn_param p; ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, NULL, skb->mark); if (!cp) return NF_DROP; } /* statistics */ ip_vs_in_stats(cp, skb); /* set state */ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); /* transmit the first SYN packet */ ret = cp->packet_xmit(skb, cp, pd->pp, iph); /* do not touch skb anymore */ if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) atomic_inc(&cp->control->in_pkts); else atomic_inc(&cp->in_pkts); ip_vs_conn_put(cp); return ret; } /* * When the virtual ftp service is presented, packets destined * for other services on the VIP may get here (except services * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. */ if (svc->port == FTPPORT && dport != FTPPORT) return NF_ACCEPT; if (unlikely(ip_vs_iph_icmp(iph))) return NF_DROP; /* * Notify the client that the destination is unreachable, and * release the socket buffer. * Since it is in IP layer, the TCP socket is not actually * created, the TCP RST packet cannot be sent, instead that * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. 
--WZ */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) { if (!skb->dev) skb->dev = net->loopback_dev; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); } else #endif icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); return NF_DROP; } #ifdef CONFIG_SYSCTL static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return ipvs->sysctl_snat_reroute; } static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return ipvs->sysctl_nat_icmp_send; } #else static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } #endif __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); } static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) { if (NF_INET_LOCAL_IN == hooknum) return IP_DEFRAG_VS_IN; if (NF_INET_FORWARD == hooknum) return IP_DEFRAG_VS_FWD; return IP_DEFRAG_VS_OUT; } static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs, struct sk_buff *skb, u_int32_t user) { int err; local_bh_disable(); err = ip_defrag(ipvs->net, skb, user); local_bh_enable(); if (!err) ip_send_check(ip_hdr(skb)); return err; } static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, unsigned int hooknum) { if (!sysctl_snat_reroute(ipvs)) return 0; /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */ if (NF_INET_LOCAL_IN == hooknum) return 0; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { struct dst_entry *dst = skb_dst(skb); if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0) return 1; } else #endif if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0) return 1; return 0; } /* * Packet has been made sufficiently writable in caller * - inout: 1=in->out, 0=out->in */ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct 
ip_vs_conn *cp, int inout) { struct iphdr *iph = ip_hdr(skb); unsigned int icmp_offset = iph->ihl*4; struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + icmp_offset); struct iphdr *ciph = (struct iphdr *)(icmph + 1); if (inout) { iph->saddr = cp->vaddr.ip; ip_send_check(iph); ciph->daddr = cp->vaddr.ip; ip_send_check(ciph); } else { iph->daddr = cp->daddr.ip; ip_send_check(iph); ciph->saddr = cp->daddr.ip; ip_send_check(ciph); } /* the TCP/UDP/SCTP port */ if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || IPPROTO_SCTP == ciph->protocol) { __be16 *ports = (void *)ciph + ciph->ihl*4; if (inout) ports[1] = cp->vport; else ports[0] = cp->dport; } /* And finally the ICMP checksum */ icmph->checksum = 0; icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); skb->ip_summed = CHECKSUM_UNNECESSARY; if (inout) IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered outgoing ICMP"); else IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered incoming ICMP"); } #ifdef CONFIG_IP_VS_IPV6 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int inout) { struct ipv6hdr *iph = ipv6_hdr(skb); unsigned int icmp_offset = 0; unsigned int offs = 0; /* header offset*/ int protocol; struct icmp6hdr *icmph; struct ipv6hdr *ciph; unsigned short fragoffs; ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); offs = icmp_offset + sizeof(struct icmp6hdr); ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); if (inout) { iph->saddr = cp->vaddr.in6; ciph->daddr = cp->vaddr.in6; } else { iph->daddr = cp->daddr.in6; ciph->saddr = cp->daddr.in6; } /* the TCP/UDP/SCTP port */ if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || IPPROTO_SCTP == protocol)) { __be16 *ports = (void 
*)(skb_network_header(skb) + offs); IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, ntohs(inout ? ports[1] : ports[0]), ntohs(inout ? cp->vport : cp->dport)); if (inout) ports[1] = cp->vport; else ports[0] = cp->dport; } /* And finally the ICMP checksum */ icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, skb->len - icmp_offset, IPPROTO_ICMPV6, 0); skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); skb->ip_summed = CHECKSUM_PARTIAL; if (inout) IP_VS_DBG_PKT(11, AF_INET6, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered outgoing ICMPv6"); else IP_VS_DBG_PKT(11, AF_INET6, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered incoming ICMPv6"); } #endif /* Handle relevant response ICMP messages - forward to the right * destination host. */ static int handle_response_icmp(int af, struct sk_buff *skb, union nf_inet_addr *snet, __u8 protocol, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, unsigned int offset, unsigned int ihl, unsigned int hooknum) { unsigned int verdict = NF_DROP; if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) goto after_nat; /* Ensure the checksum is correct */ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { /* Failed checksum! 
*/ IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", IP_VS_DBG_ADDR(af, snet)); goto out; } if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || IPPROTO_SCTP == protocol) offset += 2 * sizeof(__u16); if (skb_ensure_writable(skb, offset)) goto out; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) ip_vs_nat_icmp_v6(skb, pp, cp, 1); else #endif ip_vs_nat_icmp(skb, pp, cp, 1); if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) goto out; after_nat: /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); verdict = NF_ACCEPT; out: __ip_vs_conn_put(cp); return verdict; } /* * Handle ICMP messages in the inside-to-outside direction (outgoing). * Find any that might be relevant, check against existing connections. * Currently handles error types - unreachable, quench, ttl exceeded. */ static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, unsigned int hooknum) { struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl; union nf_inet_addr snet; *related = 1; /* reassemble IP fragments */ if (ip_is_fragment(ip_hdr(skb))) { if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } iph = ip_hdr(skb); offset = ihl = iph->ihl * 4; ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); if (ic == NULL) return NF_DROP; IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", ic->type, ntohs(icmp_id(ic)), &iph->saddr, &iph->daddr); /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy * things are checked first to speed up processing.... however * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. 
*/ if ((ic->type != ICMP_DEST_UNREACH) && (ic->type != ICMP_SOURCE_QUENCH) && (ic->type != ICMP_TIME_EXCEEDED)) { *related = 0; return NF_ACCEPT; } /* Now find the contained IP header */ offset += sizeof(_icmph); cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ pp = ip_vs_proto_get(cih->protocol); if (!pp) return NF_ACCEPT; /* Is the embedded protocol header present? */ if (unlikely(cih->frag_off & htons(IP_OFFSET) && pp->dont_defrag)) return NF_ACCEPT; IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, "Checking outgoing ICMP for"); ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph); /* The embedded headers contain source and dest in reverse order */ cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, ipvs, AF_INET, skb, &ciph); if (!cp) return NF_ACCEPT; snet.ip = iph->saddr; return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, pp, ciph.len, ihl, hooknum); } #ifdef CONFIG_IP_VS_IPV6 static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, unsigned int hooknum, struct ip_vs_iphdr *ipvsh) { struct icmp6hdr _icmph, *ic; struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; union nf_inet_addr snet; unsigned int offset; *related = 1; ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph); if (ic == NULL) return NF_DROP; /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy * things are checked first to speed up processing.... however * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { *related = 0; return NF_ACCEPT; } /* Fragment header that is before ICMP header tells us that: * it's not an error message since they can't be fragmented. 
*/ if (ipvsh->flags & IP6_FH_F_FRAG) return NF_DROP; IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", ic->icmp6_type, ntohs(icmpv6_id(ic)), &ipvsh->saddr, &ipvsh->daddr); if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph), true, &ciph)) return NF_ACCEPT; /* The packet looks wrong, ignore */ pp = ip_vs_proto_get(ciph.protocol); if (!pp) return NF_ACCEPT; /* The embedded headers contain source and dest in reverse order */ cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, ipvs, AF_INET6, skb, &ciph); if (!cp) return NF_ACCEPT; snet.in6 = ciph.saddr.in6; offset = ciph.len; return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, pp, offset, sizeof(struct ipv6hdr), hooknum); } #endif /* * Check if sctp chunc is ABORT chunk */ static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len) { struct sctp_chunkhdr *sch, schunk; sch = skb_header_pointer(skb, nh_len + sizeof(struct sctphdr), sizeof(schunk), &schunk); if (sch == NULL) return 0; if (sch->type == SCTP_CID_ABORT) return 1; return 0; } static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); if (th == NULL) return 0; return th->rst; } static inline bool is_new_conn(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { switch (iph->protocol) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); if (th == NULL) return false; return th->syn; } case IPPROTO_SCTP: { struct sctp_chunkhdr *sch, schunk; sch = skb_header_pointer(skb, iph->len + sizeof(struct sctphdr), sizeof(schunk), &schunk); if (sch == NULL) return false; return sch->type == SCTP_CID_INIT; } default: return false; } } static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, int conn_reuse_mode) { /* Controlled (FTP DATA or persistence)? 
*/ if (cp->control) return false; switch (cp->protocol) { case IPPROTO_TCP: return (cp->state == IP_VS_TCP_S_TIME_WAIT) || (cp->state == IP_VS_TCP_S_CLOSE) || ((conn_reuse_mode & 2) && (cp->state == IP_VS_TCP_S_FIN_WAIT) && (cp->flags & IP_VS_CONN_F_NOOUTPUT)); case IPPROTO_SCTP: return cp->state == IP_VS_SCTP_S_CLOSED; default: return false; } } /* Generic function to create new connections for outgoing RS packets * * Pre-requisites for successful connection creation: * 1) Virtual Service is NOT fwmark based: * In fwmark-VS actual vaddr and vport are unknown to IPVS * 2) Real Server and Virtual Service were NOT configured without port: * This is to allow match of different VS to the same RS ip-addr */ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct sk_buff *skb, const struct ip_vs_iphdr *iph, __be16 dport, __be16 cport) { struct ip_vs_conn_param param; struct ip_vs_conn *ct = NULL, *cp = NULL; const union nf_inet_addr *vaddr, *daddr, *caddr; union nf_inet_addr snet; __be16 vport; unsigned int flags; vaddr = &svc->addr; vport = svc->port; daddr = &iph->saddr; caddr = &iph->daddr; /* check pre-requisites are satisfied */ if (svc->fwmark) return NULL; if (!vport || !dport) return NULL; /* for persistent service first create connection template */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) { /* apply netmask the same way ingress-side does */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) ipv6_addr_prefix(&snet.in6, &caddr->in6, (__force __u32)svc->netmask); else #endif snet.ip = caddr->ip & svc->netmask; /* fill params and create template if not existent */ if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, &snet, 0, vaddr, vport, &param) < 0) return NULL; ct = ip_vs_ct_in_get(&param); /* check if template exists and points to the same dest */ if (!ct || !ip_vs_check_template(ct, dest)) { ct = ip_vs_conn_new(&param, dest->af, daddr, dport, IP_VS_CONN_F_TEMPLATE, dest, 0); if (!ct) { kfree(param.pe_data); return 
NULL; } ct->timeout = svc->timeout; } else { kfree(param.pe_data); } } /* connection flags */ flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; /* create connection */ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, caddr, cport, vaddr, vport, &param); cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, 0); if (!cp) { if (ct) ip_vs_conn_put(ct); return NULL; } if (ct) { ip_vs_control_add(cp, ct); ip_vs_conn_put(ct); } ip_vs_conn_stats(cp, svc); /* return connection (will be used to handle outgoing packet) */ IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " "d:%s:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), cp->flags, refcount_read(&cp->refcnt)); return cp; } /* Handle outgoing packets which are considered requests initiated by * real servers, so that subsequent responses from external client can be * routed to the right real server. * Used also for outgoing responses in OPS mode. * * Connection management is handled by persistent-engine specific callback. */ static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, struct netns_ipvs *ipvs, int af, struct sk_buff *skb, const struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; struct ip_vs_conn *cp = NULL; __be16 _ports[2], *pptr; if (hooknum == NF_INET_LOCAL_IN) return NULL; pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); if (!pptr) return NULL; dest = ip_vs_find_real_service(ipvs, af, iph->protocol, &iph->saddr, pptr[0]); if (dest) { struct ip_vs_service *svc; struct ip_vs_pe *pe; svc = rcu_dereference(dest->svc); if (svc) { pe = rcu_dereference(svc->pe); if (pe && pe->conn_out) cp = pe->conn_out(svc, dest, skb, iph, pptr[0], pptr[1]); } } return cp; } /* Handle response packets: rewrite addresses and send away... 
*/
/* Consumes the caller's reference on @cp: ip_vs_conn_put() is called on
 * every exit path. Returns NF_ACCEPT when the packet may continue, or
 * NF_STOLEN after the skb has been freed because mangling/routing failed.
 */
static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
		struct ip_vs_conn *cp, struct ip_vs_iphdr *iph,
		unsigned int hooknum)
{
	struct ip_vs_protocol *pp = pd->pp;

	/* Only VS/NAT connections get their source rewritten */
	if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) {
		IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");

		if (skb_ensure_writable(skb, iph->len))
			goto drop;

		/* let the protocol mangle its own header first */
		if (pp->snat_handler &&
		    !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph))
			goto drop;

		/* then replace the source address with the virtual one */
#ifdef CONFIG_IP_VS_IPV6
		if (af == AF_INET6)
			ipv6_hdr(skb)->saddr = cp->vaddr.in6;
		else
#endif
		{
			ip_hdr(skb)->saddr = cp->vaddr.ip;
			ip_send_check(ip_hdr(skb));
		}

		/*
		 * nf_iterate does not expect change in the skb->dst->dev.
		 * It looks like it is not fatal to enable this code for hooks
		 * where our handlers are at the end of the chain list and
		 * when all next handlers use skb->dst->dev and not outdev.
		 * It will definitely route properly the inout NAT traffic
		 * when multiple paths are used.
		 */

		/* For policy routing, packets originating from this
		 * machine itself may be routed differently to packets
		 * passing through.  We want this packet to be routed as
		 * if it came from this machine itself.  So re-compute
		 * the routing information.
		 */
		if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
			goto drop;

		IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");
	}

	/* Common tail for all forwarding methods */
	ip_vs_out_stats(cp, skb);
	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
	skb->ipvs_property = 1;
	if (cp->flags & IP_VS_CONN_F_NFCT)
		ip_vs_update_conntrack(skb, cp, 0);
	else
		ip_vs_notrack(skb);
	ip_vs_conn_put(cp);
	return NF_ACCEPT;

drop:
	ip_vs_conn_put(cp);
	kfree_skb(skb);
	return NF_STOLEN;
}

/*
 *	Check if outgoing packet belongs to the established ip_vs_conn.
*/
/* Netfilter hook for the outgoing direction.
 *
 * Returns NF_ACCEPT for packets IPVS does not handle, the verdict of
 * handle_response() when a connection entry is found or created for the
 * packet, the verdict of the ICMP helpers for related ICMP errors,
 * NF_STOLEN when the skb was taken for defragmentation, and NF_DROP after
 * a port-unreachable error has been sent for a packet coming from a known
 * real service without a connection entry.
 */
static unsigned int
ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
	struct netns_ipvs *ipvs = net_ipvs(state->net);
	unsigned int hooknum = state->hook;
	struct ip_vs_iphdr iph;
	struct ip_vs_protocol *pp;
	struct ip_vs_proto_data *pd;
	struct ip_vs_conn *cp;
	int af = state->pf;
	struct sock *sk;

	/* Already marked as IPVS request or reply? */
	if (skb->ipvs_property)
		return NF_ACCEPT;

	sk = skb_to_full_sk(skb);
	/* Bad... Do not break raw sockets */
	if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
		     af == AF_INET)) {

		if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk))
			return NF_ACCEPT;
	}

	/* Packets without a route are left alone */
	if (unlikely(!skb_dst(skb)))
		return NF_ACCEPT;

	ip_vs_fill_iph_skb(af, skb, false, &iph);

	/* ICMP/ICMPv6 goes to the dedicated helper; its verdict is final
	 * only when the helper reports the message as related.
	 */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
			int related;
			int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related,
							hooknum, &iph);

			if (related)
				return verdict;
		}
	} else
#endif
		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
			int related;
			int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum);

			if (related)
				return verdict;
		}

	/* Protocols without IPVS protocol data are not handled */
	pd = ip_vs_proto_data_get(ipvs, iph.protocol);
	if (unlikely(!pd))
		return NF_ACCEPT;
	pp = pd->pp;

	/* reassemble IP fragments */
	/* (IPv4 only: with CONFIG_IP_VS_IPV6 the af test guards the block) */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET)
#endif
		if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
			if (ip_vs_gather_frags(ipvs, skb,
					       ip_vs_defrag_user(hooknum)))
				return NF_STOLEN;

			/* header info must be re-read after reassembly */
			ip_vs_fill_iph_skb(AF_INET, skb, false, &iph);
		}

	/*
	 * Check if the packet belongs to an existing entry
	 */
	cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
			     ipvs, af, skb, &iph);

	if (likely(cp))
		return handle_response(af, skb, pd, cp, &iph, hooknum);

	/* Check for real-server-started requests */
	if (atomic_read(&ipvs->conn_out_counter[ip_vs_af_index(af)])) {
		/* Currently only for UDP:
		 * connection oriented protocols typically use
		 * ephemeral ports for outgoing connections, so
		 * related incoming responses would not match any VS
		 */
		if (pp->protocol == IPPROTO_UDP) {
			cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
			if (likely(cp))
				return handle_response(af, skb, pd, cp, &iph,
						       hooknum);
		}
	}

	/* No entry at all: optionally answer packets that come from a
	 * known real service with a port-unreachable error.
	 */
	if (sysctl_nat_icmp_send(ipvs) &&
	    (pp->protocol == IPPROTO_TCP ||
	     pp->protocol == IPPROTO_UDP ||
	     pp->protocol == IPPROTO_SCTP)) {
		__be16 _ports[2], *pptr;

		pptr = frag_safe_skb_hp(skb, iph.len,
					 sizeof(_ports), _ports);
		if (pptr == NULL)
			return NF_ACCEPT;	/* Not for me */
		if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr,
					   pptr[0])) {
			/*
			 * Notify the real server: there is no
			 * existing entry if it is not RST
			 * packet or not TCP packet.
			 */
			/* (SCTP ABORT is exempted the same way as TCP RST) */
			if ((iph.protocol != IPPROTO_TCP &&
			     iph.protocol != IPPROTO_SCTP)
			     || ((iph.protocol == IPPROTO_TCP
				  && !is_tcp_reset(skb, iph.len))
				 || (iph.protocol == IPPROTO_SCTP
					&& !is_sctp_abort(skb,
						iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
				if (af == AF_INET6) {
					/* fall back to loopback when the
					 * skb carries no device
					 */
					if (!skb->dev)
						skb->dev = ipvs->net->loopback_dev;
					icmpv6_send(skb,
						    ICMPV6_DEST_UNREACH,
						    ICMPV6_PORT_UNREACH,
						    0);
				} else
#endif
					icmp_send(skb,
						  ICMP_DEST_UNREACH,
						  ICMP_PORT_UNREACH, 0);
				return NF_DROP;
			}
		}
	}

	IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
		      "ip_vs_out: packet continues traversal as normal");
	return NF_ACCEPT;
}

/* Try to get a connection entry for a packet that matched none.
 *
 * Unless the packet is a non-first fragment (iph->fragoffs set), the
 * protocol's conn_schedule() callback is invoked; it may fill *cpp and
 * *verdict.
 *
 * Returns 1 when *cpp holds a connection the caller should continue with.
 * Returns 0 when the caller must return *verdict instead: either because
 * conn_schedule() returned 0, or because no connection was produced, in
 * which case *verdict is set to NF_ACCEPT here.
 */
static unsigned int
ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
		      struct ip_vs_proto_data *pd,
		      int *verdict, struct ip_vs_conn **cpp,
		      struct ip_vs_iphdr *iph)
{
	struct ip_vs_protocol *pp = pd->pp;

	if (!iph->fragoffs) {
		/* No (second) fragments need to enter here, as nf_defrag_ipv6
		 * replayed fragment zero will already have created the cp
		 */

		/* Schedule and create new connection entry into cpp */
		if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph))
			return 0;
	}

	if (unlikely(!*cpp)) {
		/* sorry, all this trouble for a no-hit :) */
		IP_VS_DBG_PKT(12, af, pp, skb, iph->off,
			      "ip_vs_in: packet continues traversal as normal");

		/* Fragment couldn't be mapped to a conn entry */
		if (iph->fragoffs)
			IP_VS_DBG_PKT(7, af, pp, skb, iph->off,
				      "unhandled fragment");

		*verdict = NF_ACCEPT;
		return 0;
	}

	return 1;
}

/* Check the UDP tunnel and return its header length */
/* @offset is the position of the UDP header in @skb. The tunnel is looked
 * up by @daddr and the UDP destination port. Only GUE with control == 0,
 * version == 0 and an IPIP payload is accepted; for it *proto is set to the
 * payload protocol and the returned length covers the UDP header, the GUE
 * header and (gueh->hlen << 2) further bytes. Returns 0 otherwise, in which
 * case *proto is not written.
 */
static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
			  unsigned int offset, __u16 af,
			  const union nf_inet_addr *daddr, __u8 *proto)
{
	struct udphdr _udph, *udph;
	struct ip_vs_dest *dest;

	udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
	if (!udph)
		goto unk;
	offset += sizeof(struct udphdr);
	dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest);
	if (!dest)
		goto unk;
	if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		struct guehdr _gueh, *gueh;

		gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh);
		if (!gueh)
			goto unk;
		if (gueh->control != 0 || gueh->version != 0)
			goto unk;
		/* Later we can support also IPPROTO_IPV6 */
		if (gueh->proto_ctype != IPPROTO_IPIP)
			goto unk;
		*proto = gueh->proto_ctype;
		return sizeof(struct udphdr) + sizeof(struct guehdr) +
		       (gueh->hlen << 2);
	}

unk:
	return 0;
}

/* Check the GRE tunnel and return its header length */
/* @offset is the position of the GRE header in @skb. The tunnel is looked
 * up by @daddr with port 0. Only GRE carrying ETH_P_IP with no flag other
 * than GRE_CSUM is accepted; for it *proto is set to IPPROTO_IPIP and the
 * length from gre_calc_hlen() is returned. Returns 0 otherwise, in which
 * case *proto is not written.
 */
static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
			  unsigned int offset, __u16 af,
			  const union nf_inet_addr *daddr, __u8 *proto)
{
	struct gre_base_hdr _greh, *greh;
	struct ip_vs_dest *dest;

	greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh);
	if (!greh)
		goto unk;
	dest = ip_vs_find_tunnel(ipvs, af, daddr, 0);
	if (!dest)
		goto unk;
	if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		IP_TUNNEL_DECLARE_FLAGS(flags);
		__be16 type;

		/* Only support version 0 and C (csum) */
		if ((greh->flags & ~GRE_CSUM) != 0)
			goto unk;
		type = greh->protocol;
		/* Later we can support also IPPROTO_IPV6 */
		if (type != htons(ETH_P_IP))
			goto unk;
		*proto = IPPROTO_IPIP;
		gre_flags_to_tnl_flags(flags, greh->flags);
		return gre_calc_hlen(flags);
	}

unk:
	return 0;
}

/*
 *	Handle ICMP messages in the outside-to-inside direction (incoming).
 *	Find any that might be relevant, check against existing connections,
 *	forward to the right destination host if relevant.
 *	Currently handles error types - unreachable, quench, ttl exceeded.
*/
/* *related is set to 1 on entry and cleared only when the ICMP type is not
 * one of the handled error types; callers in this file use the returned
 * verdict only when *related is non-zero.
 */
static int
ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
	      unsigned int hooknum)
{
	struct iphdr *iph;
	struct icmphdr	_icmph, *ic;
	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
	struct ip_vs_iphdr ciph;
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp;
	struct ip_vs_proto_data *pd;
	unsigned int offset, offset2, ihl, verdict;
	bool tunnel, new_cp = false;	/* new_cp: cp was scheduled here */
	union nf_inet_addr *raddr;
	char *outer_proto = "IPIP";	/* only used in debug output */

	*related = 1;

	/* reassemble IP fragments */
	if (ip_is_fragment(ip_hdr(skb))) {
		if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum)))
			return NF_STOLEN;
	}

	iph = ip_hdr(skb);
	offset = ihl = iph->ihl * 4;	/* offset of the ICMP header */
	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
	if (ic == NULL)
		return NF_DROP;

	IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
		  ic->type, ntohs(icmp_id(ic)),
		  &iph->saddr, &iph->daddr);

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	if ((ic->type != ICMP_DEST_UNREACH) &&
	    (ic->type != ICMP_SOURCE_QUENCH) &&
	    (ic->type != ICMP_TIME_EXCEEDED)) {
		*related = 0;
		return NF_ACCEPT;
	}

	/* Now find the contained IP header */
	offset += sizeof(_icmph);
	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
	if (cih == NULL)
		return NF_ACCEPT; /* The packet looks wrong, ignore */
	raddr = (union nf_inet_addr *)&cih->daddr;

	/* Special case for errors for IPIP/UDP/GRE tunnel packets */
	tunnel = false;
	if (cih->protocol == IPPROTO_IPIP) {
		struct ip_vs_dest *dest;

		/* Non-first fragment of the embedded packet */
		if (unlikely(cih->frag_off & htons(IP_OFFSET)))
			return NF_ACCEPT;
		/* Error for our IPIP must arrive at LOCAL_IN */
		if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
			return NF_ACCEPT;
		dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0);
		/* Only for known tunnel */
		if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP)
			return NF_ACCEPT;
		/* Skip the outer embedded header, cih becomes the inner one */
		offset += cih->ihl * 4;
		cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
		if (cih == NULL)
			return NF_ACCEPT; /* The packet looks wrong, ignore */
		tunnel = true;
	} else if ((cih->protocol == IPPROTO_UDP ||	/* Can be UDP encap */
		    cih->protocol == IPPROTO_GRE) &&	/* Can be GRE encap */
		   /* Error for our tunnel must arrive at LOCAL_IN */
		   (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) {
		__u8 iproto;
		int ulen;

		/* Non-first fragment has no UDP/GRE header */
		if (unlikely(cih->frag_off & htons(IP_OFFSET)))
			return NF_ACCEPT;
		offset2 = offset + cih->ihl * 4;
		if (cih->protocol == IPPROTO_UDP) {
			ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET,
					      raddr, &iproto);
			outer_proto = "UDP";
		} else {
			ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET,
					      raddr, &iproto);
			outer_proto = "GRE";
		}
		/* ulen == 0: not one of our tunnels, handle as plain UDP/GRE */
		if (ulen > 0) {
			/* Skip IP and UDP/GRE tunnel headers */
			offset = offset2 + ulen;
			/* Now we should be at the original IP header */
			cih = skb_header_pointer(skb, offset, sizeof(_ciph),
						 &_ciph);
			if (cih && cih->version == 4 && cih->ihl >= 5 &&
			    iproto == IPPROTO_IPIP)
				tunnel = true;
			else
				return NF_ACCEPT;
		}
	}

	pd = ip_vs_proto_data_get(ipvs, cih->protocol);
	if (!pd)
		return NF_ACCEPT;
	pp = pd->pp;

	/* Is the embedded protocol header present? */
	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
		     pp->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
		      "Checking incoming ICMP for");

	/* offset2 keeps the start of the embedded IP header, offset moves
	 * on to the value ip_vs_fill_iph_skb_icmp() stores in ciph.len
	 */
	offset2 = offset;
	ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph);
	offset = ciph.len;

	/* The embedded headers contain source and dest in reverse order.
	 * For IPIP/UDP/GRE tunnel this is error for request, not for reply.
	 */
	cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
			     ipvs, AF_INET, skb, &ciph);

	if (!cp) {
		int v;

		/* Scheduling on ICMP is optional and never done for tunnels */
		if (tunnel || !sysctl_schedule_icmp(ipvs))
			return NF_ACCEPT;

		if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph))
			return v;
		new_cp = true;
	}

	verdict = NF_DROP;

	/* Ensure the checksum is correct */
	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
		/* Failed checksum! */
		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
			  &iph->saddr);
		goto out;
	}

	if (tunnel) {
		/* Values copied out of the ICMP header before the skb is
		 * pulled below
		 */
		__be32 info = ic->un.gateway;
		__u8 type = ic->type;
		__u8 code = ic->code;

		/* Update the MTU */
		if (ic->type == ICMP_DEST_UNREACH &&
		    ic->code == ICMP_FRAG_NEEDED) {
			struct ip_vs_dest *dest = cp->dest;
			u32 mtu = ntohs(ic->un.frag.mtu);
			__be16 frag_off = cih->frag_off;

			/* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */
			if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
				goto ignore_tunnel;
			/* keep offset2 relative to the new skb data start */
			offset2 -= ihl + sizeof(_icmph);
			skb_reset_network_header(skb);
			IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n",
				  outer_proto, &ip_hdr(skb)->saddr,
				  &ip_hdr(skb)->daddr, mtu);
			ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0);
			/* Client uses PMTUD? */
			if (!(frag_off & htons(IP_DF)))
				goto ignore_tunnel;
			/* Prefer the resulting PMTU */
			if (dest) {
				struct ip_vs_dest_dst *dest_dst;

				dest_dst = rcu_dereference(dest->dest_dst);
				if (dest_dst)
					mtu = dst_mtu(dest_dst->dst_cache);
			}
			/* Leave room for one IP header, but not below 68 */
			if (mtu > 68 + sizeof(struct iphdr))
				mtu -= sizeof(struct iphdr);
			info = htonl(mtu);
		}
		/* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of
		 * original request.
		 */
		if (pskb_pull(skb, offset2) == NULL)
			goto ignore_tunnel;
		skb_reset_network_header(skb);
		IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
			&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
			type, code, ntohl(info));
		icmp_send(skb, type, code, info);
		/* ICMP can be shorter but anyways, account it */
		ip_vs_out_stats(cp, skb);

ignore_tunnel:
		/* The tunnel path always takes the skb itself */
		consume_skb(skb);
		verdict = NF_STOLEN;
		goto out;
	}

	/* do the statistics and put it back */
	ip_vs_in_stats(cp, skb);
	/* For these protocols the two 16-bit port fields are included too */
	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol ||
	    IPPROTO_SCTP == cih->protocol)
		offset += 2 * sizeof(__u16);
	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);

out:
	/* A connection scheduled here is released with ip_vs_conn_put(),
	 * a looked-up one with __ip_vs_conn_put()
	 */
	if (likely(!new_cp))
		__ip_vs_conn_put(cp);
	else
		ip_vs_conn_put(cp);

	return verdict;
}

#ifdef CONFIG_IP_VS_IPV6
/* ICMPv6 counterpart of ip_vs_in_icmp() for error messages.
 *
 * @iph describes the outer IPv6 header of @skb. *related is set to 1 on
 * entry and cleared only for ICMPv6 informational messages. Returns a
 * netfilter verdict.
 */
static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
			    int *related, unsigned int hooknum,
			    struct ip_vs_iphdr *iph)
{
	struct icmp6hdr	_icmph, *ic;
	struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp;
	struct ip_vs_proto_data *pd;
	unsigned int offset, verdict;
	bool new_cp = false;	/* set when cp was scheduled here */

	*related = 1;

	ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph);
	if (ic == NULL)
		return NF_DROP;

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
		*related = 0;
		return NF_ACCEPT;
	}
	/* Fragment header that is before ICMP header tells us that:
	 * it's not an error message since they can't be fragmented.
	 */
	if (iph->flags & IP6_FH_F_FRAG)
		return NF_DROP;

	IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n",
		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
		  &iph->saddr, &iph->daddr);

	/* The embedded IPv6 header follows the ICMPv6 header */
	offset = iph->len + sizeof(_icmph);
	if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph))
		return NF_ACCEPT;

	pd = ip_vs_proto_data_get(ipvs, ciph.protocol);
	if (!pd)
		return NF_ACCEPT;
	pp = pd->pp;

	/* Cannot handle fragmented embedded protocol */
	if (ciph.fragoffs)
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
		      "Checking incoming ICMPv6 for");

	/* The embedded headers contain source and dest in reverse order
	 * if not from localhost
	 */
	cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
			     ipvs, AF_INET6, skb, &ciph);

	if (!cp) {
		int v;

		/* Scheduling on ICMP is optional */
		if (!sysctl_schedule_icmp(ipvs))
			return NF_ACCEPT;

		if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph))
			return v;

		new_cp = true;
	}

	/* VS/TUN, VS/DR and LOCALNODE just let it go */
	if ((hooknum == NF_INET_LOCAL_OUT) &&
	    (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
		verdict = NF_ACCEPT;
		goto out;
	}

	/* do the statistics and put it back */
	ip_vs_in_stats(cp, skb);

	/* Need to mangle contained IPv6 header in ICMPv6 packet */
	offset = ciph.len;
	if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol ||
	    IPPROTO_SCTP == ciph.protocol)
		offset += 2 * sizeof(__u16); /* Also mangle ports */

	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph);

out:
	/* A connection scheduled here is released with ip_vs_conn_put(),
	 * a looked-up one with __ip_vs_conn_put()
	 */
	if (likely(!new_cp))
		__ip_vs_conn_put(cp);
	else
		ip_vs_conn_put(cp);

	return verdict;
}
#endif


/*
 *	Check if it's for virtual services, look it up,
 *	and send it on its way...
*/
/* Netfilter hook for the incoming (client -> virtual service) direction.
 *
 * Returns NF_ACCEPT for packets IPVS does not handle, the verdict of the
 * ICMP helpers for related ICMP errors, the verdict produced by
 * ip_vs_try_to_schedule() when no connection could be obtained, NF_DROP
 * when the packet hits a connection that must not be used, and otherwise
 * the result of the connection's packet_xmit() handler.
 */
static unsigned int
ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
	struct netns_ipvs *ipvs = net_ipvs(state->net);
	unsigned int hooknum = state->hook;
	struct ip_vs_iphdr iph;
	struct ip_vs_protocol *pp;
	struct ip_vs_proto_data *pd;
	struct ip_vs_conn *cp;
	int ret, pkts;
	struct sock *sk;
	int af = state->pf;

	/* Already marked as IPVS request or reply? */
	if (skb->ipvs_property)
		return NF_ACCEPT;

	/*
	 *	Big tappo:
	 *	- remote client: only PACKET_HOST
	 *	- route: used for struct net when skb->dev is unset
	 */
	if (unlikely((skb->pkt_type != PACKET_HOST &&
		      hooknum != NF_INET_LOCAL_OUT) ||
		     !skb_dst(skb))) {
		/* header info is filled only for the debug message */
		ip_vs_fill_iph_skb(af, skb, false, &iph);
		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
			      " ignored in hook %u\n",
			      skb->pkt_type, iph.protocol,
			      IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
		return NF_ACCEPT;
	}
	/* ipvs enabled in this netns ? */
	if (unlikely(sysctl_backup_only(ipvs)))
		return NF_ACCEPT;

	ip_vs_fill_iph_skb(af, skb, false, &iph);

	/* Bad... Do not break raw sockets */
	sk = skb_to_full_sk(skb);
	if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
		     af == AF_INET)) {

		if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk))
			return NF_ACCEPT;
	}

	/* ICMP/ICMPv6 goes to the dedicated helper; its verdict is final
	 * only when the helper reports the message as related.
	 */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
			int related;
			int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related,
						       hooknum, &iph);

			if (related)
				return verdict;
		}
	} else
#endif
		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
			int related;
			int verdict = ip_vs_in_icmp(ipvs, skb, &related,
						    hooknum);

			if (related)
				return verdict;
		}

	/* Protocol supported? */
	pd = ip_vs_proto_data_get(ipvs, iph.protocol);
	if (unlikely(!pd)) {
		/* The only way we'll see this packet again is if it's
		 * encapsulated, so mark it with ipvs_property=1 so we
		 * skip it if we're ignoring tunneled packets
		 */
		if (sysctl_ignore_tunneled(ipvs))
			skb->ipvs_property = 1;

		return NF_ACCEPT;
	}
	pp = pd->pp;
	/*
	 * Check if the packet belongs to an existing connection entry
	 */
	cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
			     ipvs, af, skb, &iph);

	/* A packet that starts a new connection matched an existing entry:
	 * decide whether that entry is dropped and scheduling is redone.
	 */
	if (!iph.fragoffs && is_new_conn(skb, &iph) && cp) {
		int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
		bool old_ct = false, resched = false;

		if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
		    unlikely(!atomic_read(&cp->dest->weight))) {
			/* destination weight is zero: always reschedule */
			resched = true;
			old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
		} else if (conn_reuse_mode &&
			   is_new_conn_expected(cp, conn_reuse_mode)) {
			old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
			if (!atomic_read(&cp->n_control)) {
				resched = true;
			} else {
				/* Do not reschedule controlling connection
				 * that uses conntrack while it is still
				 * referenced by controlled connection(s).
				 */
				resched = !old_ct;
			}
		}

		if (resched) {
			if (!old_ct)
				cp->flags &= ~IP_VS_CONN_F_NFCT;
			/* expire only when no controlled connection is left */
			if (!atomic_read(&cp->n_control))
				ip_vs_conn_expire_now(cp);
			__ip_vs_conn_put(cp);
			/* an old conntrack is still in use: drop this packet */
			if (old_ct)
				return NF_DROP;
			/* cp == NULL leads to ip_vs_try_to_schedule() below */
			cp = NULL;
		}
	}

	/* Check the server status */
	if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
		/* the destination server is not available */
		if (sysctl_expire_nodest_conn(ipvs)) {
			bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);

			if (!old_ct)
				cp->flags &= ~IP_VS_CONN_F_NFCT;

			ip_vs_conn_expire_now(cp);
			__ip_vs_conn_put(cp);
			if (old_ct)
				return NF_DROP;
			cp = NULL;
		} else {
			__ip_vs_conn_put(cp);
			return NF_DROP;
		}
	}

	/* No usable entry: try to schedule a new connection */
	if (unlikely(!cp)) {
		int v;

		if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
			return v;
	}

	IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");

	ip_vs_in_stats(cp, skb);
	ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
	if (cp->packet_xmit)
		ret = cp->packet_xmit(skb, cp, pp, &iph);
		/* do not touch skb anymore */
	else {
		IP_VS_DBG_RL("warning: packet_xmit is null");
		ret = NF_ACCEPT;
	}

	/* Increase its packet counter and check if it is needed
	 * to be synchronized
	 *
	 * Sync connection if it is about to close to
	 * encourage the standby servers to update the connections timeout
	 *
	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
	 */

	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		pkts = sysctl_sync_threshold(ipvs);
	else
		pkts = atomic_inc_return(&cp->in_pkts);

	if (ipvs->sync_state & IP_VS_STATE_MASTER)
		ip_vs_sync_conn(ipvs, cp, pkts);
	else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
		/* increment is done inside ip_vs_sync_conn too */
		atomic_inc(&cp->control->in_pkts);

	ip_vs_conn_put(cp);
	return ret;
}

/*
 *	It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
 *      related packets destined for 0.0.0.0/0.
* When fwmark-based virtual service is used, such as transparent * cache cluster, TCP packets can be marked and routed to ip_vs_in, * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain * and send them to ip_vs_in_icmp. */ static unsigned int ip_vs_forward_icmp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct netns_ipvs *ipvs = net_ipvs(state->net); int r; /* ipvs enabled in this netns ? */ if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; if (state->pf == NFPROTO_IPV4) { if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; #ifdef CONFIG_IP_VS_IPV6 } else { struct ip_vs_iphdr iphdr; ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); if (iphdr.protocol != IPPROTO_ICMPV6) return NF_ACCEPT; return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); #endif } return ip_vs_in_icmp(ipvs, skb, &r, state->hook); } static const struct nf_hook_ops ip_vs_ops4[] = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 2, }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. 
*/ { .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 100, }, }; #ifdef CONFIG_IP_VS_IPV6 static const struct nf_hook_ops ip_vs_ops6[] = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 2, }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. 
*/ { .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 100, }, }; #endif int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af) { const struct nf_hook_ops *ops; unsigned int count; unsigned int afmask; int ret = 0; if (af == AF_INET6) { #ifdef CONFIG_IP_VS_IPV6 ops = ip_vs_ops6; count = ARRAY_SIZE(ip_vs_ops6); afmask = 2; #else return -EINVAL; #endif } else { ops = ip_vs_ops4; count = ARRAY_SIZE(ip_vs_ops4); afmask = 1; } if (!(ipvs->hooks_afmask & afmask)) { ret = nf_register_net_hooks(ipvs->net, ops, count); if (ret >= 0) ipvs->hooks_afmask |= afmask; } return ret; } void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af) { const struct nf_hook_ops *ops; unsigned int count; unsigned int afmask; if (af == AF_INET6) { #ifdef CONFIG_IP_VS_IPV6 ops = ip_vs_ops6; count = ARRAY_SIZE(ip_vs_ops6); afmask = 2; #else return; #endif } else { ops = ip_vs_ops4; count = ARRAY_SIZE(ip_vs_ops4); afmask = 1; } if (ipvs->hooks_afmask & afmask) { nf_unregister_net_hooks(ipvs->net, ops, count); ipvs->hooks_afmask &= ~afmask; } } /* * Initialize IP Virtual Server netns mem. 
*/ static int __net_init __ip_vs_init(struct net *net) { struct netns_ipvs *ipvs; ipvs = net_generic(net, ip_vs_net_id); if (ipvs == NULL) return -ENOMEM; /* Hold the beast until a service is registered */ WRITE_ONCE(ipvs->enable, 0); ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); atomic_inc(&ipvs_netns_cnt); net->ipvs = ipvs; if (ip_vs_estimator_net_init(ipvs) < 0) goto estimator_fail; if (ip_vs_control_net_init(ipvs) < 0) goto control_fail; if (ip_vs_protocol_net_init(ipvs) < 0) goto protocol_fail; if (ip_vs_app_net_init(ipvs) < 0) goto app_fail; if (ip_vs_conn_net_init(ipvs) < 0) goto conn_fail; if (ip_vs_sync_net_init(ipvs) < 0) goto sync_fail; return 0; /* * Error handling */ sync_fail: ip_vs_conn_net_cleanup(ipvs); conn_fail: ip_vs_app_net_cleanup(ipvs); app_fail: ip_vs_protocol_net_cleanup(ipvs); protocol_fail: ip_vs_control_net_cleanup(ipvs); control_fail: ip_vs_estimator_net_cleanup(ipvs); estimator_fail: net->ipvs = NULL; return -ENOMEM; } static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) { struct netns_ipvs *ipvs; struct net *net; ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */ list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); ip_vs_conn_net_cleanup(ipvs); ip_vs_app_net_cleanup(ipvs); ip_vs_protocol_net_cleanup(ipvs); ip_vs_control_net_cleanup(ipvs); ip_vs_estimator_net_cleanup(ipvs); IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); net->ipvs = NULL; } } static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) { struct netns_ipvs *ipvs; struct net *net; list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); ip_vs_unregister_hooks(ipvs, AF_INET); ip_vs_unregister_hooks(ipvs, AF_INET6); WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); } } static struct pernet_operations ipvs_core_ops = { .init = __ip_vs_init, .exit_batch = __ip_vs_cleanup_batch, 
.id = &ip_vs_net_id, .size = sizeof(struct netns_ipvs), }; static struct pernet_operations ipvs_core_dev_ops = { .exit_batch = __ip_vs_dev_cleanup_batch, }; /* * Initialize IP Virtual Server */ static int __init ip_vs_init(void) { int ret; ret = ip_vs_control_init(); if (ret < 0) { pr_err("can't setup control.\n"); goto exit; } ip_vs_protocol_init(); ret = ip_vs_conn_init(); if (ret < 0) { pr_err("can't setup connection table.\n"); goto cleanup_protocol; } ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ if (ret < 0) goto cleanup_conn; ret = register_pernet_device(&ipvs_core_dev_ops); if (ret < 0) goto cleanup_sub; ret = ip_vs_register_nl_ioctl(); if (ret < 0) { pr_err("can't register netlink/ioctl.\n"); goto cleanup_dev; } pr_info("ipvs loaded.\n"); return ret; cleanup_dev: unregister_pernet_device(&ipvs_core_dev_ops); cleanup_sub: unregister_pernet_subsys(&ipvs_core_ops); cleanup_conn: ip_vs_conn_cleanup(); cleanup_protocol: ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); exit: return ret; } static void __exit ip_vs_cleanup(void) { ip_vs_unregister_nl_ioctl(); unregister_pernet_device(&ipvs_core_dev_ops); unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ ip_vs_conn_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); /* common rcu_barrier() used by: * - ip_vs_control_cleanup() */ rcu_barrier(); pr_info("ipvs unloaded.\n"); } module_init(ip_vs_init); module_exit(ip_vs_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IP Virtual Server");
21 11 24 31 7 5 4 4 3 106 8 7 17 78 106 55 14 78 90 13 30 41 62 31 41 2 3 18 18 25 11 27 9 17 19 16 12 4 10 14 8 16 20 8 6 5 18 24 11 18 8 16 16 8 24 24 38 1 37 31 31 25 3 22 25 3 22 25 3 10 10 15 17 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 
477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_tbf.c Token Bucket Filter queue. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs - * original idea by Martin Devera */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/gso.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> #include <net/pkt_sched.h> /* Simple Token Bucket Filter. ======================================= SOURCE. ------- None. Description. ------------ A data flow obeys TBF with rate R and depth B, if for any time interval t_i...t_f the number of transmitted bits does not exceed B + R*(t_f-t_i). Packetized version of this definition: The sequence of packets of sizes s_i served at moments t_i obeys TBF, if for any i<=k: s_i+....+s_k <= B + R*(t_k - t_i) Algorithm. ---------- Let N(t_i) be B/R initially and N(t) grow continuously with time as: N(t+delta) = min{B/R, N(t) + delta} If the first packet in queue has length S, it may be transmitted only at the time t_* when S/R <= N(t_*), and in this case N(t) jumps: N(t_* + 0) = N(t_* - 0) - S/R. Actually, QoS requires two TBF to be applied to a data stream. 
One of them controls steady state burst size, another one with rate P (peak rate) and depth M (equal to link MTU) limits bursts at a smaller time scale. It is easy to see that P>R, and B>M. If P is infinity, this double TBF is equivalent to a single one. When TBF works in reshaping mode, latency is estimated as: lat = max ((L-B)/R, (L-M)/P) NOTES. ------ If TBF throttles, it starts a watchdog timer, which will wake it up when it is ready to transmit. Note that the minimal timer resolution is 1/HZ. If no new packets arrive during this period, or if the device is not awaken by EOI for some previous packet, TBF can stop its activity for 1/HZ. This means, that with depth B, the maximal rate is R_crit = B*HZ F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes. Note that the peak rate TBF is much more tough: with MTU 1500 P_crit = 150Kbytes/sec. So, if you need greater peak rates, use alpha with HZ=1000 :-) With classful TBF, limit is just kept for backwards compatibility. It is passed to the default bfifo qdisc - if the inner qdisc is changed the limit is not effective anymore. */ struct tbf_sched_data { /* Parameters */ u32 limit; /* Maximal length of backlog: bytes */ u32 max_size; s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ s64 mtu; struct psched_ratecfg rate; struct psched_ratecfg peak; /* Variables */ s64 tokens; /* Current number of B tokens */ s64 ptokens; /* Current number of P tokens */ s64 t_c; /* Time check-point */ struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ struct qdisc_watchdog watchdog; /* Watchdog timer */ }; /* Time to Length, convert time in ns to length in bytes * to determinate how many bytes can be sent in given time. 
*/ static u64 psched_ns_t2l(const struct psched_ratecfg *r, u64 time_in_ns) { /* The formula is : * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC */ u64 len = time_in_ns * r->rate_bytes_ps; do_div(len, NSEC_PER_SEC); if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) { do_div(len, 53); len = len * 48; } if (len > r->overhead) len -= r->overhead; else len = 0; return len; } static void tbf_offload_change(struct Qdisc *sch, struct netlink_ext_ack *extack) { struct tbf_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_tbf_qopt_offload qopt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; qopt.extack = extack; qopt.command = TC_TBF_REPLACE; qopt.handle = sch->handle; qopt.parent = sch->parent; qopt.replace_params.rate = q->rate; qopt.replace_params.max_size = q->max_size; qopt.replace_params.qstats = &sch->qstats; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); } static void tbf_offload_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct tc_tbf_qopt_offload qopt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; qopt.extack = NULL; qopt.command = TC_TBF_DESTROY; qopt.handle = sch->handle; qopt.parent = sch->parent; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); } static int tbf_offload_dump(struct Qdisc *sch) { struct tc_tbf_qopt_offload qopt; qopt.extack = NULL; qopt.command = TC_TBF_STATS; qopt.handle = sch->handle; qopt.parent = sch->parent; qopt.stats.bstats = &sch->bstats; qopt.stats.qstats = &sch->qstats; return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt); } static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new, struct Qdisc *old, struct netlink_ext_ack *extack) { struct tc_tbf_qopt_offload graft_offload = { .handle = sch->handle, .parent = sch->parent, .child_handle = new->handle, .command = TC_TBF_GRAFT, .extack = extack, }; qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old, TC_SETUP_QDISC_TBF, 
&graft_offload, extack); } /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); unsigned int len = 0, prev_len = qdisc_pkt_len(skb), seg_len; int ret, nb; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); nb = 0; skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); seg_len = segs->len; qdisc_skb_cb(segs)->pkt_len = seg_len; qdisc_skb_cb(segs)->pkt_segs = 1; ret = qdisc_enqueue(segs, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); } else { nb++; len += seg_len; } } WRITE_ONCE(sch->q.qlen, sch->q.qlen + nb); qstats_backlog_add(sch, len); if (nb > 0) { qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); consume_skb(skb); return NET_XMIT_SUCCESS; } kfree_skb(skb); return NET_XMIT_DROP; } static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); unsigned int len = qdisc_pkt_len(skb); int ret; if (qdisc_pkt_len(skb) > q->max_size) { if (skb_is_gso(skb) && skb_gso_validate_mac_len(skb, q->max_size)) return tbf_segment(skb, sch, to_free); return qdisc_drop(skb, sch, to_free); } ret = qdisc_enqueue(skb, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); return ret; } qstats_backlog_add(sch, len); qdisc_qlen_inc(sch); return NET_XMIT_SUCCESS; } static bool tbf_peak_present(const struct tbf_sched_data *q) { return q->peak.rate_bytes_ps; } static struct sk_buff *tbf_dequeue(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; skb = q->qdisc->ops->peek(q->qdisc); if (skb) { s64 now; s64 toks; s64 ptoks = 0; unsigned int len 
= qdisc_pkt_len(skb); now = ktime_get_ns(); toks = min_t(s64, now - q->t_c, q->buffer); if (tbf_peak_present(q)) { ptoks = toks + q->ptokens; if (ptoks > q->mtu) ptoks = q->mtu; ptoks -= (s64) psched_l2t_ns(&q->peak, len); } toks += q->tokens; if (toks > q->buffer) toks = q->buffer; toks -= (s64) psched_l2t_ns(&q->rate, len); if ((toks|ptoks) >= 0) { skb = qdisc_dequeue_peeked(q->qdisc); if (unlikely(!skb)) return NULL; q->t_c = now; q->tokens = toks; q->ptokens = ptoks; qdisc_qstats_backlog_dec(sch, skb); qdisc_qlen_dec(sch); qdisc_bstats_update(sch, skb); return skb; } qdisc_watchdog_schedule_ns(&q->watchdog, now + max_t(long, -toks, -ptoks)); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, but, however, this is wrong in principle. We MUST NOT reorder packets under these circumstances. Really, if we split the flow into independent subflows, it would be a very good solution. This is the main idea of all FQ algorithms (cf. CSZ, HPFQ, HFSC) */ qdisc_qstats_overlimit(sch); } return NULL; } static void tbf_reset(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); qdisc_reset(q->qdisc); q->t_c = ktime_get_ns(); q->tokens = q->buffer; q->ptokens = q->mtu; qdisc_watchdog_cancel(&q->watchdog); } static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = { [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) }, [TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_TBF_RATE64] = { .type = NLA_U64 }, [TCA_TBF_PRATE64] = { .type = NLA_U64 }, [TCA_TBF_BURST] = { .type = NLA_U32 }, [TCA_TBF_PBURST] = { .type = NLA_U32 }, }; static int tbf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { int err; struct tbf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_TBF_MAX + 1]; struct tc_tbf_qopt *qopt; struct Qdisc *child = NULL; struct Qdisc *old = NULL; struct psched_ratecfg rate; struct psched_ratecfg peak; u64 
max_size; s64 buffer, mtu; u64 rate64 = 0, prate64 = 0; err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy, NULL); if (err < 0) return err; err = -EINVAL; if (tb[TCA_TBF_PARMS] == NULL) goto done; qopt = nla_data(tb[TCA_TBF_PARMS]); if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) qdisc_put_rtab(qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB], NULL)); if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB], NULL)); buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U); mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U); if (tb[TCA_TBF_RATE64]) rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); psched_ratecfg_precompute(&rate, &qopt->rate, rate64); if (tb[TCA_TBF_BURST]) { max_size = nla_get_u32(tb[TCA_TBF_BURST]); buffer = psched_l2t_ns(&rate, max_size); } else { max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U); } if (qopt->peakrate.rate) { if (tb[TCA_TBF_PRATE64]) prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64); if (peak.rate_bytes_ps <= rate.rate_bytes_ps) { pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n", peak.rate_bytes_ps, rate.rate_bytes_ps); err = -EINVAL; goto done; } if (tb[TCA_TBF_PBURST]) { u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]); max_size = min_t(u32, max_size, pburst); mtu = psched_l2t_ns(&peak, pburst); } else { max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu)); } } else { memset(&peak, 0, sizeof(peak)); } if (max_size < psched_mtu(qdisc_dev(sch))) pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n", max_size, qdisc_dev(sch)->name, psched_mtu(qdisc_dev(sch))); if (!max_size) { err = -EINVAL; goto done; } if (q->qdisc != &noop_qdisc) { err = fifo_set_limit(q->qdisc, qopt->limit); if (err) goto done; } else if (qopt->limit > 0) { child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit, extack); if (IS_ERR(child)) { err = PTR_ERR(child); goto 
done; } /* child is fifo, no need to check for noop_qdisc */ qdisc_hash_add(child, true); } sch_tree_lock(sch); if (child) { qdisc_purge_queue(q->qdisc); old = q->qdisc; q->qdisc = child; } q->limit = qopt->limit; if (tb[TCA_TBF_PBURST]) q->mtu = mtu; else q->mtu = PSCHED_TICKS2NS(qopt->mtu); q->max_size = max_size; if (tb[TCA_TBF_BURST]) q->buffer = buffer; else q->buffer = PSCHED_TICKS2NS(qopt->buffer); q->tokens = q->buffer; q->ptokens = q->mtu; memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg)); memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg)); sch_tree_unlock(sch); qdisc_put(old); err = 0; tbf_offload_change(sch, extack); done: return err; } static int tbf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = &noop_qdisc; if (!opt) return -EINVAL; q->t_c = ktime_get_ns(); return tbf_change(sch, opt, extack); } static void tbf_destroy(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); tbf_offload_destroy(sch); qdisc_put(q->qdisc); } static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tbf_sched_data *q = qdisc_priv(sch); struct nlattr *nest; struct tc_tbf_qopt opt; int err; err = tbf_offload_dump(sch); if (err) return err; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; opt.limit = q->limit; psched_ratecfg_getrate(&opt.rate, &q->rate); if (tbf_peak_present(q)) psched_ratecfg_getrate(&opt.peakrate, &q->peak); else memset(&opt.peakrate, 0, sizeof(opt.peakrate)); opt.mtu = PSCHED_NS2TICKS(q->mtu); opt.buffer = PSCHED_NS2TICKS(q->buffer); if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (q->rate.rate_bytes_ps >= (1ULL << 32) && nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps, TCA_TBF_PAD)) goto nla_put_failure; if (tbf_peak_present(q) && q->peak.rate_bytes_ps >= (1ULL << 32) && 
nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps, TCA_TBF_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static int tbf_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct tbf_sched_data *q = qdisc_priv(sch); tcm->tcm_handle |= TC_H_MIN(1); tcm->tcm_info = q->qdisc->handle; return 0; } static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct tbf_sched_data *q = qdisc_priv(sch); if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->qdisc); tbf_offload_graft(sch, new, *old, extack); return 0; } static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg) { struct tbf_sched_data *q = qdisc_priv(sch); return q->qdisc; } static unsigned long tbf_find(struct Qdisc *sch, u32 classid) { return 1; } static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { tc_qdisc_stats_dump(sch, 1, walker); } } static const struct Qdisc_class_ops tbf_class_ops = { .graft = tbf_graft, .leaf = tbf_leaf, .find = tbf_find, .walk = tbf_walk, .dump = tbf_dump_class, }; static struct Qdisc_ops tbf_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &tbf_class_ops, .id = "tbf", .priv_size = sizeof(struct tbf_sched_data), .enqueue = tbf_enqueue, .dequeue = tbf_dequeue, .peek = qdisc_peek_dequeued, .init = tbf_init, .reset = tbf_reset, .destroy = tbf_destroy, .change = tbf_change, .dump = tbf_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("tbf"); static int __init tbf_module_init(void) { return register_qdisc(&tbf_qdisc_ops); } static void __exit tbf_module_exit(void) { unregister_qdisc(&tbf_qdisc_ops); } module_init(tbf_module_init) module_exit(tbf_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Token Bucket Filter qdisc");
6 6 3 3 2 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Counters support */ /* 2 Comments support */ /* 3 Forceadd support */ /* 4 skbinfo support */ /* 5 bucketsize, initval support */ #define IPSET_TYPE_REV_MAX 6 /* bitmask support 
*/ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip"); /* Type specific function prefix */ #define HTYPE hash_ip #define IP_SET_HASH_WITH_NETMASK #define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ /* Member elements */ struct hash_ip4_elem { /* Zero valued IP addresses cannot be stored */ __be32 ip; }; /* Common functions */ static bool hash_ip4_data_equal(const struct hash_ip4_elem *e1, const struct hash_ip4_elem *e2, u32 *multi) { return e1->ip == e2->ip; } static bool hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) { next->ip = e->ip; } #define MTYPE hash_ip4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); __be32 ip; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); ip &= h->bitmask.ip; if (ip == 0) return -EINVAL; e.ip = ip; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, hosts, i = 0; int ret = 0; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP])) return -IPSET_ERR_PROTOCOL; ret = 
ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ip &= ntohl(h->bitmask.ip); e.ip = htonl(ip); if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; if (adt == IPSET_TEST) return adtfn(set, &e, &ext, &ext, flags); ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) { if (ip_to == 0) return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; i++) { e.ip = htonl(ip); if (i > IPSET_MAX_RANGE) { hash_ip4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ip += hosts; if (ip == 0) return 0; ret = 0; } return ret; } /* IPv6 variant */ /* Member elements */ struct hash_ip6_elem { union nf_inet_addr ip; }; /* Common functions */ static bool hash_ip6_data_equal(const struct hash_ip6_elem *ip1, const struct hash_ip6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } static bool hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ip6_data_next(struct hash_ip6_elem *next, const struct hash_ip6_elem *e) { } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ip6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ip6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; 
struct hash_ip6_elem e = { { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ip6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ip6_elem e = { { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP])) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -IPSET_ERR_HASH_ELEM; ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 
0 : ret; } static struct ip_set_type hash_ip_type __read_mostly = { .name = "hash:ip", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP, .dimension = IPSET_DIM_ONE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ip_init(void) { return ip_set_type_register(&hash_ip_type); } static void __exit hash_ip_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ip_type); } module_init(hash_ip_init); module_exit(hash_ip_fini);
18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 3 34 34 16 3 18 19 16 34 23 10 11 11 11 11 3 18 40 39 40 39 3 3 3 3 11 11 11 11 42 42 42 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 
492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 
992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 // SPDX-License-Identifier: GPL-2.0 /* * llc_conn.c - Driver routines for connection component. * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/init.h> #include <linux/slab.h> #include <net/llc.h> #include <net/llc_c_ac.h> #include <net/llc_c_ev.h> #include <net/llc_c_st.h> #include <net/llc_conn.h> #include <net/llc_pdu.h> #include <net/llc_sap.h> #include <net/sock.h> #include <net/tcp_states.h> #if 0 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) #endif static int llc_find_offset(int state, int ev_type); static void llc_conn_send_pdus(struct sock *sk); static int llc_conn_service(struct sock *sk, struct sk_buff *skb); static int llc_exec_conn_trans_actions(struct sock *sk, const struct llc_conn_state_trans *trans, struct sk_buff *ev); static const struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk, struct sk_buff *skb); /* Offset table on connection states transition diagram */ static int llc_offset_table[NBR_CONN_STATES][NBR_CONN_EV]; int sysctl_llc2_ack_timeout = LLC2_ACK_TIME * HZ; int sysctl_llc2_p_timeout = LLC2_P_TIME * HZ; int sysctl_llc2_rej_timeout = LLC2_REJ_TIME * HZ; int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ; /** * llc_conn_state_process - sends event to connection state machine * @sk: connection * @skb: occurred event * * Sends an event to connection state machine. After processing event * (executing it's actions and changing state), upper layer will be * indicated or confirmed, if needed. Returns 0 for success, 1 for * failure. The socket lock has to be held before calling this function. * * This function always consumes a reference to the skb. 
*/ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) { int rc; struct llc_sock *llc = llc_sk(skb->sk); struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->ind_prim = ev->cfm_prim = 0; /* * Send event to state machine */ rc = llc_conn_service(skb->sk, skb); if (unlikely(rc != 0)) { printk(KERN_ERR "%s: llc_conn_service failed\n", __func__); goto out_skb_put; } switch (ev->ind_prim) { case LLC_DATA_PRIM: skb_get(skb); llc_save_primitive(sk, skb, LLC_DATA_PRIM); if (unlikely(sock_queue_rcv_skb(sk, skb))) { /* * shouldn't happen */ printk(KERN_ERR "%s: sock_queue_rcv_skb failed!\n", __func__); kfree_skb(skb); } break; case LLC_CONN_PRIM: /* * Can't be sock_queue_rcv_skb, because we have to leave the * skb->sk pointing to the newly created struct sock in * llc_conn_handler. -acme */ skb_get(skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_state_change(sk); break; case LLC_DISC_PRIM: sock_hold(sk); if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_ESTABLISHED) { sk->sk_shutdown = SHUTDOWN_MASK; sk->sk_socket->state = SS_UNCONNECTED; sk->sk_state = TCP_CLOSE; if (!sock_flag(sk, SOCK_DEAD)) { sock_set_flag(sk, SOCK_DEAD); sk->sk_state_change(sk); } } sock_put(sk); break; case LLC_RESET_PRIM: /* * FIXME: * RESET is not being notified to upper layers for now */ printk(KERN_INFO "%s: received a reset ind!\n", __func__); break; default: if (ev->ind_prim) printk(KERN_INFO "%s: received unknown %d prim!\n", __func__, ev->ind_prim); /* No indication */ break; } switch (ev->cfm_prim) { case LLC_DATA_PRIM: if (!llc_data_accept_state(llc->state)) sk->sk_write_space(sk); else rc = llc->failed_data_req = 1; break; case LLC_CONN_PRIM: if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_SYN_SENT) { if (ev->status) { sk->sk_socket->state = SS_UNCONNECTED; sk->sk_state = TCP_CLOSE; } else { sk->sk_socket->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; } sk->sk_state_change(sk); } break; case LLC_DISC_PRIM: sock_hold(sk); if (sk->sk_type == SOCK_STREAM 
&& sk->sk_state == TCP_CLOSING) { sk->sk_socket->state = SS_UNCONNECTED; sk->sk_state = TCP_CLOSE; sk->sk_state_change(sk); } sock_put(sk); break; case LLC_RESET_PRIM: /* * FIXME: * RESET is not being notified to upper layers for now */ printk(KERN_INFO "%s: received a reset conf!\n", __func__); break; default: if (ev->cfm_prim) printk(KERN_INFO "%s: received unknown %d prim!\n", __func__, ev->cfm_prim); /* No confirmation */ break; } out_skb_put: kfree_skb(skb); return rc; } void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) { /* queue PDU to send to MAC layer */ skb_queue_tail(&sk->sk_write_queue, skb); llc_conn_send_pdus(sk); } /** * llc_conn_rtn_pdu - sends received data pdu to upper layer * @sk: Active connection * @skb: Received data frame * * Sends received data pdu to upper layer (by using indicate function). * Prepares service parameters (prim and prim_data). calling indication * function will be done in llc_conn_state_process. */ void llc_conn_rtn_pdu(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->ind_prim = LLC_DATA_PRIM; } /** * llc_conn_resend_i_pdu_as_cmd - resend all all unacknowledged I PDUs * @sk: active connection * @nr: NR * @first_p_bit: p_bit value of first pdu * * Resend all unacknowledged I PDUs, starting with the NR; send first as * command PDU with P bit equal first_p_bit; if more than one send * subsequent as command PDUs with P bit equal zero (0). */ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit) { struct sk_buff *skb; struct llc_pdu_sn *pdu; u16 nbr_unack_pdus; struct llc_sock *llc; u8 howmany_resend = 0; llc_conn_remove_acked_pdus(sk, nr, &nbr_unack_pdus); if (!nbr_unack_pdus) goto out; /* * Process unack PDUs only if unack queue is not empty; remove * appropriate PDUs, fix them up, and put them on mac_pdu_q. 
*/ llc = llc_sk(sk); while ((skb = skb_dequeue(&llc->pdu_unack_q)) != NULL) { pdu = llc_pdu_sn_hdr(skb); llc_pdu_set_cmd_rsp(skb, LLC_PDU_CMD); llc_pdu_set_pf_bit(skb, first_p_bit); skb_queue_tail(&sk->sk_write_queue, skb); first_p_bit = 0; llc->vS = LLC_I_GET_NS(pdu); howmany_resend++; } if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ llc_conn_send_pdus(sk); out:; } /** * llc_conn_resend_i_pdu_as_rsp - Resend all unacknowledged I PDUs * @sk: active connection. * @nr: NR * @first_f_bit: f_bit value of first pdu. * * Resend all unacknowledged I PDUs, starting with the NR; send first as * response PDU with F bit equal first_f_bit; if more than one send * subsequent as response PDUs with F bit equal zero (0). */ void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit) { struct sk_buff *skb; u16 nbr_unack_pdus; struct llc_sock *llc = llc_sk(sk); u8 howmany_resend = 0; llc_conn_remove_acked_pdus(sk, nr, &nbr_unack_pdus); if (!nbr_unack_pdus) goto out; /* * Process unack PDUs only if unack queue is not empty; remove * appropriate PDUs, fix them up, and put them on mac_pdu_q */ while ((skb = skb_dequeue(&llc->pdu_unack_q)) != NULL) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); llc_pdu_set_cmd_rsp(skb, LLC_PDU_RSP); llc_pdu_set_pf_bit(skb, first_f_bit); skb_queue_tail(&sk->sk_write_queue, skb); first_f_bit = 0; llc->vS = LLC_I_GET_NS(pdu); howmany_resend++; } if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ llc_conn_send_pdus(sk); out:; } /** * llc_conn_remove_acked_pdus - Removes acknowledged pdus from tx queue * @sk: active connection * @nr: NR * @how_many_unacked: size of pdu_unack_q after removing acked pdus * * Removes acknowledged pdus from transmit queue (pdu_unack_q). Returns * the number of pdus that removed from queue. 
*/ int llc_conn_remove_acked_pdus(struct sock *sk, u8 nr, u16 *how_many_unacked) { int pdu_pos, i; struct sk_buff *skb; struct llc_pdu_sn *pdu; int nbr_acked = 0; struct llc_sock *llc = llc_sk(sk); int q_len = skb_queue_len(&llc->pdu_unack_q); if (!q_len) goto out; skb = skb_peek(&llc->pdu_unack_q); pdu = llc_pdu_sn_hdr(skb); /* finding position of last acked pdu in queue */ pdu_pos = ((int)LLC_2_SEQ_NBR_MODULO + (int)nr - (int)LLC_I_GET_NS(pdu)) % LLC_2_SEQ_NBR_MODULO; for (i = 0; i < pdu_pos && i < q_len; i++) { skb = skb_dequeue(&llc->pdu_unack_q); kfree_skb(skb); nbr_acked++; } out: *how_many_unacked = skb_queue_len(&llc->pdu_unack_q); return nbr_acked; } /** * llc_conn_send_pdus - Sends queued PDUs * @sk: active connection * * Sends queued pdus to MAC layer for transmission. */ static void llc_conn_send_pdus(struct sock *sk) { struct sk_buff *skb; while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if (LLC_PDU_TYPE_IS_I(pdu) && !(skb->dev->flags & IFF_LOOPBACK)) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb); if (!skb2) break; skb = skb2; } dev_queue_xmit(skb); } } /** * llc_conn_service - finds transition and changes state of connection * @sk: connection * @skb: happened event * * This function finds transition that matches with happened event, then * executes related actions and finally changes state of connection. * Returns 0 for success, 1 for failure. 
*/ static int llc_conn_service(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_trans *trans; struct llc_sock *llc = llc_sk(sk); int rc = 1; if (llc->state > NBR_CONN_STATES) goto out; rc = 0; trans = llc_qualify_conn_ev(sk, skb); if (trans) { rc = llc_exec_conn_trans_actions(sk, trans, skb); if (!rc && trans->next_state != NO_STATE_CHANGE) { llc->state = trans->next_state; if (!llc_data_accept_state(llc->state)) sk->sk_state_change(sk); } } out: return rc; } /** * llc_qualify_conn_ev - finds transition for event * @sk: connection * @skb: happened event * * This function finds transition that matches with happened event. * Returns pointer to found transition on success, %NULL otherwise. */ static const struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_trans **next_trans; const llc_conn_ev_qfyr_t *next_qualifier; struct llc_conn_state_ev *ev = llc_conn_ev(skb); struct llc_sock *llc = llc_sk(sk); struct llc_conn_state *curr_state = &llc_conn_state_table[llc->state - 1]; /* search thru events for this state until * list exhausted or until no more */ for (next_trans = curr_state->transitions + llc_find_offset(llc->state - 1, ev->type); (*next_trans)->ev; next_trans++) { if (!((*next_trans)->ev)(sk, skb)) { /* got POSSIBLE event match; the event may require * qualification based on the values of a number of * state flags; if all qualifications are met (i.e., * if all qualifying functions return success, or 0, * then this is THE event we're looking for */ for (next_qualifier = (*next_trans)->ev_qualifiers; next_qualifier && *next_qualifier && !(*next_qualifier)(sk, skb); next_qualifier++) /* nothing */; if (!next_qualifier || !*next_qualifier) /* all qualifiers executed successfully; this is * our transition; return it so we can perform * the associated actions & change the state */ return *next_trans; } } return NULL; } /** * llc_exec_conn_trans_actions - executes related actions * 
@sk: connection * @trans: transition that it's actions must be performed * @skb: event * * Executes actions that is related to happened event. Returns 0 for * success, 1 to indicate failure of at least one action. */ static int llc_exec_conn_trans_actions(struct sock *sk, const struct llc_conn_state_trans *trans, struct sk_buff *skb) { int rc = 0; const llc_conn_action_t *next_action; for (next_action = trans->ev_actions; next_action && *next_action; next_action++) { int rc2 = (*next_action)(sk, skb); if (rc2 == 2) { rc = rc2; break; } else if (rc2) rc = 1; } return rc; } static inline bool llc_estab_match(const struct llc_sap *sap, const struct llc_addr *daddr, const struct llc_addr *laddr, const struct sock *sk, const struct net *net) { struct llc_sock *llc = llc_sk(sk); return net_eq(sock_net(sk), net) && llc->laddr.lsap == laddr->lsap && llc->daddr.lsap == daddr->lsap && ether_addr_equal(llc->laddr.mac, laddr->mac) && ether_addr_equal(llc->daddr.mac, daddr->mac); } /** * __llc_lookup_established - Finds connection for the remote/local sap/mac * @sap: SAP * @daddr: address of remote LLC (MAC + SAP) * @laddr: address of local LLC (MAC + SAP) * @net: netns to look up a socket in * * Search connection list of the SAP and finds connection using the remote * mac, remote sap, local mac, and local sap. Returns pointer for * connection found, %NULL otherwise. * Caller has to make sure local_bh is disabled. 
*/ static struct sock *__llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, struct llc_addr *laddr, const struct net *net) { struct sock *rc; struct hlist_nulls_node *node; int slot = llc_sk_laddr_hashfn(sap, laddr); struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot]; rcu_read_lock(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_estab_match(sap, daddr, laddr, rc, net)) { /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || !llc_estab_match(sap, daddr, laddr, rc, net))) { sock_put(rc); continue; } goto found; } } rc = NULL; /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (unlikely(get_nulls_value(node) != slot)) goto again; found: rcu_read_unlock(); return rc; } struct sock *llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, struct llc_addr *laddr, const struct net *net) { struct sock *sk; local_bh_disable(); sk = __llc_lookup_established(sap, daddr, laddr, net); local_bh_enable(); return sk; } static inline bool llc_listener_match(const struct llc_sap *sap, const struct llc_addr *laddr, const struct sock *sk, const struct net *net) { struct llc_sock *llc = llc_sk(sk); return net_eq(sock_net(sk), net) && sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN && llc->laddr.lsap == laddr->lsap && ether_addr_equal(llc->laddr.mac, laddr->mac); } static struct sock *__llc_lookup_listener(struct llc_sap *sap, struct llc_addr *laddr, const struct net *net) { struct sock *rc; struct hlist_nulls_node *node; int slot = llc_sk_laddr_hashfn(sap, laddr); struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot]; rcu_read_lock(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_listener_match(sap, laddr, rc, net)) { /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if 
(unlikely(!refcount_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || !llc_listener_match(sap, laddr, rc, net))) { sock_put(rc); continue; } goto found; } } rc = NULL; /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (unlikely(get_nulls_value(node) != slot)) goto again; found: rcu_read_unlock(); return rc; } /** * llc_lookup_listener - Finds listener for local MAC + SAP * @sap: SAP * @laddr: address of local LLC (MAC + SAP) * @net: netns to look up a socket in * * Search connection list of the SAP and finds connection listening on * local mac, and local sap. Returns pointer for parent socket found, * %NULL otherwise. * Caller has to make sure local_bh is disabled. */ static struct sock *llc_lookup_listener(struct llc_sap *sap, struct llc_addr *laddr, const struct net *net) { struct sock *rc = __llc_lookup_listener(sap, laddr, net); static struct llc_addr null_addr; if (!rc) rc = __llc_lookup_listener(sap, &null_addr, net); return rc; } static struct sock *__llc_lookup(struct llc_sap *sap, struct llc_addr *daddr, struct llc_addr *laddr, const struct net *net) { struct sock *sk = __llc_lookup_established(sap, daddr, laddr, net); return sk ? : llc_lookup_listener(sap, laddr, net); } /** * llc_data_accept_state - designates if in this state data can be sent. * @state: state of connection. * * Returns 0 if data can be sent, 1 otherwise. */ u8 llc_data_accept_state(u8 state) { return state != LLC_CONN_STATE_NORMAL && state != LLC_CONN_STATE_BUSY && state != LLC_CONN_STATE_REJ; } /** * llc_find_next_offset - finds offset for next category of transitions * @state: state table. * @offset: start offset. * * Finds offset of next category of transitions in transition table. * Returns the start index of next category. 
*/ static u16 __init llc_find_next_offset(struct llc_conn_state *state, u16 offset) { const struct llc_conn_state_trans **next_trans; u16 cnt = 0; for (next_trans = state->transitions + offset; (*next_trans)->ev; next_trans++) ++cnt; return cnt; } /** * llc_build_offset_table - builds offset table of connection * * Fills offset table of connection state transition table * (llc_offset_table). */ void __init llc_build_offset_table(void) { struct llc_conn_state *curr_state; int state, ev_type, next_offset; for (state = 0; state < NBR_CONN_STATES; state++) { curr_state = &llc_conn_state_table[state]; next_offset = 0; for (ev_type = 0; ev_type < NBR_CONN_EV; ev_type++) { llc_offset_table[state][ev_type] = next_offset; next_offset += llc_find_next_offset(curr_state, next_offset) + 1; } } } /** * llc_find_offset - finds start offset of category of transitions * @state: state of connection * @ev_type: type of happened event * * Finds start offset of desired category of transitions. Returns the * desired start offset. */ static int llc_find_offset(int state, int ev_type) { int rc = 0; /* at this stage, llc_offset_table[..][2] is not important. it is for * init_pf_cycle and I don't know what is it. */ switch (ev_type) { case LLC_CONN_EV_TYPE_PRIM: rc = llc_offset_table[state][0]; break; case LLC_CONN_EV_TYPE_PDU: rc = llc_offset_table[state][4]; break; case LLC_CONN_EV_TYPE_SIMPLE: rc = llc_offset_table[state][1]; break; case LLC_CONN_EV_TYPE_P_TMR: case LLC_CONN_EV_TYPE_ACK_TMR: case LLC_CONN_EV_TYPE_REJ_TMR: case LLC_CONN_EV_TYPE_BUSY_TMR: rc = llc_offset_table[state][3]; break; } return rc; } /** * llc_sap_add_socket - adds a socket to a SAP * @sap: SAP * @sk: socket * * This function adds a socket to the hash tables of a SAP. 
*/ void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk) { struct llc_sock *llc = llc_sk(sk); struct hlist_head *dev_hb = llc_sk_dev_hash(sap, llc->dev->ifindex); struct hlist_nulls_head *laddr_hb = llc_sk_laddr_hash(sap, &llc->laddr); llc_sap_hold(sap); llc_sk(sk)->sap = sap; spin_lock_bh(&sap->sk_lock); sock_set_flag(sk, SOCK_RCU_FREE); sap->sk_count++; sk_nulls_add_node_rcu(sk, laddr_hb); hlist_add_head(&llc->dev_hash_node, dev_hb); spin_unlock_bh(&sap->sk_lock); } /** * llc_sap_remove_socket - removes a socket from SAP * @sap: SAP * @sk: socket * * This function removes a connection from the hash tables of a SAP if * the connection was in this list. */ void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk) { struct llc_sock *llc = llc_sk(sk); spin_lock_bh(&sap->sk_lock); sk_nulls_del_node_init_rcu(sk); hlist_del(&llc->dev_hash_node); sap->sk_count--; spin_unlock_bh(&sap->sk_lock); llc_sap_put(sap); } /** * llc_conn_rcv - sends received pdus to the connection state machine * @sk: current connection structure. * @skb: received frame. * * Sends received pdus to the connection state machine. 
*/ static int llc_conn_rcv(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->type = LLC_CONN_EV_TYPE_PDU; ev->reason = 0; return llc_conn_state_process(sk, skb); } static struct sock *llc_create_incoming_sock(struct sock *sk, struct net_device *dev, struct llc_addr *saddr, struct llc_addr *daddr) { struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC, sk->sk_prot, 0); struct llc_sock *newllc, *llc = llc_sk(sk); if (!newsk) goto out; newllc = llc_sk(newsk); memcpy(&newllc->laddr, daddr, sizeof(newllc->laddr)); memcpy(&newllc->daddr, saddr, sizeof(newllc->daddr)); newllc->dev = dev; dev_hold(dev); llc_sap_add_socket(llc->sap, newsk); llc_sap_hold(llc->sap); out: return newsk; } void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) { struct llc_addr saddr, daddr; struct sock *sk; llc_pdu_decode_sa(skb, saddr.mac); llc_pdu_decode_ssap(skb, &saddr.lsap); llc_pdu_decode_da(skb, daddr.mac); llc_pdu_decode_dsap(skb, &daddr.lsap); sk = __llc_lookup(sap, &saddr, &daddr, dev_net(skb->dev)); if (!sk) goto drop; bh_lock_sock(sk); /* * This has to be done here and not at the upper layer ->accept * method because of the way the PROCOM state machine works: * it needs to set several state variables (see, for instance, * llc_adm_actions_2 in net/llc/llc_c_st.c) and send a packet to * the originator of the new connection, and this state has to be * in the newly created struct sock private area. -acme */ if (unlikely(sk->sk_state == TCP_LISTEN)) { struct sock *newsk = llc_create_incoming_sock(sk, skb->dev, &saddr, &daddr); if (!newsk) goto drop_unlock; skb_set_owner_r(skb, newsk); } else { /* * Can't be skb_set_owner_r, this will be done at the * llc_conn_state_process function, later on, when we will use * skb_queue_rcv_skb to send it to upper layers, this is * another trick required to cope with how the PROCOM state * machine works. 
-acme */ skb_orphan(skb); sock_hold(sk); skb->sk = sk; skb->destructor = sock_efree; } if (!sock_owned_by_user(sk)) llc_conn_rcv(sk, skb); else { dprintk("%s: adding to backlog...\n", __func__); llc_set_backlog_type(skb, LLC_PACKET); if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) goto drop_unlock; } out: bh_unlock_sock(sk); sock_put(sk); return; drop: kfree_skb(skb); return; drop_unlock: kfree_skb(skb); goto out; } #undef LLC_REFCNT_DEBUG #ifdef LLC_REFCNT_DEBUG static atomic_t llc_sock_nr; #endif /** * llc_backlog_rcv - Processes rx frames and expired timers. * @sk: LLC sock (p8022 connection) * @skb: queued rx frame or event * * This function processes frames that has received and timers that has * expired during sending an I pdu (refer to data_req_handler). frames * queue by llc_rcv function (llc_mac.c) and timers queue by timer * callback functions(llc_c_ac.c). */ static int llc_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int rc = 0; struct llc_sock *llc = llc_sk(sk); if (likely(llc_backlog_type(skb) == LLC_PACKET)) { if (likely(llc->state > 1)) /* not closed */ rc = llc_conn_rcv(sk, skb); else goto out_kfree_skb; } else if (llc_backlog_type(skb) == LLC_EVENT) { /* timer expiration event */ if (likely(llc->state > 1)) /* not closed */ rc = llc_conn_state_process(sk, skb); else goto out_kfree_skb; } else { printk(KERN_ERR "%s: invalid skb in backlog\n", __func__); goto out_kfree_skb; } out: return rc; out_kfree_skb: kfree_skb(skb); goto out; } /** * llc_sk_init - Initializes a socket with default llc values. * @sk: socket to initialize. * * Initializes a socket with default llc values. 
*/ static void llc_sk_init(struct sock *sk) { struct llc_sock *llc = llc_sk(sk); llc->state = LLC_CONN_STATE_ADM; llc->inc_cntr = llc->dec_cntr = 2; llc->dec_step = llc->connect_step = 1; timer_setup(&llc->ack_timer.timer, llc_conn_ack_tmr_cb, 0); llc->ack_timer.expire = sysctl_llc2_ack_timeout; timer_setup(&llc->pf_cycle_timer.timer, llc_conn_pf_cycle_tmr_cb, 0); llc->pf_cycle_timer.expire = sysctl_llc2_p_timeout; timer_setup(&llc->rej_sent_timer.timer, llc_conn_rej_tmr_cb, 0); llc->rej_sent_timer.expire = sysctl_llc2_rej_timeout; timer_setup(&llc->busy_state_timer.timer, llc_conn_busy_tmr_cb, 0); llc->busy_state_timer.expire = sysctl_llc2_busy_timeout; llc->n2 = 2; /* max retransmit */ llc->k = 2; /* tx win size, will adjust dynam */ llc->rw = 128; /* rx win size (opt and equal to * tx_win of remote LLC) */ skb_queue_head_init(&llc->pdu_unack_q); sk->sk_backlog_rcv = llc_backlog_rcv; } /** * llc_sk_alloc - Allocates LLC sock * @net: network namespace * @family: upper layer protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance * @kern: is this to be a kernel socket? * * Allocates a LLC sock and initializes it. 
Returns the new LLC sock * or %NULL if there's no memory available for one */ struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { struct sock *sk = sk_alloc(net, family, priority, prot, kern); if (!sk) goto out; llc_sk_init(sk); sock_init_data(NULL, sk); #ifdef LLC_REFCNT_DEBUG atomic_inc(&llc_sock_nr); printk(KERN_DEBUG "LLC socket %p created in %s, now we have %d alive\n", sk, __func__, atomic_read(&llc_sock_nr)); #endif out: return sk; } void llc_sk_stop_all_timers(struct sock *sk, bool sync) { struct llc_sock *llc = llc_sk(sk); if (sync) { timer_delete_sync(&llc->pf_cycle_timer.timer); timer_delete_sync(&llc->ack_timer.timer); timer_delete_sync(&llc->rej_sent_timer.timer); timer_delete_sync(&llc->busy_state_timer.timer); } else { timer_delete(&llc->pf_cycle_timer.timer); timer_delete(&llc->ack_timer.timer); timer_delete(&llc->rej_sent_timer.timer); timer_delete(&llc->busy_state_timer.timer); } llc->ack_must_be_send = 0; llc->ack_pf = 0; } /** * llc_sk_free - Frees a LLC socket * @sk: - socket to free * * Frees a LLC socket */ void llc_sk_free(struct sock *sk) { struct llc_sock *llc = llc_sk(sk); llc->state = LLC_CONN_OUT_OF_SVC; /* Stop all (possibly) running timers */ llc_sk_stop_all_timers(sk, true); #ifdef DEBUG_LLC_CONN_ALLOC printk(KERN_INFO "%s: unackq=%d, txq=%d\n", __func__, skb_queue_len(&llc->pdu_unack_q), skb_queue_len(&sk->sk_write_queue)); #endif skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); skb_queue_purge(&llc->pdu_unack_q); #ifdef LLC_REFCNT_DEBUG if (refcount_read(&sk->sk_refcnt) != 1) { printk(KERN_DEBUG "Destruction of LLC sock %p delayed in %s, cnt=%d\n", sk, __func__, refcount_read(&sk->sk_refcnt)); printk(KERN_DEBUG "%d LLC sockets are still alive\n", atomic_read(&llc_sock_nr)); } else { atomic_dec(&llc_sock_nr); printk(KERN_DEBUG "LLC socket %p released in %s, %d are still alive\n", sk, __func__, atomic_read(&llc_sock_nr)); } #endif sock_put(sk); } /** * 
llc_sk_reset - resets a connection * @sk: LLC socket to reset * * Resets a connection to the out of service state. Stops its timers * and frees any frames in the queues of the connection. */ void llc_sk_reset(struct sock *sk) { struct llc_sock *llc = llc_sk(sk); llc_conn_ac_stop_all_timers(sk, NULL); skb_queue_purge(&sk->sk_write_queue); skb_queue_purge(&llc->pdu_unack_q); llc->remote_busy_flag = 0; llc->cause_flag = 0; llc->retry_count = 0; llc_conn_set_p_flag(sk, 0); llc->f_flag = 0; llc->s_flag = 0; llc->ack_pf = 0; llc->first_pdu_Ns = 0; llc->ack_must_be_send = 0; llc->dec_step = 1; llc->inc_cntr = 2; llc->dec_cntr = 2; llc->X = 0; llc->failed_data_req = 0 ; llc->last_nr = 0; }
15595 44 229 16113 44 15476 15468 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NET_SCM_H
#define __LINUX_NET_SCM_H

#include <linux/limits.h>
#include <linux/net.h>
#include <linux/cred.h>
#include <linux/file.h>
#include <linux/security.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/sched/signal.h>
#include <net/compat.h>

/* Well, we should have at least one descriptor open
 * to accept passed FDs 8)
 */
#define SCM_MAX_FD	253

/* Credentials triple stored in a cookie: numeric pid plus kernel uid/gid. */
struct scm_creds {
	u32	pid;
	kuid_t	uid;
	kgid_t	gid;
};

#ifdef CONFIG_UNIX
struct unix_edge;
#endif

/* List of passed files; fp[] holds at most SCM_MAX_FD entries. */
struct scm_fp_list {
	short			count;
	short			count_unix;
	short			max;
#ifdef CONFIG_UNIX
	bool			inflight;
	bool			dead;
	struct list_head	vertices;
	struct unix_edge	*edges;
#endif
	struct user_struct	*user;
	struct file		*fp[SCM_MAX_FD];
};

/* Per-message control state: sender pid, passed files and credentials. */
struct scm_cookie {
	struct pid		*pid;		/* Skb credentials */
	struct scm_fp_list	*fp;		/* Passed files		*/
	struct scm_creds	creds;		/* Skb credentials	*/
#ifdef CONFIG_SECURITY_NETWORK
	u32			secid;		/* Passed security ID 	*/
#endif
};

void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm);
void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm);
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm);
void __scm_destroy(struct scm_cookie *scm);
struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl);

#ifdef CONFIG_SECURITY_NETWORK
/* Stores the result of security_socket_getpeersec_dgram() in scm->secid. */
static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm)
{
	security_socket_getpeersec_dgram(sock, NULL, &scm->secid);
}
#else
/* No-op when CONFIG_SECURITY_NETWORK is not set. */
static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm)
{ }
#endif /* CONFIG_SECURITY_NETWORK */

/*
 * Records @pid, @uid and @gid in the cookie.  scm->pid is set from
 * get_pid(pid) and is released again by scm_destroy_cred(); creds.pid is
 * the value returned by pid_vnr(pid).
 */
static __inline__ void scm_set_cred(struct scm_cookie *scm,
				    struct pid *pid, kuid_t uid, kgid_t gid)
{
	scm->pid  = get_pid(pid);
	scm->creds.pid = pid_vnr(pid);
	scm->creds.uid = uid;
	scm->creds.gid = gid;
}

/* Releases scm->pid with put_pid() and clears the pointer. */
static __inline__ void scm_destroy_cred(struct scm_cookie *scm)
{
	put_pid(scm->pid);
	scm->pid  = NULL;
}

/* Releases the credentials and, when files are attached, the file list. */
static __inline__ void scm_destroy(struct scm_cookie *scm)
{
	scm_destroy_cred(scm);
	if (scm->fp)
		__scm_destroy(scm);
}

/*
 * Prepares @scm for sending: zeroes it, marks uid/gid invalid, fills in the
 * current task's credentials when @forcecreds is set and fetches the peer
 * security ID.  Returns 0 when @msg carries no control data, otherwise the
 * result of __scm_send().
 */
static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
			       struct scm_cookie *scm, bool forcecreds)
{
	memset(scm, 0, sizeof(*scm));
	scm->creds.uid = INVALID_UID;
	scm->creds.gid = INVALID_GID;
	if (forcecreds)
		scm_set_cred(scm, task_tgid(current), current_uid(), current_gid());
	unix_get_peersec_dgram(sock, scm);
	if (msg->msg_controllen <= 0)
		return 0;
	return __scm_send(sock, msg, scm);
}

void scm_recv(struct socket *sock, struct msghdr *msg,
	      struct scm_cookie *scm, int flags);
void scm_recv_unix(struct socket *sock, struct msghdr *msg,
		   struct scm_cookie *scm, int flags);

/*
 * Passes @f to receive_fd() with @ufd and @flags.  Returns -EFAULT when
 * @ufd is NULL, otherwise whatever receive_fd() returns.
 */
static inline int scm_recv_one_fd(struct file *f, int __user *ufd,
				  unsigned int flags)
{
	if (!ufd)
		return -EFAULT;
	return receive_fd(f, ufd, flags);
}

#endif /* __LINUX_NET_SCM_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 #ifndef __NET_SCHED_CODEL_H #define __NET_SCHED_CODEL_H /* * Codel - The Controlled-Delay Active Queue Management algorithm * * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com> * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net> * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/types.h> #include <linux/ktime.h> #include <linux/skbuff.h> /* Controlling Queue Delay (CoDel) algorithm * ========================================= * Source : Kathleen Nichols and Van Jacobson * http://queue.acm.org/detail.cfm?id=2209336 * * Implemented on linux by Dave Taht and Eric Dumazet */ /* CoDel uses a 1024 nsec clock, encoded in u32 * This gives a range of 2199 seconds, because of signed compares */ typedef u32 codel_time_t; typedef s32 codel_tdiff_t; #define CODEL_SHIFT 10 #define MS2TIME(a) ((a * NSEC_PER_MSEC) >> CODEL_SHIFT) static inline codel_time_t codel_get_time(void) { u64 ns = ktime_get_ns(); return ns >> CODEL_SHIFT; } /* Dealing with timer wrapping, according to RFC 1982, as desc in wikipedia: * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution * codel_time_after(a,b) returns true if the time a is after time b. 
*/ #define codel_time_after(a, b) \ (typecheck(codel_time_t, a) && \ typecheck(codel_time_t, b) && \ ((s32)((a) - (b)) > 0)) #define codel_time_before(a, b) codel_time_after(b, a) #define codel_time_after_eq(a, b) \ (typecheck(codel_time_t, a) && \ typecheck(codel_time_t, b) && \ ((s32)((a) - (b)) >= 0)) #define codel_time_before_eq(a, b) codel_time_after_eq(b, a) static inline u32 codel_time_to_us(codel_time_t val) { u64 valns = ((u64)val << CODEL_SHIFT); do_div(valns, NSEC_PER_USEC); return (u32)valns; } /** * struct codel_params - contains codel parameters * @target: target queue size (in time units) * @ce_threshold: threshold for marking packets with ECN CE * @interval: width of moving time window * @mtu: device mtu, or minimal queue backlog in bytes. * @ecn: is Explicit Congestion Notification enabled * @ce_threshold_selector: apply ce_threshold to packets matching this value * in the diffserv/ECN byte of the IP header * @ce_threshold_mask: mask to apply to ce_threshold_selector comparison */ struct codel_params { codel_time_t target; codel_time_t ce_threshold; codel_time_t interval; u32 mtu; bool ecn; u8 ce_threshold_selector; u8 ce_threshold_mask; }; /** * struct codel_vars - contains codel variables * @count: how many drops we've done since the last time we * entered dropping state * @lastcount: count at entry to dropping state * @dropping: set to true if in dropping state * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 * @first_above_time: when we went (or will go) continuously above target * for interval * @drop_next: time to drop next packet, or when we dropped last * @ldelay: sojourn time of last dequeued packet */ struct codel_vars { u32 count; u32 lastcount; bool dropping; u16 rec_inv_sqrt; codel_time_t first_above_time; codel_time_t drop_next; codel_time_t ldelay; }; #define REC_INV_SQRT_BITS (8 * sizeof(u16)) /* or sizeof_in_bits(rec_inv_sqrt) */ /* needed shift to get a Q0.32 number from rec_inv_sqrt */ #define REC_INV_SQRT_SHIFT (32 - 
REC_INV_SQRT_BITS) /** * struct codel_stats - contains codel shared variables and stats * @maxpacket: largest packet we've seen so far * @drop_count: temp count of dropped packets in dequeue() * @drop_len: bytes of dropped packets in dequeue() * @ecn_mark: number of packets we ECN marked instead of dropping * @ce_mark: number of packets CE marked because sojourn time was above ce_threshold */ struct codel_stats { u32 maxpacket; u32 drop_count; u32 drop_len; u32 ecn_mark; u32 ce_mark; }; #define CODEL_DISABLED_THRESHOLD INT_MAX typedef u32 (*codel_skb_len_t)(const struct sk_buff *skb); typedef codel_time_t (*codel_skb_time_t)(const struct sk_buff *skb); typedef void (*codel_skb_drop_t)(struct sk_buff *skb, void *ctx); typedef struct sk_buff * (*codel_skb_dequeue_t)(struct codel_vars *vars, void *ctx); #endif
261 261 64 90 10 82 75 75 75 90 28 1 27 28 28 56 14 56 13 33 54 1 56 56 21 56 56 56 117 114 117 92 28 56 2 117 255 256 255 252 46 256 67 67 67 3 65 65 67 67 67 22 22 22 22 22 22 22 22 44 44 43 44 44 44 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 
477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/mlock.c * * (C) Copyright 1995 Linus Torvalds * (C) Copyright 2002 Christoph Hellwig */ #include <linux/capability.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/sched/user.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pagemap.h> #include <linux/folio_batch.h> #include <linux/pagewalk.h> #include <linux/mempolicy.h> #include <linux/syscalls.h> #include <linux/sched.h> #include <linux/export.h> #include <linux/rmap.h> #include <linux/mmzone.h> #include <linux/hugetlb.h> #include 
<linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"

/* Per-CPU folio batch used by the mlock/munlock paths, with its local lock. */
struct mlock_fbatch {
	local_lock_t lock;
	struct folio_batch fbatch;
};

static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

/*
 * can_do_mlock - may the caller attempt to mlock memory at all?
 *
 * Returns true when RLIMIT_MEMLOCK is non-zero or the caller has
 * CAP_IPC_LOCK, false otherwise.
 */
bool can_do_mlock(void)
{
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return true;
	if (capable(CAP_IPC_LOCK))
		return true;
	return false;
}
EXPORT_SYMBOL(can_do_mlock);

/*
 * Mlocked folios are marked with the PG_mlocked flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked folio [folio_test_mlocked(folio)] is unevictable.  As such, it
 * will be ostensibly placed on the LRU "unevictable" list (actually no such
 * list exists), rather than the [in]active lists. PG_unevictable is set to
 * indicate the unevictable state.
 */

/*
 * Process one batch entry that was queued with mlock_lru().  Takes (or
 * switches to) the folio's lruvec lock via folio_lruvec_relock_irq() and
 * returns the lruvec that is locked on exit, so the caller can keep passing
 * it along and unlock once at the end.  Returns @lruvec untouched when the
 * folio is not currently on the LRU.
 */
static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec)
{
	/* There is nothing more we can do while it's off LRU */
	if (!folio_test_clear_lru(folio))
		return lruvec;

	lruvec = folio_lruvec_relock_irq(folio, lruvec);

	if (unlikely(folio_evictable(folio))) {
		/*
		 * This is a little surprising, but quite possible: PG_mlocked
		 * must have got cleared already by another CPU.  Could this
		 * folio be unevictable?  I'm not sure, but move it now if so.
		 */
		if (folio_test_unevictable(folio)) {
			lruvec_del_folio(lruvec, folio);
			folio_clear_unevictable(folio);
			lruvec_add_folio(lruvec, folio);

			__count_vm_events(UNEVICTABLE_PGRESCUED,
					  folio_nr_pages(folio));
		}
		goto out;
	}

	/* Already marked unevictable: only the mlock_count needs raising. */
	if (folio_test_unevictable(folio)) {
		if (folio_test_mlocked(folio))
			folio->mlock_count++;
		goto out;
	}

	/* First cull: re-add the folio to the lruvec as unevictable. */
	lruvec_del_folio(lruvec, folio);
	folio_clear_active(folio);
	folio_set_unevictable(folio);
	folio->mlock_count = !!folio_test_mlocked(folio);
	lruvec_add_folio(lruvec, folio);
	__count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
	/* Restore the LRU flag cleared by folio_test_clear_lru() above. */
	folio_set_lru(folio);
	return lruvec;
}

/*
 * Process one batch entry that was queued with mlock_new(): the folio must
 * not be on the LRU yet (asserted below).  It is added to the lruvec here,
 * marked unevictable first unless folio_evictable() says otherwise.
 * Returns the lruvec that is locked on exit.
 */
static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec)
{
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

	lruvec = folio_lruvec_relock_irq(folio, lruvec);

	/* As above, this is a little surprising, but possible */
	if (unlikely(folio_evictable(folio)))
		goto out;

	folio_set_unevictable(folio);
	folio->mlock_count = !!folio_test_mlocked(folio);
	__count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
	lruvec_add_folio(lruvec, folio);
	folio_set_lru(folio);
	return lruvec;
}

/*
 * Process one batch entry that carries no flag bits, i.e. a munlock request.
 * "isolated" records whether this call managed to clear the LRU flag; only
 * then is the lruvec locked and the folio's list placement changed.
 * Returns the lruvec that is locked on exit (unchanged if not isolated).
 */
static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec)
{
	int nr_pages = folio_nr_pages(folio);
	bool isolated = false;

	if (!folio_test_clear_lru(folio))
		goto munlock;

	isolated = true;
	lruvec = folio_lruvec_relock_irq(folio, lruvec);

	if (folio_test_unevictable(folio)) {
		/* Then mlock_count is maintained, but might undercount */
		if (folio->mlock_count)
			folio->mlock_count--;
		if (folio->mlock_count)
			goto out;
	}
	/* else assume that was the last mlock: reclaim will fix it if not */

munlock:
	if (folio_test_clear_mlocked(folio)) {
		__zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
		if (isolated || !folio_test_unevictable(folio))
			__count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
		else
			__count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
	}

	/* folio_evictable() has to be checked *after* clearing Mlocked */
	if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) {
		lruvec_del_folio(lruvec, folio);
		folio_clear_unevictable(folio);
		lruvec_add_folio(lruvec, folio);
		__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
	}
out:
	if (isolated)
		folio_set_lru(folio);
	return lruvec;
}

/*
 * Flags held in the low bits of a struct folio pointer on the mlock_fbatch.
 */
#define LRU_FOLIO	0x1
#define NEW_FOLIO	0x2

/* Tag @folio's pointer value with LRU_FOLIO for storage in the batch. */
static inline struct folio *mlock_lru(struct folio *folio)
{
	return (struct folio *)((unsigned long)folio + LRU_FOLIO);
}

/* Tag @folio's pointer value with NEW_FOLIO for storage in the batch. */
static inline struct folio *mlock_new(struct folio *folio)
{
	return (struct folio *)((unsigned long)folio + NEW_FOLIO);
}

/*
 * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can
 * make use of such folio pointer flags in future, but for now just keep it for
 * mlock.  We could use three separate folio batches instead, but one feels
 * better (munlocking a full folio batch does not need to drain mlocking folio
 * batches first).
 */
static void mlock_folio_batch(struct folio_batch *fbatch)
{
	struct lruvec *lruvec = NULL;
	unsigned long mlock;
	struct folio *folio;
	int i;

	for (i = 0; i < folio_batch_count(fbatch); i++) {
		folio = fbatch->folios[i];
		/* Split the tagged entry into flag bits and the real pointer. */
		mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO);
		folio = (struct folio *)((unsigned long)folio - mlock);
		/* Store the untagged pointer back so folios_put() sees it. */
		fbatch->folios[i] = folio;

		if (mlock & LRU_FOLIO)
			lruvec = __mlock_folio(folio, lruvec);
		else if (mlock & NEW_FOLIO)
			lruvec = __mlock_new_folio(folio, lruvec);
		else
			lruvec = __munlock_folio(folio, lruvec);
	}

	if (lruvec)
		lruvec_unlock_irq(lruvec);
	/* Drops the references taken by folio_get() when entries were queued. */
	folios_put(fbatch);
}

/* Process this CPU's pending batch, if it holds any entries. */
void mlock_drain_local(void)
{
	struct folio_batch *fbatch;

	local_lock(&mlock_fbatch.lock);
	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
	if (folio_batch_count(fbatch))
		mlock_folio_batch(fbatch);
	local_unlock(&mlock_fbatch.lock);
}

/*
 * Process the pending batch of @cpu without taking the local lock; warns
 * once if @cpu is still online.
 */
void mlock_drain_remote(int cpu)
{
	struct folio_batch *fbatch;

	WARN_ON_ONCE(cpu_online(cpu));
	fbatch = &per_cpu(mlock_fbatch.fbatch, cpu);
	if (folio_batch_count(fbatch))
		mlock_folio_batch(fbatch);
}

/* Returns true when @cpu's batch holds at least one entry. */
bool need_mlock_drain(int cpu)
{
	return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu));
}

/**
 * mlock_folio - mlock a folio already on (or temporarily off) LRU
 * @folio: folio to be mlocked.
 */
void mlock_folio(struct folio *folio)
{
	struct folio_batch *fbatch;

	local_lock(&mlock_fbatch.lock);
	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);

	/* Account only on the transition from not-mlocked to mlocked. */
	if (!folio_test_set_mlocked(folio)) {
		int nr_pages = folio_nr_pages(folio);

		zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
		__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
	}

	/* Reference held while the folio sits in the batch. */
	folio_get(folio);
	if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
	    !folio_may_be_lru_cached(folio) || lru_cache_disabled())
		mlock_folio_batch(fbatch);
	local_unlock(&mlock_fbatch.lock);
}

/**
 * mlock_new_folio - mlock a newly allocated folio not yet on LRU
 * @folio: folio to be mlocked, either normal or a THP head.
 */
void mlock_new_folio(struct folio *folio)
{
	struct folio_batch *fbatch;
	int nr_pages = folio_nr_pages(folio);

	local_lock(&mlock_fbatch.lock);
	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
	folio_set_mlocked(folio);

	zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
	__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);

	/* Reference held while the folio sits in the batch. */
	folio_get(folio);
	if (!folio_batch_add(fbatch, mlock_new(folio)) ||
	    !folio_may_be_lru_cached(folio) || lru_cache_disabled())
		mlock_folio_batch(fbatch);
	local_unlock(&mlock_fbatch.lock);
}

/**
 * munlock_folio - munlock a folio
 * @folio: folio to be munlocked, either normal or a THP head.
 */
void munlock_folio(struct folio *folio)
{
	struct folio_batch *fbatch;

	local_lock(&mlock_fbatch.lock);
	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
	/*
	 * folio_test_clear_mlocked(folio) must be left to __munlock_folio(),
	 * which will check whether the folio is multiply mlocked.
	 */
	folio_get(folio);
	/* Queued untagged: mlock_folio_batch() treats that as a munlock. */
	if (!folio_batch_add(fbatch, folio) ||
	    !folio_may_be_lru_cached(folio) || lru_cache_disabled())
		mlock_folio_batch(fbatch);
	local_unlock(&mlock_fbatch.lock);
}

/*
 * Number of ptes the walk in mlock_pte_range() should advance for @folio:
 * 1 for a small folio, otherwise whatever folio_pte_batch() reports for the
 * ptes starting at @pte, bounded by the pages left in [@addr, @end).
 */
static inline unsigned int folio_mlock_step(struct folio *folio,
		pte_t *pte, unsigned long addr, unsigned long end)
{
	unsigned int count = (end - addr) >> PAGE_SHIFT;
	pte_t ptent = ptep_get(pte);

	if (!folio_test_large(folio))
		return 1;

	return folio_pte_batch(folio, pte, ptent, count);
}

/*
 * Decide whether mlock_pte_range() should act on @folio.  Munlock (vma not
 * VM_LOCKED) and small folios are always allowed; a large folio is mlocked
 * only when it lies within [@start, @end) of @vma and @step covers all of
 * its pages.
 */
static inline bool allow_mlock_munlock(struct folio *folio,
		struct vm_area_struct *vma, unsigned long start,
		unsigned long end, unsigned int step)
{
	/*
	 * For unlock, allow munlock large folio which is partially
	 * mapped to VMA. As it's possible that large folio is
	 * mlocked and VMA is split later.
	 *
	 * During memory pressure, such kind of large folio can
	 * be split. And the pages are not in VM_LOCKed VMA
	 * can be reclaimed.
	 */
	if (!(vma->vm_flags & VM_LOCKED))
		return true;

	/* folio_within_range() cannot take KSM, but any small folio is OK */
	if (!folio_test_large(folio))
		return true;

	/* folio not in range [start, end), skip mlock */
	if (!folio_within_range(folio, vma, start, end))
		return false;

	/* folio is not fully mapped, skip mlock */
	if (step != folio_nr_pages(folio))
		return false;

	return true;
}

/*
 * pmd_entry callback of mlock_walk_ops.  Mlocks or munlocks, according to
 * VM_LOCKED on walk->vma, either the folio mapped by a huge pmd or each
 * folio mapped by the ptes in [@addr, @end).  Always returns 0; when the pte
 * table cannot be mapped, walk->action is set to ACTION_AGAIN first.
 */
static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte;
	pte_t ptent;
	struct folio *folio;
	unsigned int step = 1;
	unsigned long start = addr;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		if (!pmd_present(*pmd))
			goto out;
		if (is_huge_zero_pmd(*pmd))
			goto out;
		folio = pmd_folio(*pmd);
		if (folio_is_zone_device(folio))
			goto out;
		if (vma->vm_flags & VM_LOCKED)
			mlock_folio(folio);
		else
			munlock_folio(folio);
		goto out;
	}

	start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}

	for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);
		if (!pte_present(ptent))
			continue;
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		step = folio_mlock_step(folio, pte, addr, end);
		if (!allow_mlock_munlock(folio, vma, start, end, step))
			goto next_entry;

		if (vma->vm_flags & VM_LOCKED)
			mlock_folio(folio);
		else
			munlock_folio(folio);

next_entry:
		/* The loop header adds the final pte/page of this step. */
		pte += step - 1;
		addr += (step - 1) << PAGE_SHIFT;
	}
	pte_unmap(start_pte);
out:
	spin_unlock(ptl);
	cond_resched();
	return 0;
}

/*
 * mlock_vma_pages_range() - mlock any pages already in the range,
 *                           or munlock all pages in the range.
 * @vma - vma containing range to be mlock()ed or munlock()ed
 * @start - start address in @vma of the range
 * @end - end of range in @vma
 * @new_vma_flags - the new set of flags for @vma.
 *
 * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
 * called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
 */
static void mlock_vma_pages_range(struct vm_area_struct *vma,
	unsigned long start, unsigned long end, vma_flags_t *new_vma_flags)
{
	static const struct mm_walk_ops mlock_walk_ops = {
		.pmd_entry = mlock_pte_range,
		.walk_lock = PGWALK_WRLOCK_VERIFY,
	};

	/*
	 * There is a slight chance that concurrent page migration,
	 * or page reclaim finding a page of this now-VM_LOCKED vma,
	 * will call mlock_vma_folio() and raise page's mlock_count:
	 * double counting, leaving the page unevictable indefinitely.
	 * Communicate this danger to mlock_vma_folio() with VM_IO,
	 * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
	 * mmap_lock is held in write mode here, so this weird
	 * combination should not be visible to other mmap_lock users;
	 * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED.
	 */
	if (vma_flags_test(new_vma_flags, VMA_LOCKED_BIT))
		vma_flags_set(new_vma_flags, VMA_IO_BIT);
	vma_start_write(vma);
	vma_flags_reset_once(vma, new_vma_flags);

	lru_add_drain();
	walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
	lru_add_drain();

	/* Drop the temporary VM_IO marker again once the walk is done. */
	if (vma_flags_test(new_vma_flags, VMA_IO_BIT)) {
		vma_flags_clear(new_vma_flags, VMA_IO_BIT);
		vma_flags_reset_once(vma, new_vma_flags);
	}
}

/*
 * mlock_fixup  - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 *
 * Returns 0, or the error from vma_modify_flags().  *@prev is set to the
 * vma value held on exit in every case, including an ERR_PTR on failure.
 */
static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
	       struct vm_area_struct **prev, unsigned long start,
	       unsigned long end, vm_flags_t newflags)
{
	vma_flags_t new_vma_flags = legacy_to_vma_flags(newflags);
	const vma_flags_t old_vma_flags = vma->flags;
	struct mm_struct *mm = vma->vm_mm;
	int nr_pages;
	int ret = 0;

	if (vma_flags_same_pair(&old_vma_flags, &new_vma_flags) ||
	    vma_is_secretmem(vma) || !vma_supports_mlock(vma)) {
		/*
		 * Don't set VM_LOCKED or VM_LOCKONFAULT and don't count.
		 * For secretmem, don't allow the memory to be unlocked.
		 */
		goto out;
	}

	vma = vma_modify_flags(vmi, *prev, vma, start, end, &new_vma_flags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!vma_flags_test(&new_vma_flags, VMA_LOCKED_BIT))
		nr_pages = -nr_pages;
	else if (vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT))
		nr_pages = 0;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, populate_vma_page_range will bring it back.
	 */
	if (vma_flags_test(&new_vma_flags, VMA_LOCKED_BIT) &&
	    vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT)) {
		/* No work to do, and mlocking twice would be wrong */
		vma_start_write(vma);
		vma->flags = new_vma_flags;
	} else {
		mlock_vma_pages_range(vma, start, end, &new_vma_flags);
	}
out:
	*prev = vma;
	return ret;
}

/*
 * Apply @flags (after clearing VM_LOCKED_MASK) to every vma covering
 * [@start, @start + @len) in current->mm, via mlock_fixup().
 *
 * Returns 0 on success or for an empty range, -EINVAL when @start + @len
 * wraps, -ENOMEM when the range is not fully covered by contiguous vmas,
 * or the first error returned by mlock_fixup().
 */
static int apply_vma_lock_flags(unsigned long start, size_t len,
				vm_flags_t flags)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, current->mm, start);

	VM_BUG_ON(offset_in_page(start));
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = vma_iter_load(&vmi);
	if (!vma)
		return -ENOMEM;

	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	tmp = vma->vm_start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		vm_flags_t newflags;

		/* A hole between the previous vma and this one. */
		if (vma->vm_start != tmp)
			return -ENOMEM;

		newflags = vma->vm_flags & ~VM_LOCKED_MASK;
		newflags |= flags;
		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		tmp = vma_iter_end(&vmi);
		nstart = tmp;
	}

	/* The vmas ran out before reaching the end of the range. */
	if (tmp < end)
		return -ENOMEM;

	return 0;
}

/*
 * Go through vma areas and sum size of mlocked
 * vma pages, as return value.
 * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT)
 * is also counted.
* Return value: previously mlocked page counts */ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, unsigned long start, size_t len) { struct vm_area_struct *vma; unsigned long count = 0; unsigned long end; VMA_ITERATOR(vmi, mm, start); /* Don't overflow past ULONG_MAX */ if (unlikely(ULONG_MAX - len < start)) end = ULONG_MAX; else end = start + len; for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) count -= (start - vma->vm_start); if (end < vma->vm_end) { count += end - vma->vm_start; break; } count += vma->vm_end - vma->vm_start; } } return count >> PAGE_SHIFT; } /* * convert get_user_pages() return value to posix mlock() error */ static int __mlock_posix_error_return(long retval) { if (retval == -EFAULT) retval = -ENOMEM; else if (retval == -ENOMEM) retval = -EAGAIN; return retval; } static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; unsigned long lock_limit; int error = -ENOMEM; start = untagged_addr(start); if (!can_do_mlock()) return -EPERM; len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = len >> PAGE_SHIFT; if (mmap_write_lock_killable(current->mm)) return -EINTR; locked += current->mm->locked_vm; if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { /* * It is possible that the regions requested intersect with * previously mlocked areas, that part area in "mm->locked_vm" * should not be counted to new mlock increment count. So check * and adjust locked count if necessary. 
*/ locked -= count_mm_mlocked_page_nr(current->mm, start, len); } /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = apply_vma_lock_flags(start, len, flags); mmap_write_unlock(current->mm); if (error) return error; error = __mm_populate(start, len, 0); if (error) return __mlock_posix_error_return(error); return 0; } SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { return do_mlock(start, len, VM_LOCKED); } SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) { vm_flags_t vm_flags = VM_LOCKED; if (flags & ~MLOCK_ONFAULT) return -EINVAL; if (flags & MLOCK_ONFAULT) vm_flags |= VM_LOCKONFAULT; return do_mlock(start, len, vm_flags); } SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; start = untagged_addr(start); len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_vma_lock_flags(start, len, 0); mmap_write_unlock(current->mm); return ret; } /* * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall) * and translate into the appropriate modifications to mm->def_flags and/or the * flags for all current VMAs. * * There are a couple of subtleties with this. If mlockall() is called multiple * times with different flags, the values do not necessarily stack. If mlockall * is called once including the MCL_FUTURE flag and then a second time without * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. 
*/
static int apply_mlockall_flags(int flags)
{
	VMA_ITERATOR(vmi, current->mm, 0);
	struct vm_area_struct *vma, *prev = NULL;
	vm_flags_t to_add = 0;

	/* Start from a clean slate: earlier mlockall() settings do not stack. */
	current->mm->def_flags &= ~VM_LOCKED_MASK;
	if (flags & MCL_FUTURE) {
		current->mm->def_flags |= VM_LOCKED;

		if (flags & MCL_ONFAULT)
			current->mm->def_flags |= VM_LOCKONFAULT;

		/* MCL_FUTURE alone leaves the existing vmas untouched. */
		if (!(flags & MCL_CURRENT))
			goto out;
	}

	if (flags & MCL_CURRENT) {
		to_add |= VM_LOCKED;
		if (flags & MCL_ONFAULT)
			to_add |= VM_LOCKONFAULT;
	}

	/* With flags == 0 (munlockall) to_add stays 0, clearing the lock bits. */
	for_each_vma(vmi, vma) {
		int error;
		vm_flags_t newflags;

		newflags = vma->vm_flags & ~VM_LOCKED_MASK;
		newflags |= to_add;

		error = mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
				    newflags);
		/* Ignore errors, but prev needs fixing up. */
		if (error)
			prev = vma;
		cond_resched();
	}
out:
	return 0;
}

/*
 * mlockall() system call.
 *
 * Returns -EINVAL for no flags, unknown flags or MCL_ONFAULT on its own,
 * -EPERM when can_do_mlock() refuses, -EINTR when taking the mmap write
 * lock is interrupted, and -ENOMEM when MCL_CURRENT is requested while
 * total_vm exceeds RLIMIT_MEMLOCK and the caller lacks CAP_IPC_LOCK.
 * Otherwise returns the result of apply_mlockall_flags().
 */
SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
	    flags == MCL_ONFAULT)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	mmap_write_unlock(current->mm);
	/* Populate only after the mmap write lock has been dropped. */
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}

/*
 * munlockall() system call: runs apply_mlockall_flags(0) under the mmap
 * write lock.  Returns -EINTR if taking the lock is interrupted.
 */
SYSCALL_DEFINE0(munlockall)
{
	int ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_mlockall_flags(0);
	mmap_write_unlock(current->mm);
	return ret;
}

/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user_struct instead.
*/
static DEFINE_SPINLOCK(shmlock_user_lock);

/*
 * Charge @size bytes, rounded up to whole pages, to @ucounts' MEMLOCK
 * counter and take a reference on @ucounts.
 *
 * Returns 1 when the charge was accepted.  Returns 0, with the charge
 * undone, when the counter saturated or exceeds RLIMIT_MEMLOCK and the
 * caller lacks CAP_IPC_LOCK, or when get_ucounts() fails.
 */
int user_shm_lock(size_t size, struct ucounts *ucounts)
{
	unsigned long limit, nr_pages;
	long charged;
	int granted = 0;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	limit = rlimit(RLIMIT_MEMLOCK);
	/* RLIM_INFINITY is kept as-is rather than converted to pages. */
	if (limit != RLIM_INFINITY)
		limit >>= PAGE_SHIFT;

	spin_lock(&shmlock_user_lock);
	charged = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);

	if ((charged == LONG_MAX || charged > limit) && !capable(CAP_IPC_LOCK))
		goto uncharge;
	if (!get_ucounts(ucounts))
		goto uncharge;

	granted = 1;
	goto unlock;

uncharge:
	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);
unlock:
	spin_unlock(&shmlock_user_lock);
	return granted;
}

/*
 * Undo a successful user_shm_lock(): remove the page-rounded @size from
 * @ucounts' MEMLOCK counter and drop the reference on @ucounts.
 */
void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
	unsigned long nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	spin_lock(&shmlock_user_lock);
	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);
	spin_unlock(&shmlock_user_lock);
	put_ucounts(ucounts);
}
2009 2011 1871 1874 670 667 669 669 11 162 161 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2022 Christian Brauner <brauner@kernel.org> */ #include <linux/cred.h> #include <linux/fs.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/user_namespace.h> #include <linux/seq_file.h> #include "internal.h" /* * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t, * never from raw values. These are just internal helpers. 
*/
#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val }
#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val }

/*
 * An idmapping attached to a mount: one map for uids, one for gids, and a
 * reference count.
 */
struct mnt_idmap {
	struct uid_gid_map uid_map;
	struct uid_gid_map gid_map;
	refcount_t count;
};

/*
 * Carries the initial idmapping of 0:0:4294967295 which is an identity
 * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is
 * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
 */
struct mnt_idmap nop_mnt_idmap = {
	.count	= REFCOUNT_INIT(1),
};
EXPORT_SYMBOL_GPL(nop_mnt_idmap);

/*
 * Carries the invalid idmapping of a full 0-4294967295 {g,u}id range.
 * This means that all {g,u}ids are mapped to INVALID_VFS{G,U}ID.
 */
struct mnt_idmap invalid_mnt_idmap = {
	.count	= REFCOUNT_INIT(1),
};
EXPORT_SYMBOL_GPL(invalid_mnt_idmap);

/**
 * initial_idmapping - check whether this is the initial mapping
 * @ns: idmapping to check
 *
 * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1,
 * [...], 1000 to 1000 [...].
 *
 * Return: true if this is the initial mapping, false if not.
 */
static inline bool initial_idmapping(const struct user_namespace *ns)
{
	return ns == &init_user_ns;
}

/**
 * make_vfsuid - map a filesystem kuid according to an idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @kuid : kuid to be mapped
 *
 * Take a @kuid and remap it from @fs_userns into @idmap. Use this
 * function when preparing a @kuid to be reported to userspace.
 *
 * If @idmap is @nop_mnt_idmap this is not an idmapped mount and @kuid is
 * returned unchanged. If @idmap is @invalid_mnt_idmap INVALID_VFSUID is
 * returned.
 * If initial_idmapping() tells us that the filesystem is not mounted with an
 * idmapping we know the value of @kuid won't change when calling
 * from_kuid() so we can simply retrieve the value via __kuid_val()
 * directly.
 *
 * Return: @kuid mapped according to @idmap.
 * If @kuid has no mapping in @fs_userns INVALID_VFSUID is returned.
*/ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kuid_t kuid) { uid_t uid; if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); if (idmap == &invalid_mnt_idmap) return INVALID_VFSUID; if (initial_idmapping(fs_userns)) uid = __kuid_val(kuid); else uid = from_kuid(fs_userns, kuid); if (uid == (uid_t)-1) return INVALID_VFSUID; return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid)); } EXPORT_SYMBOL_GPL(make_vfsuid); /** * make_vfsgid - map a filesystem kgid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kgid : kgid to be mapped * * Take a @kgid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kgid to be reported to userspace. * * If initial_idmapping() determines that this is not an idmapped mount * we can simply return @kgid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kgid won't change when calling * from_kgid() so we can simply retrieve the value via __kgid_val() * directly. * * Return: @kgid mapped according to @idmap. * If @kgid has no mapping in either @idmap or @fs_userns INVALID_GID is * returned. */ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid) { gid_t gid; if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); if (idmap == &invalid_mnt_idmap) return INVALID_VFSGID; if (initial_idmapping(fs_userns)) gid = __kgid_val(kgid); else gid = from_kgid(fs_userns, kgid); if (gid == (gid_t)-1) return INVALID_VFSGID; return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid)); } EXPORT_SYMBOL_GPL(make_vfsgid); /** * from_vfsuid - map a vfsuid into the filesystem idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsuid : vfsuid to be mapped * * Map @vfsuid into the filesystem idmapping. This function has to be used in * order to e.g. write @vfsuid to inode->i_uid. 
*
 * Return: @vfsuid mapped into the filesystem idmapping
 */
kuid_t from_vfsuid(struct mnt_idmap *idmap,
		   struct user_namespace *fs_userns, vfsuid_t vfsuid)
{
	uid_t uid;

	if (idmap == &nop_mnt_idmap)
		return AS_KUIDT(vfsuid);
	if (idmap == &invalid_mnt_idmap)
		return INVALID_UID;
	/* Undo the mount's mapping first, then enter the filesystem's. */
	uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid));
	if (uid == (uid_t)-1)
		return INVALID_UID;
	if (initial_idmapping(fs_userns))
		return KUIDT_INIT(uid);
	return make_kuid(fs_userns, uid);
}
EXPORT_SYMBOL_GPL(from_vfsuid);

/**
 * from_vfsgid - map a vfsgid into the filesystem idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @vfsgid : vfsgid to be mapped
 *
 * Map @vfsgid into the filesystem idmapping. This function has to be used in
 * order to e.g. write @vfsgid to inode->i_gid.
 *
 * Return: @vfsgid mapped into the filesystem idmapping
 */
kgid_t from_vfsgid(struct mnt_idmap *idmap,
		   struct user_namespace *fs_userns, vfsgid_t vfsgid)
{
	gid_t gid;

	if (idmap == &nop_mnt_idmap)
		return AS_KGIDT(vfsgid);
	if (idmap == &invalid_mnt_idmap)
		return INVALID_GID;
	/* Undo the mount's mapping first, then enter the filesystem's. */
	gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid));
	if (gid == (gid_t)-1)
		return INVALID_GID;
	if (initial_idmapping(fs_userns))
		return KGIDT_INIT(gid);
	return make_kgid(fs_userns, gid);
}
EXPORT_SYMBOL_GPL(from_vfsgid);

#ifdef CONFIG_MULTIUSER
/**
 * vfsgid_in_group_p() - check whether a vfsgid matches the caller's groups
 * @vfsgid: the mnt gid to match
 *
 * This function can be used to determine whether @vfsgid matches any of the
 * caller's groups.
 *
 * Return: 1 if vfsgid matches caller's groups, 0 if not.
 */
int vfsgid_in_group_p(vfsgid_t vfsgid)
{
	return in_group_p(AS_KGIDT(vfsgid));
}
#else
/* Without CONFIG_MULTIUSER every vfsgid is reported as matching. */
int vfsgid_in_group_p(vfsgid_t vfsgid)
{
	return 1;
}
#endif
EXPORT_SYMBOL_GPL(vfsgid_in_group_p);

/*
 * Copy the idmapping in @map_from into @map_to.
 *
 * Returns 0 on success, -EINVAL if @map_from has no extents, or -ENOMEM if
 * duplicating the forward/reverse extent arrays fails.  On failure @map_to
 * is left unmodified.
 */
static int copy_mnt_idmap(struct uid_gid_map *map_from,
			  struct uid_gid_map *map_to)
{
	struct uid_gid_extent *forward, *reverse;
	u32 nr_extents = READ_ONCE(map_from->nr_extents);
	/* Pairs with smp_wmb() when writing the idmapping. */
	smp_rmb();

	/*
	 * Don't blindly copy @map_from into @map_to if nr_extents is
	 * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we
	 * read @nr_extents someone could have written an idmapping and
	 * then we might end up with inconsistent data. So just don't do
	 * anything at all.
	 */
	if (nr_extents == 0)
		return -EINVAL;

	/*
	 * Here we know that nr_extents is greater than zero which means
	 * a map has been written. Since idmappings can't be changed
	 * once they have been written we know that we can safely copy
	 * from @map_from into @map_to.
	 */
	if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
		*map_to = *map_from;
		return 0;
	}

	/* More extents than the base limit: duplicate both extent arrays. */
	forward = kmemdup_array(map_from->forward, nr_extents,
				sizeof(struct uid_gid_extent),
				GFP_KERNEL_ACCOUNT);
	if (!forward)
		return -ENOMEM;

	reverse = kmemdup_array(map_from->reverse, nr_extents,
				sizeof(struct uid_gid_extent),
				GFP_KERNEL_ACCOUNT);
	if (!reverse) {
		kfree(forward);
		return -ENOMEM;
	}

	/*
	 * The idmapping isn't exposed anywhere so we don't need to care
	 * about ordering between extent pointers and @nr_extents
	 * initialization.
	 */
	map_to->forward = forward;
	map_to->reverse = reverse;
	map_to->nr_extents = nr_extents;
	return 0;
}

/*
 * Free @idmap.  The forward/reverse arrays of a map are freed only when its
 * nr_extents exceeds UID_GID_MAP_MAX_BASE_EXTENTS, matching the case in
 * which copy_mnt_idmap() allocates them.
 */
static void free_mnt_idmap(struct mnt_idmap *idmap)
{
	if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
		kfree(idmap->uid_map.forward);
		kfree(idmap->uid_map.reverse);
	}
	if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
		kfree(idmap->gid_map.forward);
		kfree(idmap->gid_map.reverse);
	}
	kfree(idmap);
}

/*
 * Allocate a new mnt_idmap with a reference count of 1 and fill it with
 * copies of the uid and gid maps of @mnt_userns.
 *
 * Returns the new idmap, or an ERR_PTR() carrying -ENOMEM or the error from
 * copy_mnt_idmap().
 */
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
{
	struct mnt_idmap *idmap;
	int ret;

	idmap = kzalloc_obj(struct mnt_idmap, GFP_KERNEL_ACCOUNT);
	if (!idmap)
		return ERR_PTR(-ENOMEM);

	refcount_set(&idmap->count, 1);
	ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map);
	if (!ret)
		ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map);
	if (ret) {
		free_mnt_idmap(idmap);
		idmap = ERR_PTR(ret);
	}
	return idmap;
}

/**
 * mnt_idmap_get - get a reference to an idmapping
 * @idmap: the idmap to bump the reference on
 *
 * If @idmap is neither @nop_mnt_idmap nor @invalid_mnt_idmap bump the
 * reference count.
 *
 * Return: @idmap, with its reference count bumped unless it is
 * @nop_mnt_idmap or @invalid_mnt_idmap.
 */
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
{
	if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap)
		refcount_inc(&idmap->count);

	return idmap;
}
EXPORT_SYMBOL_GPL(mnt_idmap_get);

/**
 * mnt_idmap_put - put a reference to an idmapping
 * @idmap: the idmap to put the reference on
 *
 * If this is a non-initial idmapping, put the reference count when a mount is
 * released and free it if we're the last user.
 */
void mnt_idmap_put(struct mnt_idmap *idmap)
{
	if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap &&
	    refcount_dec_and_test(&idmap->count))
		free_mnt_idmap(idmap);
}
EXPORT_SYMBOL_GPL(mnt_idmap_put);

/*
 * Write the uid map (@uid_map true) or gid map (@uid_map false) of @idmap
 * into @seq, one "first lower count" triple per extent, skipping extents
 * that cannot be resolved in the caller's user namespace.
 *
 * Returns the number of mappings written, 0 if is_valid_mnt_idmap() rejects
 * @idmap, or -EAGAIN if @seq overflowed.
 */
int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map)
{
	struct uid_gid_map *map, *map_up;
	u32 idx, nr_mappings;

	if (!is_valid_mnt_idmap(idmap))
		return 0;

	/*
	 * Idmappings are shown relative to the caller's idmapping.
	 * This is both the most intuitive and most useful solution.
	 */
	if (uid_map) {
		map = &idmap->uid_map;
		map_up = &current_user_ns()->uid_map;
	} else {
		map = &idmap->gid_map;
		map_up = &current_user_ns()->gid_map;
	}

	for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) {
		uid_t lower;
		struct uid_gid_extent *extent;

		/* Small maps use the inline extent array, large ones ->forward. */
		if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
			extent = &map->extent[idx];
		else
			extent = &map->forward[idx];

		/*
		 * Verify that the whole range of the mapping can be
		 * resolved in the caller's idmapping. If it cannot be
		 * resolved skip the mapping.
		 */
		lower = map_id_range_up(map_up, extent->lower_first, extent->count);
		if (lower == (uid_t) -1)
			continue;

		seq_printf(seq, "%u %u %u", extent->first, lower, extent->count);
		if (seq_has_overflowed(seq))
			return -EAGAIN;

		seq->count++; /* mappings are separated by \0 */
		if (seq_has_overflowed(seq))
			return -EAGAIN;

		nr_mappings++;
	}

	return nr_mappings;
}
10155 1496 10167 173 177 204 68 205 205 189 173 189 187 189 192 190 4 4 170 170 129 170 170 129 129 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 // SPDX-License-Identifier: GPL-2.0 /* * Fast batching percpu counters. 
*/
#include <linux/percpu_counter.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/debugobjects.h>

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Every initialised counter is linked here so that the CPU-dead callback
 * can walk them; the list is protected by percpu_counters_lock.
 */
static LIST_HEAD(percpu_counters);
static DEFINE_SPINLOCK(percpu_counters_lock);
#endif

#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER

static const struct debug_obj_descr percpu_counter_debug_descr;

/*
 * fixup_free callback of the debug-object descriptor: a counter that is
 * still in the active state is destroyed and its debug object released.
 * Returns true when a fixup was performed, false otherwise.
 */
static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
	struct percpu_counter *fbc = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		percpu_counter_destroy(fbc);
		debug_object_free(fbc, &percpu_counter_debug_descr);
		return true;
	default:
		return false;
	}
}

static const struct debug_obj_descr percpu_counter_debug_descr = {
	.name		= "percpu_counter",
	.fixup_free	= percpu_counter_fixup_free,
};

/* Register @fbc with debugobjects and mark it active. */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{
	debug_object_init(fbc, &percpu_counter_debug_descr);
	debug_object_activate(fbc, &percpu_counter_debug_descr);
}

/* Mark @fbc inactive and drop its debug object. */
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{
	debug_object_deactivate(fbc, &percpu_counter_debug_descr);
	debug_object_free(fbc, &percpu_counter_debug_descr);
}

#else	/* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
/* No-op stubs when debug-object tracking of counters is disabled. */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{ }
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{ }
#endif	/* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */

/*
 * Set the counter to @amount: every possible CPU's local slot is zeroed and
 * the central count is overwritten, all under fbc->lock with interrupts
 * disabled.
 */
void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
	int cpu;
	unsigned long flags;

	raw_spin_lock_irqsave(&fbc->lock, flags);
	for_each_possible_cpu(cpu) {
		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
		*pcount = 0;
	}
	fbc->count = amount;
	raw_spin_unlock_irqrestore(&fbc->lock, flags);
}
EXPORT_SYMBOL(percpu_counter_set);

/*
 * Add to a counter while respecting batch size.
 *
 * There are 2 implementations, both dealing with the following problem:
 *
 * The decision slow path/fast path and the actual update must be atomic.
 * Otherwise a call in process context could check the current values and
 * decide that the fast path can be used. If now an interrupt occurs before
 * the this_cpu_add(), and the interrupt updates this_cpu(*fbc->counters),
 * then the this_cpu_add() that is executed after the interrupt has completed
 * can produce values larger than "batch" or even overflows.
 */
#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * Safety against interrupts is achieved in 2 ways:
 * 1. the fast path uses local cmpxchg (note: no lock prefix)
 * 2. the slow path operates with interrupts disabled
 */
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
	s64 count;
	unsigned long flags;

	count = this_cpu_read(*fbc->counters);
	do {
		/* Slow path: fold the local slot plus @amount into fbc->count. */
		if (unlikely(abs(count + amount) >= batch)) {
			raw_spin_lock_irqsave(&fbc->lock, flags);
			/*
			 * Note: by now we might have migrated to another CPU
			 * or the value might have changed.
			 */
			count = __this_cpu_read(*fbc->counters);
			fbc->count += count + amount;
			__this_cpu_sub(*fbc->counters, count);
			raw_spin_unlock_irqrestore(&fbc->lock, flags);
			return;
		}
		/* Fast path: retry until the local slot is updated unchanged. */
	} while (!this_cpu_try_cmpxchg(*fbc->counters, &count, count + amount));
}
#else
/*
 * local_irq_save() is used to make the function irq safe:
 * - The slow path would be ok as protected by an irq-safe spinlock.
 * - this_cpu_add would be ok as it is irq-safe by definition.
 */
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
	s64 count;
	unsigned long flags;

	local_irq_save(flags);
	count = __this_cpu_read(*fbc->counters) + amount;
	if (abs(count) >= batch) {
		raw_spin_lock(&fbc->lock);
		fbc->count += count;
		/* count - amount is the old local value: reset the slot to 0. */
		__this_cpu_sub(*fbc->counters, count - amount);
		raw_spin_unlock(&fbc->lock);
	} else {
		this_cpu_add(*fbc->counters, amount);
	}
	local_irq_restore(flags);
}
#endif
EXPORT_SYMBOL(percpu_counter_add_batch);

/*
 * For percpu_counter with a big batch, the deviation of its count could
 * be big, and there is requirement to reduce the deviation, like when the
 * counter's batch could be runtime decreased to get a better accuracy,
 * which can be achieved by running this sync function on each CPU.
 */
void percpu_counter_sync(struct percpu_counter *fbc)
{
	unsigned long flags;
	s64 count;

	raw_spin_lock_irqsave(&fbc->lock, flags);
	/* Move this CPU's local delta into the central count. */
	count = __this_cpu_read(*fbc->counters);
	fbc->count += count;
	__this_cpu_sub(*fbc->counters, count);
	raw_spin_unlock_irqrestore(&fbc->lock, flags);
}
EXPORT_SYMBOL(percpu_counter_sync);

/*
 * Add up all the per-cpu counts, return the result.  This is a more accurate
 * but much slower version of percpu_counter_read_positive().
 *
 * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums
 * from CPUs that are in the process of being taken offline. Dying cpus have
 * been removed from the online mask, but may not have had the hotplug dead
 * notifier called to fold the percpu count back into the global counter sum.
 * By including dying CPUs in the iteration mask, we avoid this race condition
 * so __percpu_counter_sum() just does the right thing when CPUs are being taken
 * offline.
 */
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
	s64 ret;
	int cpu;
	unsigned long flags;

	raw_spin_lock_irqsave(&fbc->lock, flags);
	ret = fbc->count;
	for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
		ret += *pcount;
	}
	raw_spin_unlock_irqrestore(&fbc->lock, flags);
	return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);

/*
 * Initialise @nr_counters counters starting at @fbc, each with the value
 * @amount. A single percpu allocation backs all of them; counter i uses the
 * slot at offset i * counter_size. @key is set as the lockdep class of every
 * counter lock.
 *
 * Returns 0 on success. On allocation failure fbc[0].counters is set to
 * NULL and -ENOMEM is returned.
 */
int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
			       gfp_t gfp, u32 nr_counters,
			       struct lock_class_key *key)
{
	unsigned long flags __maybe_unused;
	size_t counter_size;
	s32 __percpu *counters;
	u32 i;

	counter_size = ALIGN(sizeof(*counters), __alignof__(*counters));
	counters = __alloc_percpu_gfp(nr_counters * counter_size,
				      __alignof__(*counters), gfp);
	if (!counters) {
		fbc[0].counters = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_counters; i++) {
		raw_spin_lock_init(&fbc[i].lock);
		lockdep_set_class(&fbc[i].lock, key);
#ifdef CONFIG_HOTPLUG_CPU
		INIT_LIST_HEAD(&fbc[i].list);
#endif
		fbc[i].count = amount;
		fbc[i].counters = (void __percpu *)counters + i * counter_size;

		debug_percpu_counter_activate(&fbc[i]);
	}

#ifdef CONFIG_HOTPLUG_CPU
	/* Make the counters visible to the CPU-dead callback. */
	spin_lock_irqsave(&percpu_counters_lock, flags);
	for (i = 0; i < nr_counters; i++)
		list_add(&fbc[i].list, &percpu_counters);
	spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif
	return 0;
}
EXPORT_SYMBOL(__percpu_counter_init_many);

/*
 * Tear down @nr_counters counters starting at @fbc. Nothing is done when
 * @fbc is NULL (a warning is emitted once) or when fbc[0].counters is NULL.
 * The percpu memory is released through fbc[0].counters and every counter's
 * counters pointer is cleared afterwards.
 */
void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters)
{
	unsigned long flags __maybe_unused;
	u32 i;

	if (WARN_ON_ONCE(!fbc))
		return;

	if (!fbc[0].counters)
		return;

	for (i = 0; i < nr_counters; i++)
		debug_percpu_counter_deactivate(&fbc[i]);

#ifdef CONFIG_HOTPLUG_CPU
	spin_lock_irqsave(&percpu_counters_lock, flags);
	for (i = 0; i < nr_counters; i++)
		list_del(&fbc[i].list);
	spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif

	free_percpu(fbc[0].counters);

	for (i = 0; i < nr_counters; i++)
		fbc[i].counters = NULL;
}
EXPORT_SYMBOL(percpu_counter_destroy_many);

/* Global batch size; recomputed by compute_batch_value(). */
int percpu_counter_batch __read_mostly = 32;
EXPORT_SYMBOL(percpu_counter_batch);

/*
 * Recompute percpu_counter_batch as max(32, 2 * number of online CPUs).
 * @cpu is not used. Always returns 0.
 */
static int compute_batch_value(unsigned int cpu)
{
	int nr = num_online_cpus();

	percpu_counter_batch = max(32, nr*2);
	return 0;
}

/*
 * CPU-dead callback: with CONFIG_HOTPLUG_CPU, refresh the batch value and
 * fold the local slot of @cpu into the central count of every registered
 * counter. Always returns 0.
 */
static int percpu_counter_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	struct percpu_counter *fbc;

	compute_batch_value(cpu);

	spin_lock_irq(&percpu_counters_lock);
	list_for_each_entry(fbc, &percpu_counters, list) {
		s32 *pcount;

		raw_spin_lock(&fbc->lock);
		pcount = per_cpu_ptr(fbc->counters, cpu);
		fbc->count += *pcount;
		*pcount = 0;
		raw_spin_unlock(&fbc->lock);
	}
	spin_unlock_irq(&percpu_counters_lock);
#endif
	return 0;
}

/*
 * Compare counter against given value.
 * Return 1 if greater, 0 if equal and -1 if less
 */
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
	s64	count;

	count = percpu_counter_read(fbc);
	/* Check to see if rough count will be sufficient for comparison */
	if (abs(count - rhs) > (batch * num_online_cpus())) {
		if (count > rhs)
			return 1;
		else
			return -1;
	}
	/* Need to use precise count */
	count = percpu_counter_sum(fbc);
	if (count > rhs)
		return 1;
	else if (count < rhs)
		return -1;
	else
		return 0;
}
EXPORT_SYMBOL(__percpu_counter_compare);

/*
 * Compare counter, and add amount if total is: less than or equal to limit if
 * amount is positive, or greater than or equal to limit if amount is negative.
 * Return true if amount is added, or false if total would be beyond the limit.
 *
 * Negative limit is allowed, but unusual.
 * When negative amounts (subs) are given to percpu_counter_limited_add(),
 * the limit would most naturally be 0 - but other limits are also allowed.
 *
 * Overflow beyond S64_MAX is not allowed for: counter, limit and amount
 * are all assumed to be sane (far from S64_MIN and S64_MAX).
 */
bool __percpu_counter_limited_add(struct percpu_counter *fbc,
				  s64 limit, s64 amount, s32 batch)
{
	s64 count;
	s64 unknown;
	unsigned long flags;
	bool good = false;

	if (amount == 0)
		return true;

	local_irq_save(flags);
	/* Bound used for what the per-cpu slots may hold in total. */
	unknown = batch * num_online_cpus();
	count = __this_cpu_read(*fbc->counters);

	/* Skip taking the lock when safe */
	if (abs(count + amount) <= batch &&
	    ((amount > 0 && fbc->count + unknown <= limit) ||
	     (amount < 0 && fbc->count - unknown >= limit))) {
		this_cpu_add(*fbc->counters, amount);
		local_irq_restore(flags);
		return true;
	}

	raw_spin_lock(&fbc->lock);
	count = fbc->count + amount;

	/* Skip percpu_counter_sum() when safe */
	if (amount > 0) {
		if (count - unknown > limit)
			goto out;
		if (count + unknown <= limit)
			good = true;
	} else {
		if (count + unknown < limit)
			goto out;
		if (count - unknown >= limit)
			good = true;
	}

	if (!good) {
		s32 *pcount;
		int cpu;

		/* Rough bounds were inconclusive: sum the per-cpu slots. */
		for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
			pcount = per_cpu_ptr(fbc->counters, cpu);
			count += *pcount;
		}
		if (amount > 0) {
			if (count > limit)
				goto out;
		} else {
			if (count < limit)
				goto out;
		}
		good = true;
	}

	/* Commit: fold this CPU's slot plus @amount into the central count. */
	count = __this_cpu_read(*fbc->counters);
	fbc->count += count + amount;
	__this_cpu_sub(*fbc->counters, count);
out:
	raw_spin_unlock(&fbc->lock);
	local_irq_restore(flags);
	return good;
}

/*
 * Register the CPU hotplug callbacks: compute_batch_value() as the online
 * callback and percpu_counter_cpu_dead() as the dead-state teardown.
 * A registration failure only triggers a warning; 0 is always returned.
 */
static int __init percpu_counter_startup(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online",
				compute_batch_value, NULL);
	WARN_ON(ret < 0);
	ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD,
					"lib/percpu_cnt:dead", NULL,
					percpu_counter_cpu_dead);
	WARN_ON(ret < 0);
	return 0;
}
module_init(percpu_counter_startup);
232 248 8 181 683 181 250 249 2 250 2 2 243 244 244 4 4 4 4 4 4 4 4 4 4 4 4 4 542 541 542 540 542 34 30 4 4 4 2 2 2 2 2 2 2 2 5 5 10 4 7 580 22 559 5 120 27 93 3 2 2 2 2 5 5 5 6 3 2 2 1 2 15 5 1 10 7 10 7 3 3 3 14 4 4 6 1 2 2 3 11 4 2 1 4 2 3 2 13 6 2 1 6 1 2 5 11 2 10 10 2 10 10 59 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 
457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 
957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright 2007 Hewlett-Packard Development Company, L.P. * * This file is part of the SCTP kernel implementation * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Vlad Yasevich <vladislav.yasevich@hp.com> */ #include <crypto/sha1.h> #include <crypto/sha2.h> #include <linux/slab.h> #include <linux/types.h> #include <net/sctp/sctp.h> #include <net/sctp/auth.h> static const struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { { /* id 0 is reserved. as all 0 */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0, }, { .hmac_id = SCTP_AUTH_HMAC_ID_SHA1, .hmac_len = SHA1_DIGEST_SIZE, }, { /* id 2 is reserved as well */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2, }, { .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, .hmac_len = SHA256_DIGEST_SIZE, } }; static bool sctp_hmac_supported(__u16 hmac_id) { return hmac_id < ARRAY_SIZE(sctp_hmac_list) && sctp_hmac_list[hmac_id].hmac_len != 0; } void sctp_auth_key_put(struct sctp_auth_bytes *key) { if (!key) return; if (refcount_dec_and_test(&key->refcnt)) { kfree_sensitive(key); SCTP_DBG_OBJCNT_DEC(keys); } } /* Create a new key structure of a given length */ static struct sctp_auth_bytes *sctp_auth_create_key(__u32 key_len, gfp_t gfp) { struct sctp_auth_bytes *key; /* Verify that we are not going to overflow INT_MAX */ if (key_len > (INT_MAX - sizeof(struct sctp_auth_bytes))) return NULL; /* Allocate the shared key */ key = kmalloc(sizeof(struct sctp_auth_bytes) + key_len, gfp); if (!key) return NULL; key->len = key_len; refcount_set(&key->refcnt, 1); SCTP_DBG_OBJCNT_INC(keys); return key; } /* Create a new shared key container with a give key id */ struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp) { struct sctp_shared_key *new; /* Allocate the 
shared key container */ new = kzalloc_obj(struct sctp_shared_key, gfp); if (!new) return NULL; INIT_LIST_HEAD(&new->key_list); refcount_set(&new->refcnt, 1); new->key_id = key_id; return new; } /* Free the shared key structure */ static void sctp_auth_shkey_destroy(struct sctp_shared_key *sh_key) { BUG_ON(!list_empty(&sh_key->key_list)); sctp_auth_key_put(sh_key->key); sh_key->key = NULL; kfree(sh_key); } void sctp_auth_shkey_release(struct sctp_shared_key *sh_key) { if (refcount_dec_and_test(&sh_key->refcnt)) sctp_auth_shkey_destroy(sh_key); } void sctp_auth_shkey_hold(struct sctp_shared_key *sh_key) { refcount_inc(&sh_key->refcnt); } /* Destroy the entire key list. This is done during the * associon and endpoint free process. */ void sctp_auth_destroy_keys(struct list_head *keys) { struct sctp_shared_key *ep_key; struct sctp_shared_key *tmp; if (list_empty(keys)) return; key_for_each_safe(ep_key, tmp, keys) { list_del_init(&ep_key->key_list); sctp_auth_shkey_release(ep_key); } } /* Compare two byte vectors as numbers. Return values * are: * 0 - vectors are equal * < 0 - vector 1 is smaller than vector2 * > 0 - vector 1 is greater than vector2 * * Algorithm is: * This is performed by selecting the numerically smaller key vector... * If the key vectors are equal as numbers but differ in length ... * the shorter vector is considered smaller * * Examples (with small values): * 000123456789 > 123456789 (first number is longer) * 000123456789 < 234567891 (second number is larger numerically) * 123456789 > 2345678 (first number is both larger & longer) */ static int sctp_auth_compare_vectors(struct sctp_auth_bytes *vector1, struct sctp_auth_bytes *vector2) { int diff; int i; const __u8 *longer; diff = vector1->len - vector2->len; if (diff) { longer = (diff > 0) ? vector1->data : vector2->data; /* Check to see if the longer number is * lead-zero padded. If it is not, it * is automatically larger numerically. 
*/ for (i = 0; i < abs(diff); i++) { if (longer[i] != 0) return diff; } } /* lengths are the same, compare numbers */ return memcmp(vector1->data, vector2->data, vector1->len); } /* * Create a key vector as described in SCTP-AUTH, Section 6.1 * The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO * parameter sent by each endpoint are concatenated as byte vectors. * These parameters include the parameter type, parameter length, and * the parameter value, but padding is omitted; all padding MUST be * removed from this concatenation before proceeding with further * computation of keys. Parameters which were not sent are simply * omitted from the concatenation process. The resulting two vectors * are called the two key vectors. */ static struct sctp_auth_bytes *sctp_auth_make_key_vector( struct sctp_random_param *random, struct sctp_chunks_param *chunks, struct sctp_hmac_algo_param *hmacs, gfp_t gfp) { struct sctp_auth_bytes *new; __u32 len; __u32 offset = 0; __u16 random_len, hmacs_len, chunks_len = 0; random_len = ntohs(random->param_hdr.length); hmacs_len = ntohs(hmacs->param_hdr.length); if (chunks) chunks_len = ntohs(chunks->param_hdr.length); len = random_len + hmacs_len + chunks_len; new = sctp_auth_create_key(len, gfp); if (!new) return NULL; memcpy(new->data, random, random_len); offset += random_len; if (chunks) { memcpy(new->data + offset, chunks, chunks_len); offset += chunks_len; } memcpy(new->data + offset, hmacs, hmacs_len); return new; } /* Make a key vector based on our local parameters */ static struct sctp_auth_bytes *sctp_auth_make_local_vector( const struct sctp_association *asoc, gfp_t gfp) { return sctp_auth_make_key_vector( (struct sctp_random_param *)asoc->c.auth_random, (struct sctp_chunks_param *)asoc->c.auth_chunks, (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs, gfp); } /* Make a key vector based on peer's parameters */ static struct sctp_auth_bytes *sctp_auth_make_peer_vector( const struct sctp_association *asoc, gfp_t gfp) { 
return sctp_auth_make_key_vector(asoc->peer.peer_random, asoc->peer.peer_chunks, asoc->peer.peer_hmacs, gfp); } /* Set the value of the association shared key base on the parameters * given. The algorithm is: * From the endpoint pair shared keys and the key vectors the * association shared keys are computed. This is performed by selecting * the numerically smaller key vector and concatenating it to the * endpoint pair shared key, and then concatenating the numerically * larger key vector to that. The result of the concatenation is the * association shared key. */ static struct sctp_auth_bytes *sctp_auth_asoc_set_secret( struct sctp_shared_key *ep_key, struct sctp_auth_bytes *first_vector, struct sctp_auth_bytes *last_vector, gfp_t gfp) { struct sctp_auth_bytes *secret; __u32 offset = 0; __u32 auth_len; auth_len = first_vector->len + last_vector->len; if (ep_key->key) auth_len += ep_key->key->len; secret = sctp_auth_create_key(auth_len, gfp); if (!secret) return NULL; if (ep_key->key) { memcpy(secret->data, ep_key->key->data, ep_key->key->len); offset += ep_key->key->len; } memcpy(secret->data + offset, first_vector->data, first_vector->len); offset += first_vector->len; memcpy(secret->data + offset, last_vector->data, last_vector->len); return secret; } /* Create an association shared key. Follow the algorithm * described in SCTP-AUTH, Section 6.1 */ static struct sctp_auth_bytes *sctp_auth_asoc_create_secret( const struct sctp_association *asoc, struct sctp_shared_key *ep_key, gfp_t gfp) { struct sctp_auth_bytes *local_key_vector; struct sctp_auth_bytes *peer_key_vector; struct sctp_auth_bytes *first_vector, *last_vector; struct sctp_auth_bytes *secret = NULL; int cmp; /* Now we need to build the key vectors * SCTP-AUTH , Section 6.1 * The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO * parameter sent by each endpoint are concatenated as byte vectors. 
* These parameters include the parameter type, parameter length, and * the parameter value, but padding is omitted; all padding MUST be * removed from this concatenation before proceeding with further * computation of keys. Parameters which were not sent are simply * omitted from the concatenation process. The resulting two vectors * are called the two key vectors. */ local_key_vector = sctp_auth_make_local_vector(asoc, gfp); peer_key_vector = sctp_auth_make_peer_vector(asoc, gfp); if (!peer_key_vector || !local_key_vector) goto out; /* Figure out the order in which the key_vectors will be * added to the endpoint shared key. * SCTP-AUTH, Section 6.1: * This is performed by selecting the numerically smaller key * vector and concatenating it to the endpoint pair shared * key, and then concatenating the numerically larger key * vector to that. If the key vectors are equal as numbers * but differ in length, then the concatenation order is the * endpoint shared key, followed by the shorter key vector, * followed by the longer key vector. Otherwise, the key * vectors are identical, and may be concatenated to the * endpoint pair key in any order. */ cmp = sctp_auth_compare_vectors(local_key_vector, peer_key_vector); if (cmp < 0) { first_vector = local_key_vector; last_vector = peer_key_vector; } else { first_vector = peer_key_vector; last_vector = local_key_vector; } secret = sctp_auth_asoc_set_secret(ep_key, first_vector, last_vector, gfp); out: sctp_auth_key_put(local_key_vector); sctp_auth_key_put(peer_key_vector); return secret; } /* * Populate the association overlay list with the list * from the endpoint. 
*/ int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep, struct sctp_association *asoc, gfp_t gfp) { struct sctp_shared_key *sh_key; struct sctp_shared_key *new; BUG_ON(!list_empty(&asoc->endpoint_shared_keys)); key_for_each(sh_key, &ep->endpoint_shared_keys) { new = sctp_auth_shkey_create(sh_key->key_id, gfp); if (!new) goto nomem; new->key = sh_key->key; sctp_auth_key_hold(new->key); list_add(&new->key_list, &asoc->endpoint_shared_keys); } return 0; nomem: sctp_auth_destroy_keys(&asoc->endpoint_shared_keys); return -ENOMEM; } /* Public interface to create the association shared key. * See code above for the algorithm. */ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp) { struct sctp_auth_bytes *secret; struct sctp_shared_key *ep_key; struct sctp_chunk *chunk; /* If we don't support AUTH, or peer is not capable * we don't need to do anything. */ if (!asoc->peer.auth_capable) return 0; /* If the key_id is non-zero and we couldn't find an * endpoint pair shared key, we can't compute the * secret. * For key_id 0, endpoint pair shared key is a NULL key. 
*/
	ep_key = sctp_auth_get_shkey(asoc, asoc->active_key_id);
	BUG_ON(!ep_key);

	secret = sctp_auth_asoc_create_secret(asoc, ep_key, gfp);
	if (!secret)
		return -ENOMEM;

	sctp_auth_key_put(asoc->asoc_shared_key);
	asoc->asoc_shared_key = secret;
	asoc->shkey = ep_key;

	/* Update send queue in case any chunk already in there now
	 * needs authenticating
	 */
	list_for_each_entry(chunk, &asoc->outqueue.out_chunk_list, list) {
		if (sctp_auth_send_cid(chunk->chunk_hdr->type, asoc)) {
			chunk->auth = 1;
			if (!chunk->shkey) {
				chunk->shkey = asoc->shkey;
				sctp_auth_shkey_hold(chunk->shkey);
			}
		}
	}

	return 0;
}

/* Find the endpoint pair shared key based on the key_id.
 *
 * Walks asoc->endpoint_shared_keys looking for an entry whose key_id
 * matches.  The entry is returned only when it is not marked deactivated;
 * NULL is returned when nothing matches or when the match is deactivated.
 */
struct sctp_shared_key *sctp_auth_get_shkey(
				const struct sctp_association *asoc,
				__u16 key_id)
{
	struct sctp_shared_key *key;

	/* First search associations set of endpoint pair shared keys */
	key_for_each(key, &asoc->endpoint_shared_keys) {
		if (key->key_id == key_id) {
			/* The search stops at the first id match, usable or not */
			if (!key->deactivated)
				return key;
			break;
		}
	}

	return NULL;
}

/* Return the sctp_hmac_list entry selected by hmac_id.
 *
 * The id is used directly as an array index; no range check is done here,
 * so the caller is expected to pass an id that indexes sctp_hmac_list.
 */
const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id)
{
	return &sctp_hmac_list[hmac_id];
}

/* Get an hmac description information that we can use to build
 * the AUTH chunk.
 *
 * Returns the entry for asoc->default_hmac_id when that is non-zero.
 * Otherwise returns the entry for the first id in the peer's HMAC list
 * that sctp_hmac_supported() accepts, or NULL when the peer list is
 * absent or contains no supported id.
 */
const struct sctp_hmac *
sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
{
	struct sctp_hmac_algo_param *hmacs;
	__u16 n_elt;
	__u16 id = 0;
	int i;

	/* If we have a default entry, use it */
	if (asoc->default_hmac_id)
		return &sctp_hmac_list[asoc->default_hmac_id];

	/* Since we do not have a default entry, find the first entry
	 * we support and return that.  Do not cache that id.
	 */
	hmacs = asoc->peer.peer_hmacs;
	if (!hmacs)
		return NULL;

	/* Payload bytes after the parameter header, halved: one id per 16 bits */
	n_elt = (ntohs(hmacs->param_hdr.length) -
		 sizeof(struct sctp_paramhdr)) >> 1;
	for (i = 0; i < n_elt; i++) {
		id = ntohs(hmacs->hmac_ids[i]);
		if (sctp_hmac_supported(id))
			return &sctp_hmac_list[id];
	}

	return NULL;
}

/* Linear search of the first n_elts entries of hmacs for hmac_id.
 * Both sides are __be16 and are compared without byte swapping.
 * Returns 1 when found, 0 otherwise.
 */
static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id)
{
	int  found = 0;
	int  i;

	for (i = 0; i < n_elts; i++) {
		if (hmac_id == hmacs[i]) {
			found = 1;
			break;
		}
	}

	return found;
}

/* See if the HMAC_ID is one that we claim as supported.
 *
 * Looks hmac_id up in the HMAC list stored in asoc->c.auth_hmacs.
 * Returns 1 when listed, 0 when not listed or when asoc is NULL.
 */
int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc,
				    __be16 hmac_id)
{
	struct sctp_hmac_algo_param *hmacs;
	__u16 n_elt;

	if (!asoc)
		return 0;

	hmacs = (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs;
	/* Number of 16-bit ids carried after the parameter header */
	n_elt = (ntohs(hmacs->param_hdr.length) -
		 sizeof(struct sctp_paramhdr)) >> 1;

	return __sctp_auth_find_hmacid(hmacs->hmac_ids, n_elt, hmac_id);
}


/* Cache the default HMAC id.  This to follow this text from SCTP-AUTH:
 * Section 6.1:
 *   The receiver of a HMAC-ALGO parameter SHOULD use the first listed
 *   algorithm it supports.
 *
 * Leaves asoc->default_hmac_id untouched when it is already non-zero or
 * when no id in hmacs is supported.
 */
void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc,
				     struct sctp_hmac_algo_param *hmacs)
{
	__u16   id;
	int	i;
	int	n_params;

	/* if the default id is already set, use it */
	if (asoc->default_hmac_id)
		return;

	n_params = (ntohs(hmacs->param_hdr.length) -
		    sizeof(struct sctp_paramhdr)) >> 1;
	for (i = 0; i < n_params; i++) {
		id = ntohs(hmacs->hmac_ids[i]);
		if (sctp_hmac_supported(id)) {
			asoc->default_hmac_id = id;
			break;
		}
	}
}


/* Check to see if the given chunk is supposed to be authenticated.
 *
 * Scans the chunk-type bytes of param for chunk.  Returns 1 when listed,
 * 0 otherwise, including when param is NULL or its length field is zero.
 */
static int __sctp_auth_cid(enum sctp_cid chunk, struct sctp_chunks_param *param)
{
	unsigned short len;
	int found = 0;
	int i;

	if (!param || param->param_hdr.length == 0)
		return 0;

	/* One byte per listed chunk type after the parameter header */
	len = ntohs(param->param_hdr.length) - sizeof(struct sctp_paramhdr);

	/* SCTP-AUTH, Section 3.2
	 *    The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE and AUTH
	 *    chunks MUST NOT be listed in the CHUNKS parameter.  However, if
	 *    a CHUNKS parameter is received then the types for INIT, INIT-ACK,
	 *    SHUTDOWN-COMPLETE and AUTH chunks MUST be ignored.
	 */
	for (i = 0; !found && i < len; i++) {
		switch (param->chunks[i]) {
		case SCTP_CID_INIT:
		case SCTP_CID_INIT_ACK:
		case SCTP_CID_SHUTDOWN_COMPLETE:
		case SCTP_CID_AUTH:
			break;

		default:
			if (param->chunks[i] == chunk)
				found = 1;
			break;
		}
	}

	return found;
}

/* Check if peer requested that this chunk is authenticated.
 * Returns 0 when asoc is NULL or the peer is not auth capable; otherwise
 * the result of looking chunk up in asoc->peer.peer_chunks.
 */
int sctp_auth_send_cid(enum sctp_cid chunk, const struct sctp_association *asoc)
{
	if (!asoc)
		return 0;

	if (!asoc->peer.auth_capable)
		return 0;

	return __sctp_auth_cid(chunk, asoc->peer.peer_chunks);
}

/* Check if we requested that peer authenticate this chunk.
 * Returns 0 when asoc is NULL or the peer is not auth capable; otherwise
 * the result of looking chunk up in asoc->c.auth_chunks.
 */
int sctp_auth_recv_cid(enum sctp_cid chunk, const struct sctp_association *asoc)
{
	if (!asoc)
		return 0;

	if (!asoc->peer.auth_capable)
		return 0;

	return __sctp_auth_cid(chunk,
			      (struct sctp_chunks_param *)asoc->c.auth_chunks);
}

/* SCTP-AUTH: Section 6.2:
 *    The sender MUST calculate the MAC as described in RFC2104 [2] using
 *    the hash function H as described by the MAC Identifier and the shared
 *    association key K based on the endpoint pair shared key described by
 *    the shared key identifier.  The 'data' used for the computation of
 *    the AUTH-chunk is given by the AUTH chunk with its HMAC field set to
 *    zero (as shown in Figure 6) followed by all chunks that are placed
 *    after the AUTH chunk in the SCTP packet.
 *
 * The key id and hmac id are read from the AUTH header.  When the key id
 * is the association's active key, asoc->asoc_shared_key is used; for any
 * other id a temporary secret is derived from ep_key and dropped before
 * returning.  If deriving that secret fails the function returns without
 * writing a digest.
 */
void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
			      struct sk_buff *skb, struct sctp_auth_chunk *auth,
			      struct sctp_shared_key *ep_key, gfp_t gfp)
{
	struct sctp_auth_bytes *asoc_key;
	__u16 key_id, hmac_id;
	int free_key = 0;
	size_t data_len;
	__u8 *digest;

	/* Extract the info we need:
	 * - hmac id
	 * - key id
	 */
	key_id = ntohs(auth->auth_hdr.shkey_id);
	hmac_id = ntohs(auth->auth_hdr.hmac_id);

	if (key_id == asoc->active_key_id)
		asoc_key = asoc->asoc_shared_key;
	else {
		/* ep_key can't be NULL here */
		asoc_key = sctp_auth_asoc_create_secret(asoc, ep_key, gfp);
		if (!asoc_key)
			return;

		/* This secret is ours alone; release it once the MAC is done */
		free_key = 1;
	}

	/* Everything from the start of the AUTH chunk to the skb tail is hashed */
	data_len = skb_tail_pointer(skb) - (unsigned char *)auth;
	/* The digest is written to the bytes right behind the AUTH header */
	digest = (u8 *)(&auth->auth_hdr + 1);
	if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1) {
		hmac_sha1_usingrawkey(asoc_key->data, asoc_key->len,
				      (const u8 *)auth, data_len, digest);
	} else {
		/* Any id other than SHA1 is processed as SHA256; warn if it is not */
		WARN_ON_ONCE(hmac_id != SCTP_AUTH_HMAC_ID_SHA256);
		hmac_sha256_usingrawkey(asoc_key->data, asoc_key->len,
					(const u8 *)auth, data_len, digest);
	}

	if (free_key)
		sctp_auth_key_put(asoc_key);
}

/* API Helpers */

/* Add a chunk to the endpoint authenticated chunk list.
 *
 * Returns 0 when chunk_id is already listed or has been appended, and
 * -EINVAL when the list already holds SCTP_NUM_CHUNK_TYPES entries.
 */
int sctp_auth_ep_add_chunkid(struct sctp_endpoint *ep, __u8 chunk_id)
{
	struct sctp_chunks_param *p = ep->auth_chunk_list;
	__u16 nchunks;
	__u16 param_len;

	/* If this chunk is already specified, we are done */
	if (__sctp_auth_cid(chunk_id, p))
		return 0;

	/* Check if we can add this chunk to the array */
	param_len = ntohs(p->param_hdr.length);
	nchunks = param_len - sizeof(struct sctp_paramhdr);
	if (nchunks == SCTP_NUM_CHUNK_TYPES)
		return -EINVAL;

	/* Append the id and grow the encoded parameter length by one byte */
	p->chunks[nchunks] = chunk_id;
	p->param_hdr.length = htons(param_len + 1);
	return 0;
}

/* Add hmac identifiers to the endpoint list of supported hmac ids.
 *
 * Returns -EOPNOTSUPP when any requested id is unsupported and -EINVAL
 * when SHA1 is not among the requested ids.  On success the endpoint list
 * is overwritten with the requested ids in network byte order and 0 is
 * returned.
 *
 * NOTE(review): shmac_num_idents is not checked here against the capacity
 * of ep->auth_hmacs_list->hmac_ids; presumably the caller bounds it --
 * confirm.
 */
int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep,
			   struct sctp_hmacalgo *hmacs)
{
	int has_sha1 = 0;
	__u16 id;
	int i;

	/* Scan the list looking for unsupported id.  Also make sure that
	 * SHA1 is specified.
	 */
	for (i = 0; i < hmacs->shmac_num_idents; i++) {
		id = hmacs->shmac_idents[i];

		if (!sctp_hmac_supported(id))
			return -EOPNOTSUPP;

		if (SCTP_AUTH_HMAC_ID_SHA1 == id)
			has_sha1 = 1;
	}

	if (!has_sha1)
		return -EINVAL;

	/* Validation passed: store the ids and re-encode the parameter length */
	for (i = 0; i < hmacs->shmac_num_idents; i++)
		ep->auth_hmacs_list->hmac_ids[i] =
				htons(hmacs->shmac_idents[i]);
	ep->auth_hmacs_list->param_hdr.length =
			htons(sizeof(struct sctp_paramhdr) +
			hmacs->shmac_num_idents * sizeof(__u16));
	return 0;
}

/* Set a new shared key on either endpoint or association.  If the
 * key with a same ID already exists, replace the key (remove the
 * old key and add a new one).
 *
 * The association's key list is used when asoc is non-NULL, otherwise the
 * endpoint's.  Returns -EACCES when authentication is not available on the
 * selected object, -ENOMEM when an allocation fails or when re-initialising
 * the association's active key fails, and 0 on success.
 */
int sctp_auth_set_key(struct sctp_endpoint *ep,
		      struct sctp_association *asoc,
		      struct sctp_authkey *auth_key)
{
	struct sctp_shared_key *cur_key, *shkey;
	struct sctp_auth_bytes *key;
	struct list_head *sh_keys;
	int replace = 0;

	/* Try to find the given key id to see if
	 * we are doing a replace, or adding a new key
	 */
	if (asoc) {
		if (!asoc->peer.auth_capable)
			return -EACCES;
		sh_keys = &asoc->endpoint_shared_keys;
	} else {
		if (!ep->auth_enable)
			return -EACCES;
		sh_keys = &ep->endpoint_shared_keys;
	}

	/* shkey is only meaningful after this loop when replace is set */
	key_for_each(shkey, sh_keys) {
		if (shkey->key_id == auth_key->sca_keynumber) {
			replace = 1;
			break;
		}
	}

	cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber, GFP_KERNEL);
	if (!cur_key)
		return -ENOMEM;

	/* Create a new key data based on the info passed in */
	key = sctp_auth_create_key(auth_key->sca_keylength, GFP_KERNEL);
	if (!key) {
		kfree(cur_key);
		return -ENOMEM;
	}

	memcpy(key->data, &auth_key->sca_key[0], auth_key->sca_keylength);
	cur_key->key = key;

	if (!replace) {
		list_add(&cur_key->key_list, sh_keys);
		return 0;
	}

	/* Swap the old entry for the new one on the list */
	list_del_init(&shkey->key_list);
	list_add(&cur_key->key_list, sh_keys);

	/* Replacing the association's active key requires re-initialising it.
	 * If that fails, undo the swap so the old key stays in place.
	 */
	if (asoc && asoc->active_key_id == auth_key->sca_keynumber &&
	    sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) {
		list_del_init(&cur_key->key_list);
		sctp_auth_shkey_release(cur_key);
		list_add(&shkey->key_list, sh_keys);
		return -ENOMEM;
	}

	/* The old entry is no longer on the list; release it */
	sctp_auth_shkey_release(shkey);
	return 0;
}
int sctp_auth_set_active_key(struct sctp_endpoint *ep, struct sctp_association *asoc, __u16 key_id) { struct sctp_shared_key *key; struct list_head *sh_keys; int found = 0; /* The key identifier MUST correst to an existing key */ if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; sh_keys = &asoc->endpoint_shared_keys; } else { if (!ep->auth_enable) return -EACCES; sh_keys = &ep->endpoint_shared_keys; } key_for_each(key, sh_keys) { if (key->key_id == key_id) { found = 1; break; } } if (!found || key->deactivated) return -EINVAL; if (asoc) { __u16 active_key_id = asoc->active_key_id; asoc->active_key_id = key_id; if (sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) { asoc->active_key_id = active_key_id; return -ENOMEM; } } else ep->active_key_id = key_id; return 0; } int sctp_auth_del_key_id(struct sctp_endpoint *ep, struct sctp_association *asoc, __u16 key_id) { struct sctp_shared_key *key; struct list_head *sh_keys; int found = 0; /* The key identifier MUST NOT be the current active key * The key identifier MUST correst to an existing key */ if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; if (asoc->active_key_id == key_id) return -EINVAL; sh_keys = &asoc->endpoint_shared_keys; } else { if (!ep->auth_enable) return -EACCES; if (ep->active_key_id == key_id) return -EINVAL; sh_keys = &ep->endpoint_shared_keys; } key_for_each(key, sh_keys) { if (key->key_id == key_id) { found = 1; break; } } if (!found) return -EINVAL; /* Delete the shared key */ list_del_init(&key->key_list); sctp_auth_shkey_release(key); return 0; } int sctp_auth_deact_key_id(struct sctp_endpoint *ep, struct sctp_association *asoc, __u16 key_id) { struct sctp_shared_key *key; struct list_head *sh_keys; int found = 0; /* The key identifier MUST NOT be the current active key * The key identifier MUST correst to an existing key */ if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; if (asoc->active_key_id == key_id) return -EINVAL; sh_keys = &asoc->endpoint_shared_keys; } else { 
if (!ep->auth_enable) return -EACCES; if (ep->active_key_id == key_id) return -EINVAL; sh_keys = &ep->endpoint_shared_keys; } key_for_each(key, sh_keys) { if (key->key_id == key_id) { found = 1; break; } } if (!found) return -EINVAL; /* refcnt == 1 and !list_empty mean it's not being used anywhere * and deactivated will be set, so it's time to notify userland * that this shkey can be freed. */ if (asoc && !list_empty(&key->key_list) && refcount_read(&key->refcnt) == 1) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, key->key_id, SCTP_AUTH_FREE_KEY, GFP_KERNEL); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } key->deactivated = 1; return 0; } int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) { /* Allocate space for HMACS and CHUNKS authentication * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. */ if (!ep->auth_hmacs_list) { struct sctp_hmac_algo_param *auth_hmacs; auth_hmacs = kzalloc_flex(*auth_hmacs, hmac_ids, SCTP_AUTH_NUM_HMACS, gfp); if (!auth_hmacs) goto nomem; /* Initialize the HMACS parameter. * SCTP-AUTH: Section 3.3 * Every endpoint supporting SCTP chunk authentication MUST * support the HMAC based on the SHA-1 algorithm. 
*/ auth_hmacs->param_hdr.type = SCTP_PARAM_HMAC_ALGO; auth_hmacs->param_hdr.length = htons(sizeof(struct sctp_paramhdr) + 2); auth_hmacs->hmac_ids[0] = htons(SCTP_AUTH_HMAC_ID_SHA1); ep->auth_hmacs_list = auth_hmacs; } if (!ep->auth_chunk_list) { struct sctp_chunks_param *auth_chunks; auth_chunks = kzalloc(sizeof(*auth_chunks) + SCTP_NUM_CHUNK_TYPES, gfp); if (!auth_chunks) goto nomem; /* Initialize the CHUNKS parameter */ auth_chunks->param_hdr.type = SCTP_PARAM_CHUNKS; auth_chunks->param_hdr.length = htons(sizeof(struct sctp_paramhdr)); ep->auth_chunk_list = auth_chunks; } return 0; nomem: /* Free all allocations */ kfree(ep->auth_hmacs_list); kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; return -ENOMEM; } void sctp_auth_free(struct sctp_endpoint *ep) { kfree(ep->auth_hmacs_list); kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; }
6 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_WAIT_BIT_H
#define _LINUX_WAIT_BIT_H

/*
 * Linux wait-bit related types and methods:
 */
#include <linux/wait.h>

/*
 * Identifies what a waiter is waiting for: @flags is the word address and
 * @bit_nr the bit within it (see __WAIT_BIT_KEY_INITIALIZER()).
 * NOTE(review): @timeout is presumably the deadline used by the timeout
 * variants -- confirm against the out-of-line implementation.
 */
struct wait_bit_key {
	unsigned long		*flags;
	int			bit_nr;
	unsigned long		timeout;
};

/* A wait queue entry paired with the key describing the awaited bit. */
struct wait_bit_queue_entry {
	struct wait_bit_key	key;
	struct wait_queue_entry	wq_entry;
};

#define __WAIT_BIT_KEY_INITIALIZER(word, bit)					\
	{ .flags = word, .bit_nr = bit, }

/* Signature of the "how to sleep" callbacks such as bit_wait(). */
typedef int wait_bit_action_f(struct wait_bit_key *key, int mode);

void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit);
int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
void wake_up_bit(unsigned long *word, int bit);
int out_of_line_wait_on_bit(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode);
int out_of_line_wait_on_bit_timeout(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
int out_of_line_wait_on_bit_lock(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode);
struct wait_queue_head *bit_waitqueue(unsigned long *word, int bit);
extern void __init wait_bit_init(void);

int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);

#define DEFINE_WAIT_BIT(name, word, bit)					\
	struct wait_bit_queue_entry name = {					\
		.key = __WAIT_BIT_KEY_INITIALIZER(word, bit),			\
		.wq_entry = {							\
			.private	= current,				\
			.func		= wake_bit_function,			\
			.entry		=					\
				LIST_HEAD_INIT((name).wq_entry.entry),		\
		},								\
	}

extern int bit_wait(struct wait_bit_key *key, int mode);
extern int bit_wait_io(struct wait_bit_key *key, int mode);
extern int bit_wait_timeout(struct wait_bit_key *key, int mode);

/**
 * wait_on_bit - wait for a bit to be cleared
 * @word: the address containing the bit being waited on
 * @bit: the bit at that address being waited on
 * @mode: the task state to sleep in
 *
 * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP())
 * to be cleared.  The clearing of the bit must be signalled with
 * wake_up_bit(), often as clear_and_wake_up_bit().
 *
 * The process will wait on a waitqueue selected by hash from a shared
 * pool.  It will only be woken on a wake_up for the target bit, even
 * if other processes on the same queue are waiting for other bits.
 *
 * Returned value will be zero if the bit was cleared in which case the
 * call has ACQUIRE semantics, or %-EINTR if the process received a
 * signal and the mode permitted wake up on that signal.
 */
static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
	might_sleep();
	/* Fast path: the bit is already clear, no need to sleep */
	if (!test_bit_acquire(bit, word))
		return 0;
	return out_of_line_wait_on_bit(word, bit,
				       bit_wait,
				       mode);
}

/**
 * wait_on_bit_io - wait for a bit to be cleared
 * @word: the address containing the bit being waited on
 * @bit: the bit at that address being waited on
 * @mode: the task state to sleep in
 *
 * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP())
 * to be cleared.  The clearing of the bit must be signalled with
 * wake_up_bit(), often as clear_and_wake_up_bit().
 *
 * This is similar to wait_on_bit(), but calls io_schedule() instead of
 * schedule() for the actual waiting.
 *
 * Returned value will be zero if the bit was cleared in which case the
 * call has ACQUIRE semantics, or %-EINTR if the process received a
 * signal and the mode permitted wake up on that signal.
 */
static inline int
wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
{
	might_sleep();
	/* Fast path: the bit is already clear, no need to sleep */
	if (!test_bit_acquire(bit, word))
		return 0;
	return out_of_line_wait_on_bit(word, bit,
				       bit_wait_io,
				       mode);
}

/**
 * wait_on_bit_timeout - wait for a bit to be cleared or a timeout to elapse
 * @word: the address containing the bit being waited on
 * @bit: the bit at that address being waited on
 * @mode: the task state to sleep in
 * @timeout: timeout, in jiffies
 *
 * Wait for the given bit in an unsigned long or bitmap (see
 * DECLARE_BITMAP()) to be cleared, or for a timeout to expire.  The
 * clearing of the bit must be signalled with wake_up_bit(), often as
 * clear_and_wake_up_bit().
 *
 * This is similar to wait_on_bit(), except it also takes a timeout
 * parameter.
 *
 * Returned value will be zero if the bit was cleared in which case the
 * call has ACQUIRE semantics, or %-EINTR if the process received a
 * signal and the mode permitted wake up on that signal, or %-EAGAIN if the
 * timeout elapsed.
 */
static inline int
wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
		    unsigned long timeout)
{
	might_sleep();
	/* Fast path: the bit is already clear, no need to sleep */
	if (!test_bit_acquire(bit, word))
		return 0;
	return out_of_line_wait_on_bit_timeout(word, bit,
					       bit_wait_timeout,
					       mode, timeout);
}

/**
 * wait_on_bit_action - wait for a bit to be cleared
 * @word: the address containing the bit waited on
 * @bit: the bit at that address being waited on
 * @action: the function used to sleep, which may take special actions
 * @mode: the task state to sleep in
 *
 * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP())
 * to be cleared.  The clearing of the bit must be signalled with
 * wake_up_bit(), often as clear_and_wake_up_bit().
 *
 * This is similar to wait_on_bit(), but calls @action() instead of
 * schedule() for the actual waiting.
 *
 * Returned value will be zero if the bit was cleared in which case the
 * call has ACQUIRE semantics, or the error code returned by @action if
 * that call returned non-zero.
 */
static inline int
wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
		   unsigned mode)
{
	might_sleep();
	/* Fast path: the bit is already clear, no need to sleep */
	if (!test_bit_acquire(bit, word))
		return 0;
	return out_of_line_wait_on_bit(word, bit, action, mode);
}

/**
 * wait_on_bit_lock - wait for a bit to be cleared, then set it
 * @word: the address containing the bit being waited on
 * @bit: the bit of the word being waited on and set
 * @mode: the task state to sleep in
 *
 * Wait for the given bit in an unsigned long or bitmap (see
 * DECLARE_BITMAP()) to be cleared.  The clearing of the bit must be
 * signalled with wake_up_bit(), often as clear_and_wake_up_bit().  As
 * soon as it is clear, atomically set it and return.
 *
 * This is similar to wait_on_bit(), but sets the bit before returning.
 *
 * Returned value will be zero if the bit was successfully set in which
 * case the call has the same memory sequencing semantics as
 * test_and_clear_bit(), or %-EINTR if the process received a signal and
 * the mode permitted wake up on that signal.
 */
static inline int
wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
{
	might_sleep();
	/* Fast path: the bit was clear and is now set by us */
	if (!test_and_set_bit(bit, word))
		return 0;
	return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode);
}

/**
 * wait_on_bit_lock_io - wait for a bit to be cleared, then set it
 * @word: the address containing the bit being waited on
 * @bit: the bit of the word being waited on and set
 * @mode: the task state to sleep in
 *
 * Wait for the given bit in an unsigned long or bitmap (see
 * DECLARE_BITMAP()) to be cleared.  The clearing of the bit must be
 * signalled with wake_up_bit(), often as clear_and_wake_up_bit().  As
 * soon as it is clear, atomically set it and return.
 *
 * This is similar to wait_on_bit_lock(), but calls io_schedule() instead
 * of schedule().
 *
 * Returns zero if the bit was (eventually) found to be clear and was
 * set.  Returns non-zero if a signal was delivered to the process and
 * the @mode allows that signal to wake the process.
 */
static inline int
wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
{
	might_sleep();
	/* Fast path: the bit was clear and is now set by us */
	if (!test_and_set_bit(bit, word))
		return 0;
	return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode);
}

/**
 * wait_on_bit_lock_action - wait for a bit to be cleared, then set it
 * @word: the address containing the bit being waited on
 * @bit: the bit of the word being waited on and set
 * @action: the function used to sleep, which may take special actions
 * @mode: the task state to sleep in
 *
 * This is similar to wait_on_bit_lock(), but calls @action() instead of
 * schedule() for the actual waiting.
 *
 * Returned value will be zero if the bit was successfully set in which
 * case the call has the same memory sequencing semantics as
 * test_and_clear_bit(), or the error code returned by @action if that
 * call returned non-zero.
 */
static inline int
wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
			unsigned mode)
{
	might_sleep();
	/* Fast path: the bit was clear and is now set by us */
	if (!test_and_set_bit(bit, word))
		return 0;
	return out_of_line_wait_on_bit_lock(word, bit, action, mode);
}

extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags);
extern void wake_up_var(void *var);
extern wait_queue_head_t *__var_waitqueue(void *p);

/*
 * Core wait loop shared by the wait_var_event*() macros below.
 *
 * Each iteration calls prepare_to_wait_event(), then tests @condition and
 * leaves the loop through finish_wait() when it holds.  If @state is
 * interruptible and prepare_to_wait_event() returned non-zero, that value
 * becomes the result and the macro jumps straight to __out, bypassing
 * finish_wait().  Otherwise @cmd is run to do the actual sleeping.
 * The expression evaluates to __ret, which starts out as @ret and may be
 * updated by @cmd.
 */
#define ___wait_var_event(var, condition, state, exclusive, ret, cmd)	\
({									\
	__label__ __out;						\
	struct wait_queue_head *__wq_head = __var_waitqueue(var);	\
	struct wait_bit_queue_entry __wbq_entry;			\
	long __ret = ret; /* explicit shadow */				\
									\
	init_wait_var_entry(&__wbq_entry, var,				\
			    exclusive ? WQ_FLAG_EXCLUSIVE : 0);		\
	for (;;) {							\
		long __int = prepare_to_wait_event(__wq_head,		\
						   &__wbq_entry.wq_entry, \
						   state);		\
		if (condition)						\
			break;						\
									\
		if (___wait_is_interruptible(state) && __int) {		\
			__ret = __int;					\
			goto __out;					\
		}							\
									\
		cmd;							\
	}								\
	finish_wait(__wq_head, &__wbq_entry.wq_entry);			\
__out:	__ret;								\
})

#define __wait_var_event(var, condition)				\
	___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
			  schedule())
#define __wait_var_event_io(var, condition)				\
	___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
			  io_schedule())

/**
 * wait_var_event - wait for a variable to be updated and notified
 * @var: the address of variable being waited on
 * @condition: the condition to wait for
 *
 * Wait for a @condition to be true, only re-checking when a wake up is
 * received for the given @var (an arbitrary kernel address which need
 * not be directly related to the given condition, but usually is).
 *
 * The process will wait on a waitqueue selected by hash from a shared
 * pool.  It will only be woken on a wake_up for the given address.
 *
 * The condition should normally use smp_load_acquire() or a similarly
 * ordered access to ensure that any changes to memory made before the
 * condition became true will be visible after the wait completes.
 */
#define wait_var_event(var, condition)					\
do {									\
	might_sleep();							\
	if (condition)							\
		break;							\
	__wait_var_event(var, condition);				\
} while (0)

/**
 * wait_var_event_io - wait for a variable to be updated and notified
 * @var: the address of variable being waited on
 * @condition: the condition to wait for
 *
 * Wait for an IO related @condition to be true, only re-checking when a
 * wake up is received for the given @var (an arbitrary kernel address
 * which need not be directly related to the given condition, but
 * usually is).
 *
 * The process will wait on a waitqueue selected by hash from a shared
 * pool.  It will only be woken on a wake_up for the given address.
 *
 * This is similar to wait_var_event(), but calls io_schedule() instead
 * of schedule().
 *
 * The condition should normally use smp_load_acquire() or a similarly
 * ordered access to ensure that any changes to memory made before the
 * condition became true will be visible after the wait completes.
 */
#define wait_var_event_io(var, condition)				\
do {									\
	might_sleep();							\
	if (condition)							\
		break;							\
	__wait_var_event_io(var, condition);				\
} while (0)

#define __wait_var_event_killable(var, condition)			\
	___wait_var_event(var, condition, TASK_KILLABLE, 0, 0,		\
			  schedule())

/**
 * wait_var_event_killable - wait for a variable to be updated and notified
 * @var: the address of variable being waited on
 * @condition: the condition to wait for
 *
 * Wait for a @condition to be true or a fatal signal to be received,
 * only re-checking the condition when a wake up is received for the given
 * @var (an arbitrary kernel address which need not be directly related
 * to the given condition, but usually is).
 *
 * This is similar to wait_var_event() but returns a value which is
 * 0 if the condition became true, or %-ERESTARTSYS if a fatal signal
 * was received.
 *
 * The condition should normally use smp_load_acquire() or a similarly
 * ordered access to ensure that any changes to memory made before the
 * condition became true will be visible after the wait completes.
 */
#define wait_var_event_killable(var, condition)				\
({									\
	int __ret = 0;							\
	might_sleep();							\
	if (!(condition))						\
		__ret = __wait_var_event_killable(var, condition);	\
	__ret;								\
})

#define __wait_var_event_timeout(var, condition, timeout)		\
	___wait_var_event(var, ___wait_cond_timeout(condition),		\
			  TASK_UNINTERRUPTIBLE, 0, timeout,		\
			  __ret = schedule_timeout(__ret))

/**
 * wait_var_event_timeout - wait for a variable to be updated or a timeout to expire
 * @var: the address of variable being waited on
 * @condition: the condition to wait for
 * @timeout: maximum time to wait in jiffies
 *
 * Wait for a @condition to be true or a timeout to expire, only
 * re-checking the condition when a wake up is received for the given
 * @var (an arbitrary kernel address which need not be directly related
 * to the given condition, but usually is).
 *
 * This is similar to wait_var_event() but returns a value which is 0 if
 * the timeout expired and the condition was still false, or the
 * remaining time left in the timeout (but at least 1) if the condition
 * was found to be true.
 *
 * The condition should normally use smp_load_acquire() or a similarly
 * ordered access to ensure that any changes to memory made before the
 * condition became true will be visible after the wait completes.
 */
#define wait_var_event_timeout(var, condition, timeout)			\
({									\
	long __ret = timeout;						\
	might_sleep();							\
	if (!___wait_cond_timeout(condition))				\
		__ret = __wait_var_event_timeout(var, condition, timeout); \
	__ret;								\
})

#define __wait_var_event_interruptible(var, condition)			\
	___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0,	\
			  schedule())

/**
 * wait_var_event_interruptible - wait for a variable to be updated and notified
 * @var: the address of variable being waited on
 * @condition: the condition to wait for
 *
 * Wait for a @condition to be true or a signal to be received, only
 * re-checking the condition when a wake up is received for the given
 * @var (an arbitrary kernel address which need not be directly related
 * to the given condition, but usually is).
 *
 * This is similar to wait_var_event() but returns a value which is 0 if
 * the condition became true, or %-ERESTARTSYS if a signal was received.
 *
 * The condition should normally use smp_load_acquire() or a similarly
 * ordered access to ensure that any changes to memory made before the
 * condition became true will be visible after the wait completes.
 */
#define wait_var_event_interruptible(var, condition)			\
({									\
	int __ret = 0;							\
	might_sleep();							\
	if (!(condition))						\
		__ret = __wait_var_event_interruptible(var, condition);	\
	__ret;								\
})

/**
 * wait_var_event_any_lock - wait for a variable to be updated under a lock
 * @var: the address of the variable being waited on
 * @condition: condition to wait for
 * @lock: the object that is locked to protect updates to the variable
 * @type: prefix on lock and unlock operations
 * @state: waiting state, %TASK_UNINTERRUPTIBLE etc.
 *
 * Wait for a condition which can only be reliably tested while holding
 * a lock.  The variables assessed in the condition will normally be updated
 * under the same lock, and the wake up should be signalled with
 * wake_up_var_locked() under the same lock.
 *
 * This is similar to wait_var_event(), but assumes a lock is held
 * while calling this function and while updating the variable.
 *
 * This must be called while the given lock is held and the lock will be
 * dropped when schedule() is called to wait for a wake up, and will be
 * reclaimed before testing the condition again.  The functions used to
 * unlock and lock the object are constructed by appending _unlock and _lock
 * to @type.
 *
 * Return %-ERESTARTSYS if a signal arrives which is allowed to interrupt
 * the wait according to @state.
 */
#define wait_var_event_any_lock(var, condition, lock, type, state)	\
({									\
	int __ret = 0;							\
	if (!(condition))						\
		__ret = ___wait_var_event(var, condition, state, 0, 0,	\
					  type ## _unlock(lock);	\
					  schedule();			\
					  type ## _lock(lock));		\
	__ret;								\
})

/**
 * wait_var_event_spinlock - wait for a variable to be updated under a spinlock
 * @var: the address of the variable being waited on
 * @condition: condition to wait for
 * @lock: the spinlock which protects updates to the variable
 *
 * Wait for a condition which can only be reliably tested while holding
 * a spinlock.  The variables assessed in the condition will normally be updated
 * under the same spinlock, and the wake up should be signalled with
 * wake_up_var_locked() under the same spinlock.
 *
 * This is similar to wait_var_event(), but assumes a spinlock is held
 * while calling this function and while updating the variable.
 *
 * This must be called while the given lock is held and the lock will be
 * dropped when schedule() is called to wait for a wake up, and will be
 * reclaimed before testing the condition again.
 */
#define wait_var_event_spinlock(var, condition, lock)			\
	wait_var_event_any_lock(var, condition, lock, spin, TASK_UNINTERRUPTIBLE)

/**
 * wait_var_event_mutex - wait for a variable to be updated under a mutex
 * @var: the address of the variable being waited on
 * @condition: condition to wait for
 * @lock: the mutex which protects updates to the variable
 *
 * Wait for a condition which can only be reliably tested while holding
 * a mutex.  The variables assessed in the condition will normally be
 * updated under the same mutex, and the wake up should be signalled
 * with wake_up_var_locked() under the same mutex.
 *
 * This is similar to wait_var_event(), but assumes a mutex is held
 * while calling this function and while updating the variable.
 *
 * This must be called while the given mutex is held and the mutex will be
 * dropped when schedule() is called to wait for a wake up, and will be
 * reclaimed before testing the condition again.
 */
#define wait_var_event_mutex(var, condition, lock)			\
	wait_var_event_any_lock(var, condition, lock, mutex, TASK_UNINTERRUPTIBLE)

/**
 * wake_up_var_protected - wake up waiters for a variable asserting that it is safe
 * @var: the address of the variable being waited on
 * @cond: the condition which affirms this is safe
 *
 * When waking waiters which use wait_var_event_any_lock() the waker must be
 * holding the relevant lock to avoid races.  This version of wake_up_var()
 * asserts that the relevant lock is held and so no barrier is needed.
 * The @cond is only tested when CONFIG_LOCKDEP is enabled.
 */
#define wake_up_var_protected(var, cond)				\
do {									\
	lockdep_assert(cond);						\
	wake_up_var(var);						\
} while (0)

/**
 * wake_up_var_locked - wake up waiters for a variable while holding a spinlock or mutex
 * @var: the address of the variable being waited on
 * @lock: The spinlock or mutex that protects the variable
 *
 * Send a wake up for the given variable which should be waited for with
 * wait_var_event_spinlock() or wait_var_event_mutex().  Unlike wake_up_var(),
 * no extra barriers are needed as the locking provides sufficient sequencing.
 */
#define wake_up_var_locked(var, lock)					\
	wake_up_var_protected(var, lockdep_is_held(lock))

/**
 * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit
 * @bit: the bit of the word being waited on
 * @word: the address containing the bit being waited on
 *
 * The designated bit is cleared and any tasks waiting in wait_on_bit()
 * or similar will be woken.  This call has RELEASE semantics so that
 * any changes to memory made before this call are guaranteed to be visible
 * after the corresponding wait_on_bit() completes.
 */
static inline void clear_and_wake_up_bit(int bit, unsigned long *word)
{
	clear_bit_unlock(bit, word);
	/* See wake_up_bit() for which memory barrier you need to use. */
	smp_mb__after_atomic();
	wake_up_bit(word, bit);
}

/**
 * test_and_clear_wake_up_bit - clear a bit if it was set: wake up anyone waiting on that bit
 * @bit: the bit of the word being waited on
 * @word: the address of memory containing that bit
 *
 * If the bit is set and can be atomically cleared, any tasks waiting in
 * wait_on_bit() or similar will be woken.  This call has the same
 * complete ordering semantics as test_and_clear_bit().  Any changes to
 * memory made before this call are guaranteed to be visible after the
 * corresponding wait_on_bit() completes.
 *
 * Returns %true if the bit was set, has now been cleared, and the wake up
 * was sent.  Returns %false, without sending a wake up, if the bit was
 * already clear.
 */
static inline bool test_and_clear_wake_up_bit(int bit, unsigned long *word)
{
	if (!test_and_clear_bit(bit, word))
		return false;
	/* no extra barrier required */
	wake_up_bit(word, bit);
	return true;
}

/**
 * atomic_dec_and_wake_up - decrement an atomic_t and if zero, wake up waiters
 * @var: the variable to dec and test
 *
 * Decrements the atomic variable and if it reaches zero, send a wake_up to any
 * processes waiting on the variable.
 *
 * This function has the same complete ordering semantics as atomic_dec_and_test.
 *
 * Returns %true if the variable reaches zero and the wake up was sent.
 */
static inline bool atomic_dec_and_wake_up(atomic_t *var)
{
	if (!atomic_dec_and_test(var))
		return false;
	/* No extra barrier required */
	wake_up_var(var);
	return true;
}

/**
 * store_release_wake_up - update a variable and send a wake_up
 * @var: the address of the variable to be updated and woken
 * @val: the value to store in the variable.
 *
 * Store the given value in the variable and send a wake up to any tasks
 * waiting on the variable.  All necessary barriers are included to ensure
 * the task calling wait_var_event() sees the new value and all values
 * written to memory before this call.
 */
#define store_release_wake_up(var, val)					\
do {									\
	smp_store_release(var, val);					\
	smp_mb();							\
	wake_up_var(var);						\
} while (0)

#endif /* _LINUX_WAIT_BIT_H */
2 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* */ #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include "rds.h" #include "ib.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); static const char *const rds_ib_stat_names[] = { "ib_connect_raced", "ib_listen_closed_stale", "ib_evt_handler_call", "ib_tasklet_call", "ib_tx_cq_event", "ib_tx_ring_full", "ib_tx_throttle", "ib_tx_sg_mapping_failure", "ib_tx_stalled", "ib_tx_credit_updates", "ib_rx_cq_event", "ib_rx_ring_empty", "ib_rx_refill_from_cq", "ib_rx_refill_from_thread", "ib_rx_alloc_limit", "ib_rx_total_frags", "ib_rx_total_incs", "ib_rx_credit_updates", "ib_ack_sent", "ib_ack_send_failure", "ib_ack_send_delayed", "ib_ack_send_piggybacked", "ib_ack_received", "ib_rdma_mr_8k_alloc", "ib_rdma_mr_8k_free", "ib_rdma_mr_8k_used", "ib_rdma_mr_8k_pool_flush", "ib_rdma_mr_8k_pool_wait", "ib_rdma_mr_8k_pool_depleted", "ib_rdma_mr_1m_alloc", "ib_rdma_mr_1m_free", "ib_rdma_mr_1m_used", "ib_rdma_mr_1m_pool_flush", "ib_rdma_mr_1m_pool_wait", "ib_rdma_mr_1m_pool_depleted", "ib_rdma_mr_8k_reused", "ib_rdma_mr_1m_reused", "ib_atomic_cswp", "ib_atomic_fadd", }; unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail) { struct rds_ib_statistics stats = {0, }; uint64_t *src; uint64_t *sum; size_t i; int cpu; if (avail < ARRAY_SIZE(rds_ib_stat_names)) goto out; for_each_online_cpu(cpu) { src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu)); sum = (uint64_t *)&stats; for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) *(sum++) += *(src++); } rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names, ARRAY_SIZE(rds_ib_stat_names)); out: return ARRAY_SIZE(rds_ib_stat_names); }
325 275 7 68 280 133 388 288 101 112 101 82 25 25 5 3 19 13 2 10 11 1 10 9 9 9 9 9 3 6 9 9 3 6 5 4 6 3 25 25 25 5 1 18 4 1 2 1 36 31 31 22 31 4 11 10 10 10 10 10 10 10 10 12 12 1 10 1 8 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 
980 981 982 983 984 985 986 987 988
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPV4 GSO/GRO offload support
 *	Linux INET implementation
 *
 *	UDPv4 GSO support
 */

#include <linux/skbuff.h>
#include <net/gro.h>
#include <net/gso.h>
#include <net/udp.h>
#include <net/protocol.h>
#include <net/inet_common.h>
#include <net/udp_tunnel.h>

#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)

/*
 * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL
 * values for the udp tunnel static call.
 */
static struct sk_buff *dummy_gro_rcv(struct sock *sk,
				     struct list_head *head,
				     struct sk_buff *skb)
{
	NAPI_GRO_CB(skb)->flush = 1;
	return NULL;
}

/* Signature shared by every tunnel gro_receive callback tracked below. */
typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk,
						struct list_head *head,
						struct sk_buff *skb);

/* One distinct gro_receive callback plus the number of sockets using it. */
struct udp_tunnel_type_entry {
	udp_tunnel_gro_rcv_t gro_receive;
	refcount_t count;
};

/* Capacity of udp_tunnel_gro_types[], derived from the enabled configs. */
#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \
			      IS_ENABLED(CONFIG_VXLAN) * 2 + \
			      IS_ENABLED(CONFIG_NET_FOU) * 2 + \
			      IS_ENABLED(CONFIG_XFRM) * 2)

DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv);
static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call);
/* udp_tunnel_gro_type_lock is held around updates of the type table. */
static DEFINE_MUTEX(udp_tunnel_gro_type_lock);
static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES];
static unsigned int udp_tunnel_gro_type_nr;
/* udp_tunnel_gro_lock is held around updates of the per-netns tunnel list. */
static DEFINE_SPINLOCK(udp_tunnel_gro_lock);

/*
 * Add (@add true) or remove @sk's tunnel_list node to/from the
 * net->ipv4.udp_tunnel_gro[] entry selected by the socket family, under
 * udp_tunnel_gro_lock.  Afterwards udp_tunnel_gro->sk is published as the
 * socket when the list holds exactly one entry, and as NULL otherwise.
 */
void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add)
{
	bool is_ipv6 = sk->sk_family == AF_INET6;
	struct udp_sock *tup, *up = udp_sk(sk);
	struct udp_tunnel_gro *udp_tunnel_gro;

	spin_lock(&udp_tunnel_gro_lock);
	udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6];
	if (add)
		hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list);
	else if (up->tunnel_list.pprev)
		hlist_del_init(&up->tunnel_list);

	/* Only a list with a first element and no second one has a sole owner. */
	if (udp_tunnel_gro->list.first &&
	    !udp_tunnel_gro->list.first->next) {
		tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock,
				  tunnel_list);

		rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup);
	} else {
		RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL);
	}

	spin_unlock(&udp_tunnel_gro_lock);
}
EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup);

/*
 * Account @sk's gro_receive callback in (or, with @add false, out of) the
 * refcounted udp_tunnel_gro_types[] table, under udp_tunnel_gro_type_lock.
 * When the table ends up with exactly one type, the static call is pointed
 * at that callback and the static branch enabled; when the count moves away
 * from one, the branch is disabled and the static call reset to
 * dummy_gro_rcv.  Does nothing if no tunnel type is configured or the
 * socket has no gro_receive callback.
 */
void udp_tunnel_update_gro_rcv(struct sock *sk, bool add)
{
	struct udp_tunnel_type_entry *cur = NULL;
	struct udp_sock *up = udp_sk(sk);
	int i, old_gro_type_nr;

	if (!UDP_MAX_TUNNEL_TYPES || !up->gro_receive)
		return;

	mutex_lock(&udp_tunnel_gro_type_lock);

	/* Check if the static call is permanently disabled. */
	if (udp_tunnel_gro_type_nr > UDP_MAX_TUNNEL_TYPES)
		goto out;

	/* Look for an existing entry carrying the same callback. */
	for (i = 0; i < udp_tunnel_gro_type_nr; i++)
		if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive)
			cur = &udp_tunnel_gro_types[i];

	old_gro_type_nr = udp_tunnel_gro_type_nr;
	if (add) {
		/*
		 * Update the matching entry, if found, or add a new one
		 * if needed
		 */
		if (cur) {
			refcount_inc(&cur->count);
			goto out;
		}

		if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) {
			pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n");
			/* Ensure static call will never be enabled */
			udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 1;
		} else {
			cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++];
			refcount_set(&cur->count, 1);
			cur->gro_receive = up->gro_receive;
		}
	} else {
		/*
		 * The stack cleanups only successfully added tunnel, the
		 * lookup on removal should never fail.
		 */
		if (WARN_ON_ONCE(!cur))
			goto out;

		if (!refcount_dec_and_test(&cur->count))
			goto out;

		/* Avoid gaps, so that the enable tunnel has always id 0 */
		*cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr];
	}

	if (udp_tunnel_gro_type_nr == 1) {
		static_call_update(udp_tunnel_gro_rcv,
				   udp_tunnel_gro_types[0].gro_receive);
		static_branch_enable(&udp_tunnel_static_call);
	} else if (old_gro_type_nr == 1) {
		static_branch_disable(&udp_tunnel_static_call);
		static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv);
	}

out:
	mutex_unlock(&udp_tunnel_gro_type_lock);
}
EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv);

/*
 * Hand @skb to the tunnel's GRO receive callback: through the static call
 * when the static branch is enabled (flagging the skb for flush and
 * returning NULL if gro_recursion_inc_test() trips), otherwise through the
 * socket's own gro_receive pointer.
 */
static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk,
					  struct list_head *head,
					  struct sk_buff *skb)
{
	if (static_branch_likely(&udp_tunnel_static_call)) {
		if (unlikely(gro_recursion_inc_test(skb))) {
			NAPI_GRO_CB(skb)->flush |= 1;
			return NULL;
		}
		return static_call(udp_tunnel_gro_rcv)(sk, head, skb);
	}
	return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
}

#else

/* Without CONFIG_NET_UDP_TUNNEL always call the socket's gro_receive. */
static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk,
					  struct list_head *head,
					  struct sk_buff *skb)
{
	return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
}

#endif

/*
 * Segment a UDP-tunnelled skb: temporarily turn @skb into its inner packet,
 * segment that with @gso_inner_segment, then push the outer headers back
 * onto every resulting segment and fix up the outer UDP length and, when
 * SKB_GSO_UDP_TUNNEL_CSUM is set, the outer UDP checksum.
 *
 * Returns the segment list, or whatever error/NULL value the inner
 * segmentation produced (after unwinding @skb), or ERR_PTR(-EINVAL) when
 * the tunnel header cannot be pulled.
 */
static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
	netdev_features_t features,
	struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
					     netdev_features_t features),
	__be16 new_protocol, bool is_ipv6)
{
	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
	bool remcsum, need_csum, offload_csum, gso_partial;
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	struct udphdr *uh = udp_hdr(skb);
	u16 mac_offset = skb->mac_header;
	__be16 protocol = skb->protocol;
	u16 mac_len = skb->mac_len;
	int udp_offset, outer_hlen;
	__wsum partial;
	bool need_ipsec;

	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
		goto out;

	/* Adjust partial header checksum to negate old length.
	 * We cannot rely on the value contained in uh->len as it is
	 * possible that the actual value exceeds the boundaries of the
	 * 16 bit length field due to the header being added outside of an
	 * IP or IPv6 frame that was already limited to 64K - 1.
	 */
	if (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)
		partial = (__force __wsum)uh->len;
	else
		partial = (__force __wsum)htonl(skb->len);
	partial = csum_sub(csum_unfold(uh->check), partial);

	/* setup inner skb. */
	skb->encapsulation = 0;
	SKB_GSO_CB(skb)->encap_level = 0;
	__skb_pull(skb, tnl_hlen);
	skb_reset_mac_header(skb);
	skb_set_network_header(skb, skb_inner_network_offset(skb));
	skb_set_transport_header(skb, skb_inner_transport_offset(skb));
	skb->mac_len = skb_inner_network_offset(skb);
	skb->protocol = new_protocol;

	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
	skb->encap_hdr_csum = need_csum;

	remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
	skb->remcsum_offload = remcsum;

	need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
	/* Try to offload checksum if possible */
	offload_csum = !!(need_csum &&
			  !need_ipsec &&
			  (skb->dev->features &
			   (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) :
				      (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))));

	features &= skb->dev->hw_enc_features;
	if (need_csum)
		features &= ~NETIF_F_SCTP_CRC;

	/* The only checksum offload we care about from here on out is the
	 * outer one so strip the existing checksum feature flags and
	 * instead set the flag based on our outer checksum offload value.
	 */
	if (remcsum) {
		features &= ~NETIF_F_CSUM_MASK;
		if (!need_csum || offload_csum)
			features |= NETIF_F_HW_CSUM;
	}

	/* segment inner packet. */
	segs = gso_inner_segment(skb, features);
	if (IS_ERR_OR_NULL(segs)) {
		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
				     mac_len);
		goto out;
	}

	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);

	outer_hlen = skb_tnl_header_len(skb);
	udp_offset = outer_hlen - tnl_hlen;
	skb = segs;
	/* Walk every produced segment and rebuild its outer headers. */
	do {
		unsigned int len;

		if (remcsum)
			skb->ip_summed = CHECKSUM_NONE;

		/* Set up inner headers if we are offloading inner checksum */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			skb_reset_inner_headers(skb);
			skb->encapsulation = 1;
		}

		skb->mac_len = mac_len;
		skb->protocol = protocol;

		__skb_push(skb, outer_hlen);
		skb_reset_mac_header(skb);
		skb_set_network_header(skb, mac_len);
		skb_set_transport_header(skb, udp_offset);
		len = skb->len - udp_offset;
		uh = udp_hdr(skb);

		/* If we are only performing partial GSO the inner header
		 * will be using a length value equal to only one MSS sized
		 * segment instead of the entire frame.
		 */
		if (gso_partial && skb_is_gso(skb)) {
			uh->len = htons(skb_shinfo(skb)->gso_size +
					SKB_GSO_CB(skb)->data_offset +
					skb->head - (unsigned char *)uh);
		} else {
			uh->len = htons(len);
		}

		if (!need_csum)
			continue;

		/* Fold the new length back into the length-less partial sum. */
		uh->check = ~csum_fold(csum_add(partial,
				       (__force __wsum)htonl(len)));

		if (skb->encapsulation || !offload_csum) {
			uh->check = gso_make_checksum(skb, ~uh->check);
			if (uh->check == 0)
				uh->check = CSUM_MANGLED_0;
		} else {
			skb->ip_summed = CHECKSUM_PARTIAL;
			skb->csum_start = skb_transport_header(skb) - skb->head;
			skb->csum_offset = offsetof(struct udphdr, check);
		}
	} while ((skb = skb->next));
out:
	return segs;
}

/*
 * Pick the inner segmentation callback from skb->inner_protocol_type
 * (skb_mac_gso_segment for ENCAP_TYPE_ETHER, the registered inet/inet6
 * offload's gso_segment for ENCAP_TYPE_IPPROTO) and run
 * __skb_udp_tunnel_segment() with it under rcu_read_lock().
 * Returns ERR_PTR(-EINVAL) when no suitable callback exists.
 */
struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
				       netdev_features_t features,
				       bool is_ipv6)
{
	const struct net_offload __rcu **offloads;
	__be16 protocol = skb->protocol;
	const struct net_offload *ops;
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
					     netdev_features_t features);

	rcu_read_lock();

	switch (skb->inner_protocol_type) {
	case ENCAP_TYPE_ETHER:
		protocol = skb->inner_protocol;
		gso_inner_segment = skb_mac_gso_segment;
		break;
	case ENCAP_TYPE_IPPROTO:
		offloads = is_ipv6 ? inet6_offloads : inet_offloads;
		ops = rcu_dereference(offloads[skb->inner_ipproto]);
		if (!ops || !ops->callbacks.gso_segment)
			goto out_unlock;
		gso_inner_segment = ops->callbacks.gso_segment;
		break;
	default:
		goto out_unlock;
	}

	segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment,
					protocol, is_ipv6);

out_unlock:
	rcu_read_unlock();

	return segs;
}

/*
 * Rewrite one address/port pair of @seg to the new values, adjusting the
 * UDP checksum (only when it is non-zero) and the IPv4 header checksum.
 * No-op when both address and port already match.
 */
static void __udpv4_gso_segment_csum(struct sk_buff *seg,
				     __be32 *oldip, __be32 *newip,
				     __be16 *oldport, __be16 *newport)
{
	struct udphdr *uh;
	struct iphdr *iph;

	if (*oldip == *newip && *oldport == *newport)
		return;

	uh = udp_hdr(seg);
	iph = ip_hdr(seg);

	if (uh->check) {
		inet_proto_csum_replace4(&uh->check, seg, *oldip, *newip,
					 true);
		inet_proto_csum_replace2(&uh->check, seg, *oldport, *newport,
					 false);
		if (!uh->check)
			uh->check = CSUM_MANGLED_0;
	}
	*oldport = *newport;

	csum_replace4(&iph->check, *oldip, *newip);
	*oldip = *newip;
}

/*
 * Make every following segment carry the first segment's IPv4 addresses
 * and UDP ports.  The walk is skipped entirely when the second segment
 * already matches the first one.
 */
static struct sk_buff *__udpv4_gso_segment_list_csum(struct sk_buff *segs)
{
	struct sk_buff *seg;
	struct udphdr *uh, *uh2;
	struct iphdr *iph, *iph2;

	seg = segs;
	uh = udp_hdr(seg);
	iph = ip_hdr(seg);

	if ((udp_hdr(seg)->dest == udp_hdr(seg->next)->dest) &&
	    (udp_hdr(seg)->source == udp_hdr(seg->next)->source) &&
	    (ip_hdr(seg)->daddr == ip_hdr(seg->next)->daddr) &&
	    (ip_hdr(seg)->saddr == ip_hdr(seg->next)->saddr))
		return segs;

	while ((seg = seg->next)) {
		uh2 = udp_hdr(seg);
		iph2 = ip_hdr(seg);

		__udpv4_gso_segment_csum(seg,
					 &iph2->saddr, &iph->saddr,
					 &uh2->source, &uh->source);
		__udpv4_gso_segment_csum(seg,
					 &iph2->daddr, &iph->daddr,
					 &uh2->dest, &uh->dest);
	}

	return segs;
}

/*
 * IPv6 counterpart of __udpv4_gso_segment_csum(): rewrite one address/port
 * pair of @seg and adjust the UDP checksum when it is non-zero.
 */
static void __udpv6_gso_segment_csum(struct sk_buff *seg,
				     struct in6_addr *oldip,
				     const struct in6_addr *newip,
				     __be16 *oldport, __be16 newport)
{
	struct udphdr *uh = udp_hdr(seg);

	if (ipv6_addr_equal(oldip, newip) && *oldport == newport)
		return;

	if (uh->check) {
		inet_proto_csum_replace16(&uh->check, seg, oldip->s6_addr32,
					  newip->s6_addr32, true);

		inet_proto_csum_replace2(&uh->check, seg, *oldport, newport,
					 false);
		if (!uh->check)
			uh->check = CSUM_MANGLED_0;
	}

	*oldip = *newip;
	*oldport = newport;
}

/*
 * Make every following segment carry the first segment's IPv6 addresses
 * and UDP ports.  The walk is skipped when the second segment already
 * matches the first one (both ports are compared as a single u32).
 */
static struct sk_buff *__udpv6_gso_segment_list_csum(struct sk_buff *segs)
{
	const struct ipv6hdr *iph;
	const struct udphdr *uh;
	struct ipv6hdr *iph2;
	struct sk_buff *seg;
	struct udphdr *uh2;

	seg = segs;
	uh = udp_hdr(seg);
	iph = ipv6_hdr(seg);
	uh2 = udp_hdr(seg->next);
	iph2 = ipv6_hdr(seg->next);

	if (!(*(const u32 *)&uh->source ^ *(const u32 *)&uh2->source) &&
	    ipv6_addr_equal(&iph->saddr, &iph2->saddr) &&
	    ipv6_addr_equal(&iph->daddr, &iph2->daddr))
		return segs;

	while ((seg = seg->next)) {
		uh2 = udp_hdr(seg);
		iph2 = ipv6_hdr(seg);

		__udpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr,
					 &uh2->source, uh->source);
		__udpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr,
					 &uh2->dest, uh->dest);
	}

	return segs;
}

/*
 * Segment a fraglist GSO skb with skb_segment_list(), set the first
 * segment's UDP length to header + gso_size, and then align the remaining
 * segments' addresses/ports via the per-family list_csum helper.
 */
static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb,
					      netdev_features_t features,
					      bool is_ipv6)
{
	unsigned int mss = skb_shinfo(skb)->gso_size;

	skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
	if (IS_ERR(skb))
		return skb;

	udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss);

	if (is_ipv6)
		return __udpv6_gso_segment_list_csum(skb);
	else
		return __udpv4_gso_segment_list_csum(skb);
}

/*
 * Segment a UDP GSO skb into gso_size sized datagrams.
 *
 * Returns the segment list; NULL when skb_gso_ok() reports that the
 * features can handle the skb as is (gso_segs is recomputed in that case);
 * or an ERR_PTR: -EINVAL when the payload does not exceed one segment or
 * the checksum start does not sit at the transport header of a
 * non-fraglist skb, or the error from linearizing/segmenting.
 */
struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
				  netdev_features_t features, bool is_ipv6)
{
	struct sock *sk = gso_skb->sk;
	unsigned int sum_truesize = 0;
	struct sk_buff *segs, *seg;
	struct udphdr *uh;
	unsigned int mss;
	bool copy_dtor;
	__sum16 check;
	__be16 newlen;
	int ret = 0;

	mss = skb_shinfo(gso_skb)->gso_size;
	if (gso_skb->len <= sizeof(*uh) + mss)
		return ERR_PTR(-EINVAL);

	if (unlikely(skb_checksum_start(gso_skb) !=
		     skb_transport_header(gso_skb) &&
		     !(skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST)))
		return ERR_PTR(-EINVAL);

	/* We don't know if egress device can segment and checksum the packet
	 * when IPv6 extension headers are present. Fall back to software GSO.
	 */
	if (gso_skb->ip_summed != CHECKSUM_PARTIAL)
		features &= ~(NETIF_F_GSO_UDP_L4 | NETIF_F_CSUM_MASK);

	if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) {
		/* Packet is from an untrusted source, reset gso_segs. */
		skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh),
							     mss);
		return NULL;
	}

	if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) {
		 /* Detect modified geometry and pass those to skb_segment. */
		if ((skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) &&
		    !(skb_shinfo(gso_skb)->gso_type & SKB_GSO_DODGY))
			return __udp_gso_segment_list(gso_skb, features, is_ipv6);

		ret = __skb_linearize(gso_skb);
		if (ret)
			return ERR_PTR(ret);

		 /* Setup csum, as fraglist skips this in udp4_gro_receive. */
		gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head;
		gso_skb->csum_offset = offsetof(struct udphdr, check);
		gso_skb->ip_summed = CHECKSUM_PARTIAL;

		uh = udp_hdr(gso_skb);
		if (is_ipv6)
			uh->check = ~udp_v6_check(gso_skb->len,
						  &ipv6_hdr(gso_skb)->saddr,
						  &ipv6_hdr(gso_skb)->daddr, 0);
		else
			uh->check = ~udp_v4_check(gso_skb->len,
						  ip_hdr(gso_skb)->saddr,
						  ip_hdr(gso_skb)->daddr, 0);
	}

	skb_pull(gso_skb, sizeof(*uh));

	/* clear destructor to avoid skb_segment assigning it to tail */
	copy_dtor = gso_skb->destructor == sock_wfree;
	if (copy_dtor) {
		gso_skb->destructor = NULL;
		gso_skb->sk = NULL;
	}

	segs = skb_segment(gso_skb, features);
	if (IS_ERR_OR_NULL(segs)) {
		/* Segmentation failed: put the destructor and owner back. */
		if (copy_dtor) {
			gso_skb->destructor = sock_wfree;
			gso_skb->sk = sk;
		}
		return segs;
	}

	seg = segs;
	uh = udp_hdr(seg);

	/* preserve TX timestamp flags and TS key for first segment */
	skb_shinfo(seg)->tskey = skb_shinfo(gso_skb)->tskey;
	skb_shinfo(seg)->tx_flags |=
			(skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP);

	/* compute checksum adjustment based on old length versus new */
	newlen = htons(sizeof(*uh) + mss);
	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);

	/* Fix up every segment but the last; the last one is handled below. */
	for (;;) {
		if (copy_dtor) {
			seg->destructor = sock_wfree;
			seg->sk = sk;
			sum_truesize += seg->truesize;
		}

		if (!seg->next)
			break;

		uh->len = newlen;
		uh->check = check;

		if (seg->ip_summed == CHECKSUM_PARTIAL)
			gso_reset_checksum(seg, ~check);
		else
			uh->check = gso_make_checksum(seg, ~check) ? :
				    CSUM_MANGLED_0;

		seg = seg->next;
		uh = udp_hdr(seg);
	}

	/* Unless skb fits perfectly as GSO_PARTIAL, the trailing
	 * segment may not be full MSS, account for that in the checksum
	 */
	if (!skb_is_gso(seg))
		newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) +
			       seg->data_len);
	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);

	uh->len = newlen;
	uh->check = check;

	if (seg->ip_summed == CHECKSUM_PARTIAL)
		gso_reset_checksum(seg, ~check);
	else
		uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;

	/* On the TX path, CHECKSUM_NONE and CHECKSUM_UNNECESSARY have the same
	 * meaning. However, check for bad offloads in the GSO stack expects the
	 * latter, if the checksum was calculated in software. To vouch for the
	 * segment skbs we actually need to set it on the gso_skb.
	 */
	if (gso_skb->ip_summed == CHECKSUM_NONE)
		gso_skb->ip_summed = CHECKSUM_UNNECESSARY;

	/* update refcount for the packet */
	if (copy_dtor) {
		int delta = sum_truesize - gso_skb->truesize;

		/* In some pathological cases, delta can be negative.
		 * We need to either use refcount_add() or refcount_sub_and_test()
		 */
		if (likely(delta >= 0))
			refcount_add(delta, &sk->sk_wmem_alloc);
		else
			WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc));
	}
	return segs;
}

/*
 * gso_segment callback registered for IPPROTO_UDP (see
 * udpv4_offload_init()).  Dispatches to tunnel segmentation for
 * encapsulated UDP-tunnel skbs, to __udp_gso_segment() for SKB_GSO_UDP_L4,
 * and otherwise performs software UFO: the UDP checksum is completed in
 * software and the skb is split with skb_segment().
 * Returns ERR_PTR(-EINVAL) for skbs it cannot handle.
 */
static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
					 netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	unsigned int mss;
	__wsum csum;
	struct udphdr *uh;
	struct iphdr *iph;

	if (skb->encapsulation &&
	    (skb_shinfo(skb)->gso_type &
	     (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
		segs = skb_udp_tunnel_segment(skb, features, false);
		goto out;
	}

	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
		goto out;

	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
		goto out;

	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
		return __udp_gso_segment(skb, features, false);

	mss = skb_shinfo(skb)->gso_size;
	if (unlikely(skb->len <= mss))
		goto out;

	/* Do software UFO. Complete and fill in the UDP checksum as
	 * HW cannot do checksum of UDP packets sent as multiple
	 * IP fragments.
	 */

	uh = udp_hdr(skb);
	iph = ip_hdr(skb);

	uh->check = 0;
	csum = skb_checksum(skb, 0, skb->len, 0);
	uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum);
	if (uh->check == 0)
		uh->check = CSUM_MANGLED_0;

	skb->ip_summed = CHECKSUM_UNNECESSARY;

	/* If there is no outer header we can fake a checksum offload
	 * due to the fact that we have already done the checksum in
	 * software prior to segmenting the frame.
	 */
	if (!skb->encap_hdr_csum)
		features |= NETIF_F_HW_CSUM;

	/* Fragment the skb. IP headers of the fragments are updated in
	 * inet_gso_segment()
	 */
	segs = skb_segment(skb, features);
out:
	return segs;
}

/* Upper bound on NAPI_GRO_CB()->count before an aggregate is completed. */
#define UDP_GRO_CNT_MAX 64
/*
 * Try to merge @skb into a held packet of the same flow (plain UDP, no
 * tunnel).  Only the first same-flow entry whose ports match is considered.
 * Returns the held packet that must be completed now, or NULL when nothing
 * has to be flushed; NAPI_GRO_CB(skb)->flush is set when @skb itself must
 * not be aggregated.
 */
static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
					       struct sk_buff *skb)
{
	struct udphdr *uh = udp_gro_udphdr(skb);
	struct sk_buff *pp = NULL;
	struct udphdr *uh2;
	struct sk_buff *p;
	unsigned int ulen;
	int ret = 0;
	int flush;

	/* requires non zero csum, for symmetry with GSO */
	if (!uh->check) {
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	/* Do not deal with padded or malicious packets, sorry ! */
	ulen = ntohs(uh->len);
	if (ulen <= sizeof(*uh) || ulen != skb_gro_len(skb)) {
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}
	/* pull encapsulating udp header */
	skb_gro_pull(skb, sizeof(struct udphdr));

	list_for_each_entry(p, head, list) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		uh2 = udp_hdr(p);

		/* Match ports only, as csum is always non zero */
		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		/* Fraglist and non-fraglist aggregation cannot be mixed. */
		if (NAPI_GRO_CB(skb)->is_flist != NAPI_GRO_CB(p)->is_flist) {
			NAPI_GRO_CB(skb)->flush = 1;
			return p;
		}

		flush = gro_receive_network_flush(uh, uh2, p);

		/* Terminate the flow on len mismatch or if it grow "too much".
		 * Under small packet flood GRO count could elsewhere grow a lot
		 * leading to excessive truesize values.
		 * On len mismatch merge the first packet shorter than gso_size,
		 * otherwise complete the GRO packet.
		 */
		if (ulen > ntohs(uh2->len) || flush) {
			pp = p;
		} else {
			if (NAPI_GRO_CB(skb)->is_flist) {
				if (!pskb_may_pull(skb, skb_gro_offset(skb))) {
					NAPI_GRO_CB(skb)->flush = 1;
					return NULL;
				}
				if ((skb->ip_summed != p->ip_summed) ||
				    (skb->csum_level != p->csum_level)) {
					NAPI_GRO_CB(skb)->flush = 1;
					return NULL;
				}
				skb_set_network_header(skb, skb_gro_receive_network_offset(skb));
				ret = skb_gro_receive_list(p, skb);
			} else {
				skb_gro_postpull_rcsum(skb, uh,
						       sizeof(struct udphdr));

				ret = skb_gro_receive(p, skb);
			}
		}

		if (ret || ulen != ntohs(uh2->len) ||
		    NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
			pp = p;

		return pp;
	}

	/* mismatch, but we never need to flush */
	return NULL;
}

/*
 * Common UDP GRO receive path.  Without a socket, or with a socket that
 * has no gro_receive callback, the packet is either aggregated as plain
 * UDP by udp_gro_receive_segment() or flushed.  Otherwise held packets
 * whose ports or zero/non-zero checksum state differ are unmarked from the
 * flow, the UDP header is pulled and the tunnel callback is invoked via
 * udp_tunnel_gro_rcv().
 */
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
				struct udphdr *uh, struct sock *sk)
{
	struct sk_buff *pp = NULL;
	struct sk_buff *p;
	struct udphdr *uh2;
	unsigned int off = skb_gro_offset(skb);
	int flush = 1;

	/* We can do L4 aggregation only if the packet can't land in a tunnel
	 * otherwise we could corrupt the inner stream. Detecting such packets
	 * cannot be foolproof and the aggregation might still happen in some
	 * cases. Such packets should be caught in udp_unexpected_gso later.
	 */
	NAPI_GRO_CB(skb)->is_flist = 0;
	if (!sk || !udp_sk(sk)->gro_receive) {
		/* If the packet was locally encapsulated in a UDP tunnel that
		 * wasn't detected above, do not GRO.
		 */
		if (skb->encapsulation)
			goto out;

		if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
			NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1;

		if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
		    (sk && udp_test_bit(GRO_ENABLED, sk)) || NAPI_GRO_CB(skb)->is_flist)
			return call_gro_receive(udp_gro_receive_segment, head, skb);

		/* no GRO, be sure flush the current packet */
		goto out;
	}

	if (NAPI_GRO_CB(skb)->encap_mark ||
	    (uh->check && skb->ip_summed != CHECKSUM_PARTIAL &&
	     NAPI_GRO_CB(skb)->csum_cnt == 0 &&
	     !NAPI_GRO_CB(skb)->csum_valid))
		goto out;

	/* mark that this skb passed once through the tunnel gro layer */
	NAPI_GRO_CB(skb)->encap_mark = 1;

	flush = 0;

	list_for_each_entry(p, head, list) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		uh2 = (struct udphdr   *)(p->data + off);

		/* Match ports and either checksums are either both zero
		 * or nonzero.
		 */
		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) ||
		    (!uh->check ^ !uh2->check)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

	skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
	pp = udp_tunnel_gro_rcv(sk, head, skb);

out:
	skb_gro_flush_final(skb, pp, flush);
	return pp;
}

/*
 * Find the socket for an incoming UDPv4 packet during GRO: use the socket
 * returned by udp_tunnel_sk() when its sk_num equals @dport, otherwise
 * fall back to __udp4_lib_lookup().
 */
static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
					__be16 dport)
{
	const struct iphdr *iph = skb_gro_network_header(skb);
	struct net *net = dev_net_rcu(skb->dev);
	struct sock *sk;
	int iif, sdif;

	sk = udp_tunnel_sk(net, false);
	if (sk && dport == htons(sk->sk_num))
		return sk;

	inet_get_iif_sdif(skb, &iif, &sdif);

	return __udp4_lib_lookup(net, iph->saddr, sport,
				 iph->daddr, dport, iif,
				 sdif, NULL);
}

/*
 * gro_receive callback registered for IPPROTO_UDP.  Validates the UDP
 * checksum unless the skb is already marked for flush, looks up the
 * socket only when udp_encap_needed_key is enabled, and hands over to
 * udp_gro_receive().  A missing header or failed checksum marks the skb
 * for flush and returns NULL.
 */
INDIRECT_CALLABLE_SCOPE
struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
	struct udphdr *uh = udp_gro_udphdr(skb);
	struct sock *sk = NULL;
	struct sk_buff *pp;

	if (unlikely(!uh))
		goto flush;

	/* Don't bother verifying checksum if we're going to flush anyway. */
	if (NAPI_GRO_CB(skb)->flush)
		goto skip;

	if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check,
						 inet_gro_compute_pseudo))
		goto flush;
	else if (uh->check)
		skb_gro_checksum_try_convert(skb, IPPROTO_UDP,
					     inet_gro_compute_pseudo);
skip:
	if (static_branch_unlikely(&udp_encap_needed_key))
		sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest);

	pp = udp_gro_receive(head, skb, uh, sk);
	return pp;

flush:
	NAPI_GRO_CB(skb)->flush = 1;
	return NULL;
}

/*
 * Finish a plain (non-tunnel) UDP aggregate: mark it CHECKSUM_PARTIAL with
 * the checksum located in the UDP header, and record the segment count and
 * SKB_GSO_UDP_L4 in the shared info.  Always returns 0.
 */
static int udp_gro_complete_segment(struct sk_buff *skb)
{
	struct udphdr *uh = udp_hdr(skb);

	skb->csum_start = (unsigned char *)uh - skb->head;
	skb->csum_offset = offsetof(struct udphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;

	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
	skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;

	if (skb->encapsulation)
		skb->inner_transport_header = skb->transport_header;

	return 0;
}

/*
 * Common UDP GRO completion: store the final UDP length, look up the
 * socket through @lookup and either call the socket's gro_complete
 * callback (tunnel case) or udp_gro_complete_segment().  Returns the
 * callee's result.
 */
int udp_gro_complete(struct sk_buff *skb, int nhoff,
		     udp_lookup_t lookup)
{
	__be16 newlen = htons(skb->len - nhoff);
	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
	struct sock *sk;
	int err;

	uh->len = newlen;

	sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
				udp4_lib_lookup_skb, skb, uh->source, uh->dest);
	if (sk && udp_sk(sk)->gro_complete) {
		skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
					: SKB_GSO_UDP_TUNNEL;

		/* clear the encap mark, so that inner frag_list gro_complete
		 * can take place
		 */
		NAPI_GRO_CB(skb)->encap_mark = 0;

		/* Set encapsulation before calling into inner gro_complete()
		 * functions to make them set up the inner offsets.
		 */
		skb->encapsulation = 1;
		err = udp_sk(sk)->gro_complete(sk, skb,
				nhoff + sizeof(struct udphdr));
	} else {
		err = udp_gro_complete_segment(skb);
	}

	if (skb->remcsum_offload)
		skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM;

	return err;
}

/*
 * gro_complete callback registered for IPPROTO_UDP.  Fraglist aggregates
 * without a pending outer UDP encapsulation are finished here directly;
 * everything else gets its pseudo-header checksum seeded (when the
 * checksum is non-zero) and is passed to udp_gro_complete().
 */
INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
{
	const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
	const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);

	/* do fraglist only if there is no outer UDP encap (or we already processed it) */
	if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
		uh->len = htons(skb->len - nhoff);

		skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
		skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;

		__skb_incr_checksum_unnecessary(skb);

		return 0;
	}

	if (uh->check)
		uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
					  iph->daddr, 0);

	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
}

/*
 * Fill in net_hotdata.udpv4_offload with the UDPv4 GSO/GRO callbacks and
 * register it for IPPROTO_UDP.  Returns the result of inet_add_offload().
 */
int __init udpv4_offload_init(void)
{
	net_hotdata.udpv4_offload = (struct net_offload) {
		.callbacks = {
			.gso_segment = udp4_ufo_fragment,
			.gro_receive  =	udp4_gro_receive,
			.gro_complete =	udp4_gro_complete,
		},
	};
	return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP);
}
11 11 11 10 10 10 10 10 10 10 10 10 10 3 3 3 3 3 3 34 33 34 34 9 14 30 30 16 14 14 30 29 35 22 31 31 16 14 14 14 3 3 3 3 3 6 84 81 6 67 67 17 17 9 10 10 10 10 2 3 3 3 3 3 9 9 61 23 4 4 11 61 61 35 31 35 11 11 23 61 61 61 61 61 19 19 61 61 19 19 61 73 74 74 87 1 86 1 85 23 22 23 23 22 6 8 16 15 15 15 6 7 7 7 7 4 7 1 4 4 4 2 2 4 4 2 2 3 3 3 3 3 3 3 3 207 1 207 208 205 2 208 205 208 207 206 58 57 55 3 58 58 58 12 45 21 57 18 18 3 3 3 207 204 206 207 206 208 26 28 28 243 242 242 10 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 
406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 
906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 
1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 
1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 
2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/sha2.h>
#include <crypto/utils.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>

#include "protocol.h"
#include "mib.h"

#include <trace/events/mptcp.h>
#include <trace/events/sock.h>

static void mptcp_subflow_ops_undo_override(struct sock *ssk);

/* Bump MIB counter @field in the network namespace of request socket @req. */
static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

/* Release the MPTCP-specific state of a request socket: drop the reference
 * on subflow_req->msk when one is attached, then let the token code clean up
 * via mptcp_token_destroy_request().
 */
static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p\n", subflow_req);

	if (subflow_req->msk)
		sock_put((struct sock *)subflow_req->msk);

	mptcp_token_destroy_request(req);
}

/* Compute the HMAC used by the MP_JOIN handshake.
 *
 * The message is the 8-byte concatenation of @nonce1 and @nonce2, each
 * stored as a big-endian 32-bit value; it is passed with @key1/@key2 to
 * mptcp_crypto_hmac_sha(). @hmac is the output buffer; every caller visible
 * in this file passes a SHA256_DIGEST_SIZE byte array.
 */
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

/* Return true when @msk is fully established and either the userspace path
 * manager is selected and active, or pm.accept_subflow is set.
 */
static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
	return mptcp_is_fully_established((void *)msk) &&
		((mptcp_pm_is_userspace(msk) &&
		  mptcp_userspace_pm_active(msk)) ||
		 READ_ONCE(msk->pm.accept_subflow));
}

/* Create the local nonce and the truncated hmac for the SYN-ACK.
 *
 * Picks a random local nonce, computes the HMAC keyed with the msk
 * local/remote keys over (local_nonce, remote_nonce) and stores its leading
 * 64 bits, read big-endian, in subflow_req->thmac. subflow_req->msk is
 * dereferenced unconditionally, so it must already be set.
 */
static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req)
{
	struct mptcp_sock *msk = subflow_req->msk;
	u8 hmac[SHA256_DIGEST_SIZE];

	subflow_req->local_nonce = get_random_u32();

	subflow_generate_hmac(READ_ONCE(msk->local_key),
			      READ_ONCE(msk->remote_key),
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);
}

/* Resolve the msk that the token stored in the request refers to.
 *
 * Returns NULL, after counting MPTCP_MIB_JOINNOTOKEN, when no socket matches
 * the token. Also returns NULL when mptcp_pm_get_local_id() fails; in that
 * case the msk is released with sock_put() first. On success the local id
 * and the backup flag reported by the path manager are stored in the request
 * and the msk is returned to the caller.
 */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return NULL;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return NULL;
	}
	subflow_req->local_id = local_id;
	subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req);

	return msk;
}

/* Put the MPTCP part of a request socket into its initial state: no
 * MP_CAPABLE/MP_JOIN negotiated, no msk attached, checksum and join-id0
 * policy taken from the listener's network namespace, token state reset.
 */
static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener));
	subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener));
	subflow_req->msk = NULL;
	mptcp_token_init_request(req);
}

/* True when the source port of @sk differs from the one of @msk. */
static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk)
{
	return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport;
}

/* Attach an MPTCP skb extension to @skb, zero it and store @reason in its
 * reset_reason field. Nothing is recorded when skb_ext_add() returns NULL.
 */
static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
{
	struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP);

	if (mpext) {
		memset(mpext, 0, sizeof(*mpext));
		mpext->reset_reason = reason;
	}
}

/* Reject a request: count MPTCP_MIB_MPCAPABLEENDPATTEMPT, tag @skb with
 * MPTCP_RST_EPROHIBIT and return -EPERM.
 */
static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb)
{
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT);
	subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
	return -EPERM;
}

/* Init mptcp request socket.
 *
 * Parses the MPTCP options of the incoming SYN @skb and fills the MPTCP part
 * of @req accordingly:
 *  - MP_CAPABLE on a listener requesting MPTCP: a non-zero random local key
 *    is generated and a token reserved, retrying up to
 *    MPTCP_TOKEN_MAX_RETRIES times; if that still fails, mp_capable is left
 *    unset and MPTCP_MIB_TOKENFALLBACKINIT is counted.
 *  - MP_JOIN on a listener requesting MPTCP: the msk is looked up from the
 *    token, the port is checked and the truncated HMAC for the SYN-ACK is
 *    prepared.
 *  - a SYN carrying both options returns 0 without setting mp_capable or
 *    mp_join.
 *
 * Returns an error code if a JOIN has failed and a TCP reset
 * should be sent. Errors are also returned for MD5SIG listeners (when
 * CONFIG_TCP_MD5SIG is enabled) and for listeners flagged pm_listener that
 * receive an MP_CAPABLE SYN or a SYN without MP_JOIN. On the error paths the
 * reset reason is recorded on @skb.
 */
static int subflow_check_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	bool opt_mp_capable, opt_mp_join;

	pr_debug("subflow_req=%p, listener=%p\n", subflow_req, listener);

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) {
		subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
		return -EINVAL;
	}
#endif

	mptcp_get_options(skb, &mp_opt);

	opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYN);
	opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN);
	if (opt_mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (unlikely(listener->pm_listener))
			return subflow_reset_req_endp(req, skb);
		if (opt_mp_join)
			return 0;
	} else if (opt_mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);

		if (mp_opt.backup)
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX);
	} else if (unlikely(listener->pm_listener)) {
		return subflow_reset_req_endp(req, skb);
	}

	if (opt_mp_capable && listener->request_mptcp) {
		int err, retries = MPTCP_TOKEN_MAX_RETRIES;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
again:
		/* a zero key is never used: draw again until non-zero */
		do {
			get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
		} while (subflow_req->local_key == 0);

		if (unlikely(req->syncookie)) {
			/* syncookie: only derive token/idsn from the key and
			 * check that the token is not already in use
			 */
			mptcp_crypto_key_sha(subflow_req->local_key,
					     &subflow_req->token,
					     &subflow_req->idsn);
			if (mptcp_token_exists(subflow_req->token)) {
				if (retries-- > 0)
					goto again;
				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);
			} else {
				subflow_req->mp_capable = 1;
			}
			return 0;
		}

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;
		else if (retries-- > 0)
			goto again;
		else
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);

	} else if (opt_mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		subflow_req->msk = subflow_token_join_request(req);

		/* Can't fall back to TCP in this case. */
		if (!subflow_req->msk) {
			subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
			return -EPERM;
		}

		if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
			pr_debug("syn inet_sport=%d %d\n",
				 ntohs(inet_sk(sk_listener)->inet_sport),
				 ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
			if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
				subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
				return -EPERM;
			}
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
		}

		subflow_req_create_thmac(subflow_req);

		if (unlikely(req->syncookie)) {
			if (!mptcp_can_accept_new_subflow(subflow_req->msk)) {
				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED);
				subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
				return -EPERM;
			}

			subflow_init_req_cookie_join_save(subflow_req, skb);
		}

		pr_debug("token=%u, remote_nonce=%u msk=%p\n", subflow_req->token,
			 subflow_req->remote_nonce, subflow_req->msk);
	}

	return 0;
}

/* Initialise the MPTCP part of @req from the MPTCP options carried by @skb
 * (the OPTION_MPTCP_MPC_ACK / OPTION_MPTCP_MPJ_ACK suboptions are checked).
 *
 * Returns -EINVAL when both MP_CAPABLE and MP_JOIN are present, when an
 * MP_CAPABLE option carries a zero sender key, or when the MP_JOIN cookie
 * state cannot be restored; returns the mptcp_token_new_request() error if
 * the token cannot be registered; 0 otherwise. In both accepted cases
 * ssn_offset is set to the skb sequence number minus one.
 */
int mptcp_subflow_init_cookie_req(struct request_sock *req,
				  const struct sock *sk_listener,
				  struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	bool opt_mp_capable, opt_mp_join;
	int err;

	subflow_init_req(req, sk_listener);
	mptcp_get_options(skb, &mp_opt);

	opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK);
	opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK);
	if (opt_mp_capable && opt_mp_join)
		return -EINVAL;

	if (opt_mp_capable && listener->request_mptcp) {
		if (mp_opt.sndr_key == 0)
			return -EINVAL;

		subflow_req->local_key = mp_opt.rcvr_key;
		err = mptcp_token_new_request(req);
		if (err)
			return err;

		subflow_req->mp_capable = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	} else if (opt_mp_join && listener->request_mptcp) {
		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
			return -EINVAL;

		subflow_req->mp_join = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);

/* Translate the MPTCP reset reason stored in the skb extension of @skb into
 * an sk_rst_reason; SK_RST_REASON_NOT_SPECIFIED when no extension is present.
 */
static enum sk_rst_reason mptcp_get_rst_reason(const struct sk_buff *skb)
{
	const struct mptcp_ext *mpext = mptcp_get_ext(skb);

	if (!mpext)
		return SK_RST_REASON_NOT_SPECIFIED;

	return sk_rst_convert_mptcp_reason(mpext->reset_reason);
}

/* IPv4 route_req wrapper: flags the request as MPTCP, initialises it, obtains
 * the route from the plain TCP ops and then runs subflow_check_req().
 *
 * Returns the dst on success. On a check failure the dst is released, a
 * reset is sent unless the request is a syncookie one, and NULL is returned.
 * NULL is also returned when the TCP route lookup itself fails.
 */
static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req,
					      u32 tw_isn)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;
	subflow_init_req(req, sk);

	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn);
	if (!dst)
		return NULL;

	err = subflow_check_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp_request_sock_ops.send_reset(sk, skb,
						mptcp_get_rst_reason(skb));
	return NULL;
}

/* Common SYN-ACK preparation for the v4 and v6 send_synack wrappers: clears
 * tstamp_ok when a fastopen cookie with len > -1 is supplied and lets the
 * fastopen code set the subflow parameters for TCP_SYNACK_FASTOPEN.
 */
static void subflow_prep_synack(const struct sock *sk, struct request_sock *req,
				struct tcp_fastopen_cookie *foc,
				enum tcp_synack_type synack_type)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_request_sock *ireq = inet_rsk(req);

	/* clear tstamp_ok, as needed depending on cookie */
	if (foc && foc->len > -1)
		ireq->tstamp_ok = 0;

	if (synack_type == TCP_SYNACK_FASTOPEN)
		mptcp_fastopen_subflow_synack_set_params(subflow, req);
}

/* IPv4 send_synack wrapper: run subflow_prep_synack(), then defer to the
 * plain TCP IPv4 implementation and return its result.
 */
static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
				  struct flowi *fl,
				  struct request_sock *req,
				  struct tcp_fastopen_cookie *foc,
				  enum tcp_synack_type synack_type,
				  struct sk_buff *syn_skb)
{
	subflow_prep_synack(sk, req, foc, synack_type);

	return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc,
						     synack_type, syn_skb);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* IPv6 counterpart of subflow_v4_send_synack(). */
static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
				  struct flowi *fl,
				  struct request_sock *req,
				  struct tcp_fastopen_cookie *foc,
				  enum tcp_synack_type synack_type,
				  struct sk_buff *syn_skb)
{
	subflow_prep_synack(sk, req, foc, synack_type);

	return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc,
						     synack_type, syn_skb);
}

/* IPv6 counterpart of subflow_v4_route_req(): same flow, using the TCP IPv6
 * route_req and tcp6_request_sock_ops.send_reset().
 */
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req,
					      u32 tw_isn)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;
	subflow_init_req(req, sk);

	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn);
	if (!dst)
		return NULL;

	err = subflow_check_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp6_request_sock_ops.send_reset(sk, skb,
						 mptcp_get_rst_reason(skb));
	return NULL;
}
#endif

/* validate received truncated hmac and create hmac for third ACK
 *
 * Recomputes the HMAC keyed with (remote_key, local_key) over
 * (remote_nonce, local_nonce) and compares its leading 64 bits, read
 * big-endian, with subflow->thmac. Returns true on a match. Only the
 * comparison happens here; nothing is stored in @subflow.
 */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token, thmac, subflow->thmac);

	return thmac == subflow->thmac;
}

/* Reset subflow @ssk: send an active reset, mark the TCP socket done and make
 * sure the msk worker is scheduled to close the subflow (only if
 * MPTCP_WORK_CLOSE_SUBFLOW was not already set). A reference on the parent
 * socket is held for the duration of the call. Does nothing when @ssk is
 * already in TCP_CLOSE state.
 */
void mptcp_subflow_reset(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;

	/* mptcp_mp_fail_no_response() can reach here on an already closed
	 * socket
	 */
	if (ssk->sk_state == TCP_CLOSE)
		return;

	/* must hold: tcp_done() could drop last reference on parent */
sock_hold(sk);

	mptcp_send_active_reset_reason(ssk);
	tcp_done(ssk);
	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags))
		mptcp_schedule_work(sk);

	sock_put(sk);
}

/* True when the destination port of @sk differs from the one of @msk. */
static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk)
{
	return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport;
}

/* Bring the msk @sk in line with its first subflow (msk->first).
 *
 * The send buffer is always propagated. Only while @sk is still in
 * TCP_SYN_SENT are write_seq and snd_nxt initialised to subflow->idsn + 1,
 * the msk moved to @state and sk_state_change() invoked.
 */
void __mptcp_sync_state(struct sock *sk, int state)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk = msk->first;

	subflow = mptcp_subflow_ctx(ssk);
	__mptcp_propagate_sndbuf(sk, ssk);

	if (sk->sk_state == TCP_SYN_SENT) {
		/* subflow->idsn is always available in TCP_SYN_SENT state,
		 * even for the FASTOPEN scenarios
		 */
		WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
		WRITE_ONCE(msk->snd_nxt, msk->write_seq);
		mptcp_set_state(sk, state);
		sk->sk_state_change(sk);
	}
}

/* Record the peer key carried by @mp_opt, once per subflow.
 *
 * The first call stores mp_opt->sndr_key as the subflow remote key, derives
 * the initial ack sequence (iasn) from it and increments it, and publishes
 * the key, ack_seq, can_ack and rcv_wnd_sent on @msk. Later calls return
 * early because remote_key_valid is already set.
 */
static void subflow_set_remote_key(struct mptcp_sock *msk,
				   struct mptcp_subflow_context *subflow,
				   const struct mptcp_options_received *mp_opt)
{
	/* active MPC subflow will reach here multiple times:
	 * at subflow_finish_connect() time and at 4th ack time
	 */
	if (subflow->remote_key_valid)
		return;

	subflow->remote_key_valid = 1;
	subflow->remote_key = mp_opt->sndr_key;
	mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn);
	subflow->iasn++;

	/* for fallback's sake */
	subflow->map_seq = subflow->iasn;

	WRITE_ONCE(msk->remote_key, subflow->remote_key);
	WRITE_ONCE(msk->ack_seq, subflow->iasn);
	WRITE_ONCE(msk->can_ack, true);
	atomic64_set(&msk->rcv_wnd_sent, subflow->iasn);
}

/* Propagate the state of subflow @ssk to the msk @sk under the msk data lock.
 *
 * @mp_opt may be NULL (fallback): in that case the snd_una/wnd_end/remote
 * key updates are skipped. The state itself is synced immediately when the
 * msk socket is not owned by the user; otherwise it is parked in
 * pending_state and MPTCP_SYNC_STATE is set in cb_flags.
 */
static void mptcp_propagate_state(struct sock *sk, struct sock *ssk,
				  struct mptcp_subflow_context *subflow,
				  const struct mptcp_options_received *mp_opt)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	mptcp_data_lock(sk);
	if (mp_opt) {
		/* Options are available only in the non fallback cases
		 * avoid updating rx path fields otherwise
		 */
		WRITE_ONCE(msk->snd_una, subflow->idsn + 1);
		WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd);
subflow_set_remote_key(msk, subflow, mp_opt);
	}

	if (!sock_owned_by_user(sk)) {
		__mptcp_sync_state(sk, ssk->sk_state);
	} else {
		msk->pending_state = ssk->sk_state;
		__set_bit(MPTCP_SYNC_STATE, &msk->cb_flags);
	}
	mptcp_data_unlock(sk);
}

/* Finish an actively opened subflow once a packet (@skb) arrives.
 *
 * Always forwards to the original sk_rx_dst_set(). MPTCP-specific work is
 * done only once per subflow, guarded by conn_finished:
 *  - request_mptcp: without an MP_CAPABLE SYN-ACK option the connection
 *    falls back to TCP (or is reset if the fallback is refused); with it the
 *    subflow becomes mp_capable and the state is propagated to the msk
 *    together with the received options.
 *  - request_join: the MP_JOIN SYN-ACK is required and its truncated HMAC is
 *    verified; on success the join is finished and the HMAC to be sent back
 *    is stored in subflow->hmac.
 *  - otherwise, if the socket is already in fallback mode, the state is
 *    propagated without options.
 * The do_reset path clears reset_transient and resets the subflow.
 */
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	msk = mptcp_sk(parent);
	subflow->rel_write_seq = 1;
	subflow->conn_finished = 1;
	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
	pr_debug("subflow=%p synack seq=%x\n", subflow, subflow->ssn_offset);

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp) {
		if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) {
			if (!mptcp_try_fallback(sk,
						MPTCP_MIB_MPCAPABLEACTIVEFALLBACK)) {
				MPTCP_INC_STATS(sock_net(sk),
						MPTCP_MIB_FALLBACKFAILED);
				goto do_reset;
			}

			goto fallback;
		}

		if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD)
			WRITE_ONCE(msk->csum_enabled, true);
		if (mp_opt.deny_join_id0)
			WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
		subflow->mp_capable = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
		mptcp_finish_connect(sk);
		mptcp_active_enable(parent);
		mptcp_propagate_state(parent, sk, subflow, &mp_opt);
	} else if (subflow->request_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYNACK)) {
			subflow->reset_reason = MPTCP_RST_EMPTCP;
			goto do_reset;
		}

		subflow->backup = mp_opt.backup;
		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		WRITE_ONCE(subflow->remote_id, mp_opt.join_id);
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d\n",
			 subflow, subflow->thmac, subflow->remote_nonce,
			 subflow->backup);

		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC);
			subflow->reset_reason = MPTCP_RST_EMPTCP;
			goto do_reset;
		}

		if (!mptcp_finish_join(sk))
			goto do_reset;

		/* only the first MPTCPOPT_HMAC_LEN bytes of the digest are kept */
		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);
		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		subflow->mp_join = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);

		if (subflow->backup)
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX);

		if (subflow_use_different_dport(msk, sk)) {
			pr_debug("synack inet_dport=%d %d\n",
				 ntohs(inet_sk(sk)->inet_dport),
				 ntohs(inet_sk(parent)->inet_dport));
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
		}
	} else if (mptcp_check_fallback(sk)) {
		/* It looks like MPTCP is blocked, while TCP is not */
		if (subflow->mpc_drop)
			mptcp_active_disable(parent);
fallback:
		mptcp_propagate_state(parent, sk, subflow, NULL);
	}
	return;

do_reset:
	subflow->reset_transient = 0;
	mptcp_subflow_reset(sk);
}

/* Store @local_id in the subflow; warns once if it is outside 0..255. */
static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id)
{
	WARN_ON_ONCE(local_id < 0 || local_id > 255);
	WRITE_ONCE(subflow->local_id, local_id);
}

/* Make sure the subflow has a local address id.
 *
 * Returns 0 right away when local_id is already non-negative. Otherwise the
 * id is requested from the path manager: a negative result is returned as
 * the error, a valid one is stored together with the backup flag.
 */
static int subflow_chk_local_id(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	int err;

	if (likely(subflow->local_id >= 0))
		return 0;

	err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
	if (err < 0)
		return err;

	subflow_set_local_id(subflow, err);
	subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk);

	return 0;
}

/* Resolve the local id first, then call inet_sk_rebuild_header(). */
static int subflow_rebuild_header(struct sock *sk)
{
	int err = subflow_chk_local_id(sk);

	if (unlikely(err < 0))
		return err;

	return inet_sk_rebuild_header(sk);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* IPv6 variant: resolve the local id, then call inet6_sk_rebuild_header(). */
static int subflow_v6_rebuild_header(struct sock *sk)
{
	int err = subflow_chk_local_id(sk);

	if (unlikely(err < 0))
		return err;

	return inet6_sk_rebuild_header(sk);
}
#endif

static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init;

/* Handle an incoming IPv4 connection request on an MPTCP listener subflow:
 * SYNs whose route is flagged broadcast or multicast are counted as listen
 * drops, everything else goes to tcp_conn_request() with the MPTCP request
 * sock ops. The drop path returns 0.
 */
static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p\n", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

/* IPv4 request sock destructor: MPTCP cleanup first, then the TCP one. */
static void subflow_v4_req_destructor(struct request_sock *req)
{
	subflow_req_destructor(req);
	tcp_request_sock_ops.destructor(req);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init;
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init;
static struct proto tcpv6_prot_override __ro_after_init;

/* Handle an incoming connection request on an IPv6 MPTCP listener subflow.
 * IPv4 packets are redirected to subflow_v4_conn_request(); non-unicast
 * destinations are counted as listen drops; v4-mapped source addresses bump
 * IPSTATS_MIB_INHDRERRORS and are ignored; everything else goes to
 * tcp_conn_request() with the IPv6 MPTCP request sock ops.
 */
static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p\n", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
		__IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
		return 0;
	}

	return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}

/* IPv6 request sock destructor: MPTCP cleanup first, then the TCPv6 one. */
static void subflow_v6_req_destructor(struct request_sock *req)
{
	subflow_req_destructor(req);
	tcp6_request_sock_ops.destructor(req);
}
#endif

/* Allocate a request socket for an MPTCP listener.
 *
 * The caller's @ops are swapped for the MPTCP request sock ops matching
 * ops->family (AF_INET, plus AF_INET6 when CONFIG_MPTCP_IPV6 is enabled)
 * before calling inet_reqsk_alloc(), whose result is returned.
 */
struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops,
					       struct sock *sk_listener,
					       bool attach_listener)
{
	if (ops->family == AF_INET)
		ops = &mptcp_subflow_v4_request_sock_ops;
#if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ops->family == AF_INET6) ops = &mptcp_subflow_v6_request_sock_ops; #endif return inet_reqsk_alloc(ops, sk_listener, attach_listener); } EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); /* validate hmac received in third ACK */ static bool subflow_hmac_valid(const struct mptcp_subflow_request_sock *subflow_req, const struct mptcp_options_received *mp_opt) { struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; subflow_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), subflow_req->remote_nonce, subflow_req->local_nonce, hmac); return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); } static void subflow_ulp_fallback(struct sock *sk, struct mptcp_subflow_context *old_ctx) { struct inet_connection_sock *icsk = inet_csk(sk); mptcp_subflow_tcp_fallback(sk, old_ctx); icsk->icsk_ulp_ops = NULL; rcu_assign_pointer(icsk->icsk_ulp_data, NULL); tcp_sk(sk)->is_mptcp = 0; mptcp_subflow_ops_undo_override(sk); } void mptcp_subflow_drop_ctx(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); if (!ctx) return; list_del(&mptcp_subflow_ctx(ssk)->node); if (inet_csk(ssk)->icsk_ulp_ops) { subflow_ulp_fallback(ssk, ctx); if (ctx->conn) sock_put(ctx->conn); } kfree_rcu(ctx, rcu); } void __mptcp_subflow_fully_established(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) { subflow_set_remote_key(msk, subflow, mp_opt); WRITE_ONCE(subflow->fully_established, true); WRITE_ONCE(msk->fully_established, true); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req, void (*opt_child_init)(struct sock *newsk, const struct sock *sk)) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; struct mptcp_options_received mp_opt; bool fallback, 
fallback_is_fatal; enum sk_rst_reason reason; struct mptcp_sock *owner; struct sock *child; pr_debug("listener=%p, req=%p, conn=%p\n", listener, req, listener->conn); /* After child creation we must look for MPC even when options * are not parsed */ mp_opt.suboptions = 0; /* hopefully temporary handling for MP_JOIN+syncookie */ subflow_req = mptcp_subflow_rsk(req); fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join; fallback = !tcp_rsk(req)->is_mptcp; if (fallback) goto create_child; /* if the sk is MP_CAPABLE, we try to fetch the client key */ if (subflow_req->mp_capable) { /* we can receive and accept an in-window, out-of-order pkt, * which may not carry the MP_CAPABLE opt even on mptcp enabled * paths: always try to extract the peer key, and fallback * for packets missing it. * Even OoO DSS packets coming legitly after dropped or * reordered MPC will cause fallback, but we don't have other * options. */ mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK))) fallback = true; } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK)) fallback = true; } create_child: child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, own_req, opt_child_init); if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); tcp_rsk(req)->drop_req = false; /* we need to fallback on ctx allocation failure and on pre-reqs * checking above. In the latter scenario we additionally need * to reset the context to non MPTCP status. 
*/ if (!ctx || fallback) { if (fallback_is_fatal) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; } goto fallback; } /* ssk inherits options of listener sk */ ctx->setsockopt_seq = listener->setsockopt_seq; if (ctx->mp_capable) { ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req); if (!ctx->conn) goto fallback; ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); if (mp_opt.deny_join_id0) WRITE_ONCE(owner->pm.remote_deny_join_id0, true); mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress * mpc option */ if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) { mptcp_pm_fully_established(owner, child); ctx->pm_notified = 1; } } else if (ctx->mp_join) { owner = subflow_req->msk; if (!owner) { subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } if (!subflow_hmac_valid(subflow_req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; } if (!mptcp_can_accept_new_subflow(owner)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; ctx->conn = (struct sock *)owner; if (subflow_use_different_sport(owner, sk)) { pr_debug("ack inet_sport=%d %d\n", ntohs(inet_sk(sk)->inet_sport), ntohs(inet_sk((struct sock *)owner)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(owner, sk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); } if (!mptcp_finish_join(child)) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(child); subflow_add_reset_reason(skb, subflow->reset_reason); goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; } } /* check for expected invariant - should 
never trigger, just help * catching earlier subtle bugs */ WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp && (!mptcp_subflow_ctx(child) || !mptcp_subflow_ctx(child)->conn)); return child; dispose_child: mptcp_subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; inet_csk_prepare_for_destroy_sock(child); tcp_done(child); reason = mptcp_get_rst_reason(skb); req->rsk_ops->send_reset(sk, skb, reason); /* The last child reference will be released by the caller */ return child; fallback: if (fallback) SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); mptcp_subflow_drop_ctx(child); return child; } static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; static struct proto tcp_prot_override __ro_after_init; enum mapping_status { MAPPING_OK, MAPPING_INVALID, MAPPING_EMPTY, MAPPING_DATA_FIN, MAPPING_DUMMY, MAPPING_BAD_CSUM, MAPPING_NODSS }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d\n", ssn, subflow->map_subflow_seq, subflow->map_data_len); } static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned int skb_consumed; skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; if (unlikely(skb_consumed >= skb->len)) { DEBUG_NET_WARN_ON_ONCE(1); return true; } return skb->len - skb_consumed <= subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); } static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; if (unlikely(before(ssn, subflow->map_subflow_seq))) { /* Mapping covers data later in the subflow stream, * currently unsupported. 
*/ dbg_bad_map(subflow, ssn); return false; } if (unlikely(!before(ssn, subflow->map_subflow_seq + subflow->map_data_len))) { /* Mapping does covers past subflow data, invalid */ dbg_bad_map(subflow, ssn); return false; } return true; } static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, bool csum_reqd) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 offset, seq, delta; __sum16 csum; int len; if (!csum_reqd) return MAPPING_OK; /* mapping already validated on previous traversal */ if (subflow->map_csum_len == subflow->map_data_len) return MAPPING_OK; /* traverse the receive queue, ensuring it contains a full * DSS mapping and accumulating the related csum. * Preserve the accoumlate csum across multiple calls, to compute * the csum only once */ delta = subflow->map_data_len - subflow->map_csum_len; for (;;) { seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; offset = seq - TCP_SKB_CB(skb)->seq; /* if the current skb has not been accounted yet, csum its contents * up to the amount covered by the current DSS */ if (offset < skb->len) { __wsum csum; len = min(skb->len - offset, delta); csum = skb_checksum(skb, offset, len, 0); subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, subflow->map_csum_len); delta -= len; subflow->map_csum_len += len; } if (delta == 0) break; if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { /* if this subflow is closed, the partial mapping * will be never completed; flush the pending skbs, so * that subflow_sched_work_if_closed() can kick in */ if (unlikely(ssk->sk_state == TCP_CLOSE)) while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); /* not enough data to validate the csum */ return MAPPING_EMPTY; } /* the DSS mapping for next skbs will be validated later, * when a get_mapping_status call will process such skb */ skb = skb->next; } /* note that 'map_data_len' accounts only for the carried data, does * not include the eventual seq increment 
due to the data fin,
	 * while the pseudo header requires the original DSS data len,
	 * including that
	 */
	csum = __mptcp_make_csum(subflow->map_seq,
				 subflow->map_subflow_seq,
				 subflow->map_data_len + subflow->map_data_fin,
				 subflow->map_data_csum);
	/* a non-zero result is reported as a checksum failure */
	if (unlikely(csum)) {
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR);
		return MAPPING_BAD_CSUM;
	}

	subflow->valid_csum_seen = 1;
	return MAPPING_OK;
}

/* Inspect the skb at the head of the @ssk receive queue and the MPTCP
 * extension it carries, if any, and report the resulting mapping state.
 *
 * Returns MAPPING_EMPTY when the queue is empty (or only held a 0-length
 * skb, which is eaten), MAPPING_DUMMY when the subflow is in fallback mode,
 * MAPPING_NODSS when data shows up with neither a new nor a current mapping;
 * the remaining outcomes depend on the DSS contents handled below.
 */
static enum mapping_status get_mapping_status(struct sock *ssk,
					      struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	bool csum_reqd = READ_ONCE(msk->csum_enabled);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	if (mptcp_check_fallback(ssk))
		return MAPPING_DUMMY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0 len FIN pkts to the receive
			 * queue; those are the only 0 len pkts ever expected here,
			 * and we can admit no mapping only for 0 len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		/* If the required DSS has likely been dropped by a middlebox */
		if (!subflow->map_valid)
			return MAPPING_NODSS;

		/* no new map on this skb: check it against the current one */
		goto validate_seq;
	}

	trace_get_mapping_status(mpext);

	data_len = mpext->data_len;
	/* a zero data length is handled as an infinite mapping */
	if (data_len == 0) {
		pr_debug("infinite mapping received\n");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		u64 data_fin_seq;

		/* data_len == 1 with data_fin set: DATA_FIN without payload */
		if (data_len == 1) {
			bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
								 mpext->dsn64);
			pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq);
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
*/ skb_ext_del(skb, SKB_EXT_MPTCP); return MAPPING_OK; } if (updated) mptcp_schedule_work((struct sock *)msk); return MAPPING_DATA_FIN; } data_fin_seq = mpext->data_seq + data_len - 1; /* If mpext->data_seq is a 32-bit value, data_fin_seq must also * be limited to 32 bits. */ if (!mpext->dsn64) data_fin_seq &= GENMASK_ULL(31, 0); mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n", data_fin_seq, mpext->dsn64); /* Adjust for DATA_FIN using 1 byte of sequence space */ data_len--; } map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64); WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { /* Allow replacing only with an identical map */ if (subflow->map_seq == map_seq && subflow->map_subflow_seq == mpext->subflow_seq && subflow->map_data_len == data_len && subflow->map_csum_reqd == mpext->csum_reqd) { skb_ext_del(skb, SKB_EXT_MPTCP); goto validate_csum; } /* If this skb data are fully covered by the current mapping, * the new map would need caching, which is not supported */ if (skb_is_fully_mapped(ssk, skb)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH); return MAPPING_INVALID; } /* will validate the next map after consuming the current one */ goto validate_csum; } subflow->map_seq = map_seq; subflow->map_subflow_seq = mpext->subflow_seq; subflow->map_data_len = data_len; subflow->map_valid = 1; subflow->map_data_fin = mpext->data_fin; subflow->mpc_map = mpext->mpc_map; subflow->map_csum_reqd = mpext->csum_reqd; subflow->map_csum_len = 0; subflow->map_data_csum = csum_unfold(mpext->csum); /* Cfr RFC 8684 Section 3.3.0 */ if (unlikely(subflow->map_csum_reqd != csum_reqd)) return MAPPING_INVALID; pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n", subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len, subflow->map_csum_reqd, subflow->map_data_csum); validate_seq: /* we revalidate valid mapping on new 
skb, because we must ensure * the current skb is completely covered by the available mapping */ if (!validate_mapping(ssk, skb)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH); return MAPPING_INVALID; } skb_ext_del(skb, SKB_EXT_MPTCP); validate_csum: return validate_data_csum(ssk, skb, csum_reqd); } static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, u64 limit) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; struct tcp_sock *tp = tcp_sk(ssk); u32 offset, incr, avail_len; offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; if (WARN_ON_ONCE(offset > skb->len)) goto out; avail_len = skb->len - offset; incr = limit >= avail_len ? avail_len + fin : limit; pr_debug("discarding=%d len=%d offset=%d seq=%d\n", incr, skb->len, offset, subflow->map_subflow_seq); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); tcp_sk(ssk)->copied_seq += incr; out: if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) subflow->map_valid = 0; } static bool subflow_is_done(const struct sock *sk) { return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; } /* sched mptcp worker for subflow cleanup if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; if (likely(ssk->sk_state != TCP_CLOSE && (ssk->sk_state != TCP_CLOSE_WAIT || inet_sk_state_load(sk) != TCP_ESTABLISHED))) return; if (!skb_queue_empty(&ssk->sk_receive_queue)) return; if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) mptcp_schedule_work(sk); /* when the fallback subflow closes the rx side, trigger a 'dummy' * ingress data fin, so that the msk state will follow along */ if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) && msk->first == 
ssk &&
	    mptcp_update_rcv_data_fin(msk, subflow->map_seq +
				      subflow->map_data_len, true))
		mptcp_schedule_work(sk);
}

/* Start the MP_FAIL handling on @ssk.
 *
 * Returns false when infinite fallback is no longer allowed or when @ssk is
 * not the first subflow of @msk; returns true otherwise. Unless the msk is
 * already flagged SOCK_DEAD, a fail timeout of TCP_RTO_MAX from now is
 * stored in the subflow, an ack is sent on @ssk and the msk timeout timer
 * is re-armed with that value.
 */
static bool mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned long fail_tout;

	/* we are really failing, prevent any later subflow join */
	spin_lock_bh(&msk->fallback_lock);
	if (!msk->allow_infinite_fallback) {
		spin_unlock_bh(&msk->fallback_lock);
		return false;
	}
	msk->allow_subflows = false;
	spin_unlock_bh(&msk->fallback_lock);

	/* graceful failure can happen only on the MPC subflow */
	if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
		return false;

	/* since the close timeout takes precedence over the fail one,
	 * no need to start the latter when the first is already set
	 */
	if (sock_flag((struct sock *)msk, SOCK_DEAD))
		return true;

	/* we don't need extreme accuracy here, use a zero fail_tout as special
	 * value meaning no fail timeout at all;
	 */
	fail_tout = jiffies + TCP_RTO_MAX;
	/* keep clear of the reserved zero value if the sum wraps onto it */
	if (!fail_tout)
		fail_tout = 1;
	WRITE_ONCE(subflow->fail_tout, fail_tout);
	tcp_send_ack(ssk);

	mptcp_reset_tout_timer(msk, subflow->fail_tout);
	return true;
}

/* Check whether @ssk has in-sequence, mapped data ready for the msk.
 * Updates subflow->data_avail accordingly.
 */
static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	/* an empty receive queue always clears the cached flag */
	if (!skb_peek(&ssk->sk_receive_queue))
		WRITE_ONCE(subflow->data_avail, false);
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk, msk);
		trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue));
		/* these outcomes lead to fallback (or reset) handling */
		if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
			     status == MAPPING_BAD_CSUM || status == MAPPING_NODSS))
			goto fallback;

		if (status != MAPPING_OK)
			goto no_data;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			goto no_data;

		if (unlikely(!READ_ONCE(msk->can_ack)))
			goto fallback;

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq =
mptcp_subflow_get_mapped_dsn(subflow); pr_debug("msk ack_seq=%llx subflow ack_seq=%llx\n", old_ack, ack_seq); if (unlikely(before64(ack_seq, old_ack))) { mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); continue; } WRITE_ONCE(subflow->data_avail, true); break; } return true; no_data: subflow_sched_work_if_closed(msk, ssk); return false; fallback: if (!__mptcp_check_fallback(msk)) { /* RFC 8684 section 3.7. */ if (status == MAPPING_BAD_CSUM && (subflow->mp_join || subflow->valid_csum_seen)) { subflow->send_mp_fail = 1; if (!mptcp_subflow_fail(msk, ssk)) { subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; goto reset; } WRITE_ONCE(subflow->data_avail, true); return true; } if (!mptcp_try_fallback(ssk, MPTCP_MIB_DSSFALLBACK)) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ subflow->reset_transient = 0; subflow->reset_reason = status == MAPPING_NODSS ? MPTCP_RST_EMIDDLEBOX : MPTCP_RST_EMPTCP; reset: WRITE_ONCE(ssk->sk_err, EBADMSG); tcp_set_state(ssk, TCP_CLOSE); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); mptcp_send_active_reset_reason(ssk); WRITE_ONCE(subflow->data_avail, false); return false; } } skb = skb_peek(&ssk->sk_receive_queue); subflow->map_valid = 1; subflow->map_data_len = skb->len; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; subflow->map_seq = __mptcp_expand_seq(subflow->map_seq, subflow->iasn + TCP_SKB_CB(skb)->seq - subflow->ssn_offset - 1); WRITE_ONCE(subflow->data_avail, true); return true; } bool mptcp_subflow_data_available(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* check if current mapping is still valid */ if (subflow->map_valid && mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { subflow->map_valid = 0; WRITE_ONCE(subflow->data_avail, false); pr_debug("Done with mapping: seq=%u data_len=%u\n", subflow->map_subflow_seq, 
subflow->map_data_len); } return subflow_check_data_avail(sk); } /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, * not the ssk one. * * In mptcp, rwin is about the mptcp-level connection data. * * Data that is still on the ssk rx queue can thus be ignored, * as far as mptcp peer is concerned that data is still inflight. * DSS ACK is updated when skb is moved to the mptcp rx queue. */ void mptcp_space(const struct sock *ssk, int *space, int *full_space) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); const struct sock *sk = subflow->conn; *space = __mptcp_space(sk); *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } static void subflow_error_report(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; /* bail early if this is a no-op, so that we avoid introducing a * problematic lockdep dependency between TCP accept queue lock * and msk socket spinlock */ if (!sk->sk_socket) return; mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); u16 state = 1 << inet_sk_state_load(sk); struct sock *parent = subflow->conn; struct mptcp_sock *msk; trace_sk_data_ready(sk); msk = mptcp_sk(parent); if (state & TCPF_LISTEN) { /* MPJ subflow are removed from accept queue before reaching here, * avoid stray wakeups */ if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) return; parent->sk_data_ready(parent); return; } WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && !subflow->mp_join && !(state & TCPF_CLOSE)); if (mptcp_subflow_data_available(sk)) { mptcp_data_ready(parent, sk); /* subflow-level lowat test are not relevant. 
* respect the msk-level threshold eventually mandating an immediate ack */ if (mptcp_data_avail(msk) < parent->sk_rcvlowat && (tcp_sk(sk)->rcv_nxt - tcp_sk(sk)->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } else if (unlikely(sk->sk_err)) { subflow_error_report(sk); } } static void subflow_write_space(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; mptcp_propagate_sndbuf(sk, ssk); mptcp_write_space(sk); } static const struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (sk->sk_family == AF_INET6) return &subflow_v6_specific; #endif return &subflow_specific; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock_af_ops *target; target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d\n", subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); if (likely(icsk->icsk_af_ops == target)) return; subflow->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = target; } #endif void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family) { memset(addr, 0, sizeof(*addr)); addr->ss_family = family; if (addr->ss_family == AF_INET) { struct sockaddr_in *in_addr = (struct sockaddr_in *)addr; if (info->family == AF_INET) in_addr->sin_addr = info->addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ipv6_addr_v4mapped(&info->addr6)) in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3]; #endif in_addr->sin_port = info->port; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr; if (info->family == AF_INET) ipv6_addr_set_v4mapped(info->addr.s_addr, &in6_addr->sin6_addr); 
else in6_addr->sin6_addr = info->addr6; in6_addr->sin6_port = info->port; } #endif } int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, const struct mptcp_addr_info *remote) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; int local_id = local->addr.id; struct sockaddr_storage addr; int remote_id = remote->id; int err = -ENOTCONN; struct socket *sf; struct sock *ssk; u32 remote_token; int addrlen; /* The userspace PM sent the request too early? */ if (!mptcp_is_fully_established(sk)) goto err_out; err = mptcp_subflow_create_socket(sk, local->addr.family, &sf); if (err) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCREATSKERR); pr_debug("msk=%p local=%d remote=%d create sock error: %d\n", msk, local_id, remote_id, err); goto err_out; } ssk = sf->sk; subflow = mptcp_subflow_ctx(ssk); do { subflow->local_nonce = get_random_u32(); } while (!subflow->local_nonce); /* if 'IPADDRANY', the ID will be set later, after the routing */ if (local->addr.family == AF_INET) { if (!local->addr.addr.s_addr) local_id = -1; #if IS_ENABLED(CONFIG_MPTCP_IPV6) } else if (sk->sk_family == AF_INET6) { if (ipv6_addr_any(&local->addr.addr6)) local_id = -1; #endif } if (local_id >= 0) subflow_set_local_id(subflow, local_id); subflow->remote_key_valid = 1; subflow->remote_key = READ_ONCE(msk->remote_key); subflow->local_key = READ_ONCE(msk->local_key); subflow->token = msk->token; mptcp_info2sockaddr(&local->addr, &addr, ssk->sk_family); addrlen = sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif ssk->sk_bound_dev_if = local->ifindex; err = kernel_bind(sf, (struct sockaddr_unsized *)&addr, addrlen); if (err) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR); pr_debug("msk=%p local=%d remote=%d bind error: %d\n", msk, local_id, remote_id, err); goto failed; } mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); 
pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d\n", msk,
		 remote_token, local_id, remote_id);
	subflow->remote_token = remote_token;
	WRITE_ONCE(subflow->remote_id, remote_id);
	subflow->request_join = 1;
	subflow->request_bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
	subflow->subflow_id = msk->subflow_id++;
	mptcp_info2sockaddr(remote, &addr, ssk->sk_family);

	/* this reference is dropped in the failed_unlink error path below */
	sock_hold(ssk);
	list_add_tail(&subflow->node, &msk->conn_list);
	err = kernel_connect(sf, (struct sockaddr_unsized *)&addr, addrlen, O_NONBLOCK);
	/* -EINPROGRESS is the expected outcome of a non-blocking connect */
	if (err && err != -EINPROGRESS) {
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR);
		pr_debug("msk=%p local=%d remote=%d connect error: %d\n",
			 msk, local_id, remote_id, err);
		goto failed_unlink;
	}

	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTX);

	/* discard the subflow socket */
	mptcp_sock_graft(ssk, sk->sk_socket);
	iput(SOCK_INODE(sf));
	mptcp_stop_tout_timer(sk);
	return 0;

failed_unlink:
	list_del(&subflow->node);
	sock_put(mptcp_subflow_tcp_sock(subflow));

failed:
	subflow->disposable = 1;
	sock_release(sf);

err_out:
	/* we account subflows before the creation, and these failures will not
	 * be caught by sk_state_change()
	 */
	mptcp_pm_close_subflow(msk);

	return err;
}

/* Make @ssk inherit the memory cgroup of @sk and charge it, using @gfp
 * for the charge. No-op when socket memcg accounting is disabled or when
 * @sk has no attached struct socket.
 */
void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp)
{
	/* Only if the msk has been accepted already (and not orphaned).*/
	if (!mem_cgroup_sockets_enabled || !sk->sk_socket)
		return;

	mem_cgroup_sk_inherit(sk, ssk);
	__sk_charge(ssk, gfp);
}

/* Copy the cgroup data of @sk into @ssk when the two sockets belong to
 * different cgroups. Compiled out without CONFIG_SOCK_CGROUP_DATA.
 */
void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
	struct sock_cgroup_data *sk_cd = &sk->sk_cgrp_data,
				*ssk_cd = &ssk->sk_cgrp_data;

	/* only the additional subflows created by kworkers have to be modified */
	if (cgroup_id(sock_cgroup_ptr(sk_cd)) !=
	    cgroup_id(sock_cgroup_ptr(ssk_cd))) {
		/* release the old cgroup data before overwriting it */
		cgroup_sk_free(ssk_cd);
		*ssk_cd = *sk_cd;
		cgroup_sk_clone(sk_cd);
	}
#endif /* CONFIG_SOCK_CGROUP_DATA */
}

static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
{
__mptcp_inherit_cgrp_data(parent, child); if (mem_cgroup_sockets_enabled) mem_cgroup_sk_inherit(parent, child); } static void mptcp_subflow_ops_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (ssk->sk_prot == &tcpv6_prot) ssk->sk_prot = &tcpv6_prot_override; else #endif ssk->sk_prot = &tcp_prot_override; } static void mptcp_subflow_ops_undo_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (ssk->sk_prot == &tcpv6_prot_override) ssk->sk_prot = &tcpv6_prot; else #endif ssk->sk_prot = &tcp_prot; } int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, struct socket **new_sock) { struct mptcp_subflow_context *subflow; struct net *net = sock_net(sk); struct socket *sf; int err; /* un-accepted server sockets can reach here - on bad configuration * bail early to avoid greater trouble later */ if (unlikely(!sk->sk_socket)) return -EINVAL; err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf); if (err) return err; lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); err = security_mptcp_add_subflow(sk, sf->sk); if (err) goto err_free; /* the newly created socket has to be in the same cgroup as its parent */ mptcp_attach_cgroup(sk, sf->sk); /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. * Update ns_tracker to current stack trace and refcounted tracker. */ sk_net_refcnt_upgrade(sf->sk); err = tcp_set_ulp(sf->sk, "mptcp"); if (err) goto err_free; mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk); release_sock(sf->sk); /* the newly created socket really belongs to the owning MPTCP * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. Adjust inode references, so that the * procfs/diag interfaces really show this one belonging to the correct * user. 
*/
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p\n", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;
	mptcp_subflow_ops_override(sf->sk);

	return 0;

err_free:
	release_sock(sf->sk);
	sock_release(sf);
	return err;
}

/* Allocate a subflow context for @sk with the given allocation @priority,
 * publish it as the socket's ULP data and initialise its list nodes.
 * The local id starts out as -1.
 *
 * Returns the new context, or NULL if the allocation fails.
 */
static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc_obj(*ctx, priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);
	INIT_LIST_HEAD(&ctx->delegated_node);

	pr_debug("subflow=%p\n", ctx);

	ctx->tcp_sock = sk;
	WRITE_ONCE(ctx->local_id, -1);

	return ctx;
}

/* Wake every interruptible sleeper on @sk's wait queue, if there is any.
 * The wait queue pointer is dereferenced under rcu_read_lock().
 */
static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

/* sk_state_change callback installed on subflows by subflow_ulp_init().
 * Wakes sleepers on the subflow, then either signals available data to the
 * parent (subflow->conn) or, when sk_err is set, reports the error; finally
 * lets subflow_sched_work_if_closed() inspect the subflow.
 */
static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
	else if (unlikely(sk->sk_err))
		subflow_error_report(sk);

	subflow_sched_work_if_closed(mptcp_sk(parent), sk);
}

/* Force-close every MPTCP socket still pending in the accept queue of
 * @listener_ssk.
 *
 * The accept queue is detached under rskq_lock, @listener_ssk is released
 * while the pending msk sockets are processed one by one, and then
 * @listener_ssk is locked again and the queue is restored.  When the queue
 * is empty the function returns early without touching the socket lock.
 * Per the comments below, the caller is expected to hold both the
 * @listener_sk and the @listener_ssk socket locks.
 */
void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
{
	struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
	struct request_sock *req, *head, *tail;
	struct mptcp_subflow_context *subflow;
	struct sock *sk, *ssk;

	/* Due to lock dependencies no relevant lock can be acquired under rskq_lock.
	 * Splice the req list, so that accept() can not reach the pending ssk after
	 * the listener socket is released below.
	 */
	spin_lock_bh(&queue->rskq_lock);
	head = queue->rskq_accept_head;
	tail = queue->rskq_accept_tail;
	queue->rskq_accept_head = NULL;
	queue->rskq_accept_tail = NULL;
	spin_unlock_bh(&queue->rskq_lock);

	if (!head)
		return;

	/* can't acquire the msk socket lock under the subflow one,
	 * or will cause ABBA deadlock
	 */
	release_sock(listener_ssk);

	for (req = head; req; req = req->dl_next) {
		ssk = req->sk;
		/* skip pending sockets that are not MPTCP subflows */
		if (!sk_is_mptcp(ssk))
			continue;

		subflow = mptcp_subflow_ctx(ssk);
		if (!subflow || !subflow->conn)
			continue;

		/* pin the msk across the close and the work cancellation */
		sk = subflow->conn;
		sock_hold(sk);

		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
		__mptcp_unaccepted_force_close(sk);
		release_sock(sk);

		/* lockdep will report a false positive ABBA deadlock
		 * between cancel_work_sync and the listener socket.
		 * The involved locks belong to different sockets WRT
		 * the existing AB chain.
		 * Using a per socket key is problematic as key
		 * deregistration requires process context and must be
		 * performed at socket disposal time, in atomic
		 * context.
		 * Just tell lockdep to consider the listener socket
		 * released here.
		 */
		mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
		mptcp_cancel_work(sk);
		mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_);

		sock_put(sk);
	}

	/* we are still under the listener msk socket lock */
	lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);

	/* restore the listener queue, to let the TCP code clean it up */
	spin_lock_bh(&queue->rskq_lock);
	WARN_ON_ONCE(queue->rskq_accept_head);
	queue->rskq_accept_head = head;
	queue->rskq_accept_tail = tail;
	spin_unlock_bh(&queue->rskq_lock);
}

/* ULP ->init hook (see subflow_ulp_ops): attach a subflow context to @sk,
 * mark the TCP socket as MPTCP and replace its af_ops and socket callbacks
 * with the subflow variants.  The original af_ops, state_change and
 * error_report pointers are saved in the context.
 *
 * Returns 0 on success, -EOPNOTSUPP when @sk is not a kernel socket and
 * -ENOMEM when the context cannot be allocated.
 */
static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d\n", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_error_report = sk->sk_error_report;

	/* data_ready/write_space are overridden without being saved, so they
	 * are expected to still be the default handlers at this point
	 */
	WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable);
	WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space);

	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
	sk->sk_error_report = subflow_error_report;
out:
	return err;
}

/* ULP ->release hook (see subflow_ulp_ops): drop the reference held on the
 * parent socket, if any, undo the ops override on @ssk and free the context
 * via kfree_rcu() unless it must be kept alive (see the comment below).
 * Does nothing when @ssk has no subflow context.
 */
static void subflow_ulp_release(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
	bool release = true;
	struct sock *sk;

	if (!ctx)
		return;

	sk = ctx->conn;
	if (sk) {
		/* if the msk has been orphaned, keep the ctx
		 * alive, will be freed by __mptcp_close_ssk(),
		 * when the subflow is still unaccepted
		 */
		release = ctx->disposable || list_empty(&ctx->node);

		/* inet_child_forget() does not call sk_state_change(),
		 * explicitly trigger the socket close machinery
		 */
		if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW,
						  &mptcp_sk(sk)->flags))
			mptcp_schedule_work(sk);
		sock_put(sk);
	}

	mptcp_subflow_ops_undo_override(ssk);
	if (release)
		kfree_rcu(ctx, rcu);
}

/* ULP ->clone hook (see subflow_ulp_ops): set up the subflow context of the
 * child socket @newsk from the request socket @req.
 *
 * Falls back via subflow_ulp_fallback() when the request is not an MPTCP
 * one, carries neither mp_capable nor mp_join, or when the new context
 * cannot be allocated with @priority.
 */
static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	/* inherit the saved TCP callbacks from the context found on newsk */
	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_error_report = old_ctx->tcp_error_report;
	new_ctx->rel_write_seq = 1;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;

		/* this is the first subflow, id is always 0 */
		subflow_set_local_id(new_ctx, 0);
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		WRITE_ONCE(new_ctx->fully_established, true);
		new_ctx->remote_key_valid = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->request_bkup = subflow_req->request_bkup;
		WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id);
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;

		/* the subflow req id is valid, fetched via subflow_check_req()
		 * and subflow_token_join_request()
		 */
		subflow_set_local_id(new_ctx, subflow_req->local_id);
	}
}

/* release_cb installed in the tcp_prot overrides: atomically fetch and clear
 * the pending delegated actions of @ssk, process them if any were set, then
 * chain to the regular tcp_release_cb().
 */
static void tcp_release_cb_override(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	long status;

	/* process and clear all the pending actions, but leave the subflow into
	 * the napi queue. To respect locking, only the same CPU that originated
	 * the action can touch the list. mptcp_napi_poll will take care of it.
	 */
	status = set_mask_bits(&subflow->delegated_status,
			       MPTCP_DELEGATE_ACTIONS_MASK, 0);
	if (status)
		mptcp_subflow_process_delegated(ssk, status);

	tcp_release_cb(ssk);
}

/* diag_destroy installed in the tcp_prot overrides: refuse to abort a
 * subflow in TCP_LISTEN state with -EINVAL, otherwise defer to tcp_abort().
 */
static int tcp_abort_override(struct sock *ssk, int err)
{
	/* closing a listener subflow requires a great deal of care.
	 * keep it simple and just prevent such operation
	 */
	if (inet_sk_state_load(ssk) == TCP_LISTEN)
		return -EINVAL;

	return tcp_abort(ssk, err);
}

/* ULP operations registered under the name "mptcp" by mptcp_subflow_init() */
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

/* Size @subflow_ops for struct mptcp_subflow_request_sock and create its
 * slab cache, named after subflow_ops->slab_name (which the caller must
 * have set beforehand).
 *
 * Returns 0 on success or -ENOMEM if the cache cannot be created.
 */
static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	return 0;
}

/* Boot-time setup of the IPv4 subflow machinery: derive the request sock
 * ops, the af_ops and the tcp_prot override from their plain TCP
 * counterparts, then register the "mptcp" ULP.  Any failure panics.
 */
void __init mptcp_subflow_init(void)
{
	mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops;
	mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4";
	mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor;

	if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow v4 request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;
	subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_specific.rebuild_header = subflow_rebuild_header;

	tcp_prot_override = tcp_prot;
	tcp_prot_override.release_cb = tcp_release_cb_override;
	tcp_prot_override.diag_destroy = tcp_abort_override;
#ifdef CONFIG_BPF_SYSCALL
	/* Disable sockmap processing for subflows */
	tcp_prot_override.psock_update_sk_prot = NULL;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* Boot-time setup of the IPv6 subflow machinery, mirroring
 * mptcp_subflow_init(): request sock ops, native v6 af_ops, the v4-mapped
 * variant (subflow_v6m_specific) and the tcpv6_prot override.
 * Panics if the request sock slab cannot be created.
 */
void __init mptcp_subflow_v6_init(void)
{
	/* In struct mptcp_subflow_request_sock, we assume the TCP request sock
	 * structures for v4 and v6 have the same size. It should not change in
	 * the future but better to make sure to be warned if it is no longer
	 * the case.
	 */
	BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock));

	mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops;
	mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6";
	mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor;

	if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow v6 request sock ops\n");

	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
	subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header;

	/* v4-mapped variant: start from the v6 ops, then borrow the
	 * transmit-side fields from ipv4_specific
	 */
	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.rebuild_header = subflow_rebuild_header;

	tcpv6_prot_override = tcpv6_prot;
	tcpv6_prot_override.release_cb = tcp_release_cb_override;
	tcpv6_prot_override.diag_destroy = tcp_abort_override;
#ifdef CONFIG_BPF_SYSCALL
	/* Disable sockmap processing for subflows */
	tcpv6_prot_override.psock_update_sk_prot = NULL;
#endif
}
#endif
9 9 3 3 3 1 1 8 8 8 1 1 1 8 7 8 8 4 8 7 7 7 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 // SPDX-License-Identifier: GPL-2.0-only /* * net/psample/psample.c - Netlink channel for packet sampling * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/module.h> #include <linux/timekeeping.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/psample.h> #include <linux/spinlock.h> #include <net/ip_tunnels.h> #include <net/dst_metadata.h> #define PSAMPLE_MAX_PACKET_SIZE 0xffff static LIST_HEAD(psample_groups_list); static DEFINE_SPINLOCK(psample_groups_lock); /* multicast groups */ enum psample_nl_multicast_groups { PSAMPLE_NL_MCGRP_CONFIG, PSAMPLE_NL_MCGRP_SAMPLE, }; static const struct genl_multicast_group psample_nl_mcgrps[] = { [PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME }, [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME, .flags = GENL_MCAST_CAP_NET_ADMIN, }, }; static struct genl_family psample_nl_family __ro_after_init; static int psample_group_nl_fill(struct sk_buff *msg, struct psample_group *group, enum psample_command cmd, u32 portid, u32 seq, int flags) { void *hdr; int ret; hdr = genlmsg_put(msg, portid, seq, &psample_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; ret = nla_put_u32(msg, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num); if (ret < 0) goto error; ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_REFCOUNT, group->refcount); if (ret < 0) goto error; ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_SEQ, group->seq); if (ret < 0) goto error; genlmsg_end(msg, hdr); return 0; error: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct psample_group *group; int start = cb->args[0]; int idx = 0; int err; spin_lock_bh(&psample_groups_lock); list_for_each_entry(group, &psample_groups_list, 
list) { if (!net_eq(group->net, sock_net(msg->sk))) continue; if (idx < start) { idx++; continue; } err = psample_group_nl_fill(msg, group, PSAMPLE_CMD_NEW_GROUP, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) break; idx++; } spin_unlock_bh(&psample_groups_lock); cb->args[0] = idx; return msg->len; } static const struct genl_small_ops psample_nl_ops[] = { { .cmd = PSAMPLE_CMD_GET_GROUP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = psample_nl_cmd_get_group_dumpit, /* can be retrieved by unprivileged users */ } }; static struct genl_family psample_nl_family __ro_after_init = { .name = PSAMPLE_GENL_NAME, .version = PSAMPLE_GENL_VERSION, .maxattr = PSAMPLE_ATTR_MAX, .netnsok = true, .module = THIS_MODULE, .mcgrps = psample_nl_mcgrps, .small_ops = psample_nl_ops, .n_small_ops = ARRAY_SIZE(psample_nl_ops), .resv_start_op = PSAMPLE_CMD_GET_GROUP + 1, .n_mcgrps = ARRAY_SIZE(psample_nl_mcgrps), }; static void psample_group_notify(struct psample_group *group, enum psample_command cmd) { struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return; err = psample_group_nl_fill(msg, group, cmd, 0, 0, NLM_F_MULTI); if (!err) genlmsg_multicast_netns(&psample_nl_family, group->net, msg, 0, PSAMPLE_NL_MCGRP_CONFIG, GFP_ATOMIC); else nlmsg_free(msg); } static struct psample_group *psample_group_create(struct net *net, u32 group_num) { struct psample_group *group; group = kzalloc_obj(*group, GFP_ATOMIC); if (!group) return NULL; group->net = net; group->group_num = group_num; list_add_tail(&group->list, &psample_groups_list); psample_group_notify(group, PSAMPLE_CMD_NEW_GROUP); return group; } static void psample_group_destroy(struct psample_group *group) { psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP); list_del(&group->list); kfree_rcu(group, rcu); } static struct psample_group * psample_group_lookup(struct net *net, u32 group_num) { struct psample_group *group; list_for_each_entry(group, 
&psample_groups_list, list) if ((group->group_num == group_num) && (group->net == net)) return group; return NULL; } struct psample_group *psample_group_get(struct net *net, u32 group_num) { struct psample_group *group; spin_lock_bh(&psample_groups_lock); group = psample_group_lookup(net, group_num); if (!group) { group = psample_group_create(net, group_num); if (!group) goto out; } group->refcount++; out: spin_unlock_bh(&psample_groups_lock); return group; } EXPORT_SYMBOL_GPL(psample_group_get); void psample_group_take(struct psample_group *group) { spin_lock_bh(&psample_groups_lock); group->refcount++; spin_unlock_bh(&psample_groups_lock); } EXPORT_SYMBOL_GPL(psample_group_take); void psample_group_put(struct psample_group *group) { spin_lock_bh(&psample_groups_lock); if (--group->refcount == 0) psample_group_destroy(group); spin_unlock_bh(&psample_groups_lock); } EXPORT_SYMBOL_GPL(psample_group_put); #ifdef CONFIG_INET static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { unsigned short tun_proto = ip_tunnel_info_af(tun_info); const void *tun_opts = ip_tunnel_info_opts(tun_info); const struct ip_tunnel_key *tun_key = &tun_info->key; int tun_opts_len = tun_info->options_len; if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags) && nla_put_be64(skb, PSAMPLE_TUNNEL_KEY_ATTR_ID, tun_key->tun_id, PSAMPLE_TUNNEL_KEY_ATTR_PAD)) return -EMSGSIZE; if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE)) return -EMSGSIZE; switch (tun_proto) { case AF_INET: if (tun_key->u.ipv4.src && nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->u.ipv4.src)) return -EMSGSIZE; if (tun_key->u.ipv4.dst && nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->u.ipv4.dst)) return -EMSGSIZE; break; case AF_INET6: if (!ipv6_addr_any(&tun_key->u.ipv6.src) && nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC, &tun_key->u.ipv6.src)) return -EMSGSIZE; if 
(!ipv6_addr_any(&tun_key->u.ipv6.dst) && nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST, &tun_key->u.ipv6.dst)) return -EMSGSIZE; break; } if (tun_key->tos && nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TOS, tun_key->tos)) return -EMSGSIZE; if (nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TTL, tun_key->ttl)) return -EMSGSIZE; if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_CSUM)) return -EMSGSIZE; if (tun_key->tp_src && nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC, tun_key->tp_src)) return -EMSGSIZE; if (tun_key->tp_dst && nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, tun_key->tp_dst)) return -EMSGSIZE; if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; if (tun_opts_len) { if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_key->tun_flags) && nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, tun_opts_len, tun_opts)) return -EMSGSIZE; else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_key->tun_flags) && nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, tun_opts_len, tun_opts)) return -EMSGSIZE; } return 0; } static int psample_ip_tun_to_nlattr(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { struct nlattr *nla; int err; nla = nla_nest_start_noflag(skb, PSAMPLE_ATTR_TUNNEL); if (!nla) return -EMSGSIZE; err = __psample_ip_tun_to_nlattr(skb, tun_info); if (err) { nla_nest_cancel(skb, nla); return err; } nla_nest_end(skb, nla); return 0; } static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) { unsigned short tun_proto = ip_tunnel_info_af(tun_info); const struct ip_tunnel_key *tun_key = &tun_info->key; int tun_opts_len = tun_info->options_len; int sum = nla_total_size(0); /* PSAMPLE_ATTR_TUNNEL */ if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags)) sum += nla_total_size_64bit(sizeof(u64)); if 
(tun_info->mode & IP_TUNNEL_INFO_BRIDGE) sum += nla_total_size(0); switch (tun_proto) { case AF_INET: if (tun_key->u.ipv4.src) sum += nla_total_size(sizeof(u32)); if (tun_key->u.ipv4.dst) sum += nla_total_size(sizeof(u32)); break; case AF_INET6: if (!ipv6_addr_any(&tun_key->u.ipv6.src)) sum += nla_total_size(sizeof(struct in6_addr)); if (!ipv6_addr_any(&tun_key->u.ipv6.dst)) sum += nla_total_size(sizeof(struct in6_addr)); break; } if (tun_key->tos) sum += nla_total_size(sizeof(u8)); sum += nla_total_size(sizeof(u8)); /* TTL */ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (tun_key->tp_src) sum += nla_total_size(sizeof(u16)); if (tun_key->tp_dst) sum += nla_total_size(sizeof(u16)); if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (tun_opts_len) { if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_key->tun_flags)) sum += nla_total_size(tun_opts_len); else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_key->tun_flags)) sum += nla_total_size(tun_opts_len); } return sum; } #endif void psample_sample_packet(struct psample_group *group, const struct sk_buff *skb, u32 sample_rate, const struct psample_metadata *md) { ktime_t tstamp = ktime_get_real(); int out_ifindex = md->out_ifindex; int in_ifindex = md->in_ifindex; u32 trunc_size = md->trunc_size; #ifdef CONFIG_INET struct ip_tunnel_info *tun_info; #endif struct sk_buff *nl_skb; int data_len; int meta_len; void *data; int ret; if (!genl_has_listeners(&psample_nl_family, group->net, PSAMPLE_NL_MCGRP_SAMPLE)) return; meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) + (out_ifindex ? nla_total_size(sizeof(u16)) : 0) + (md->out_tc_valid ? nla_total_size(sizeof(u16)) : 0) + (md->out_tc_occ_valid ? nla_total_size_64bit(sizeof(u64)) : 0) + (md->latency_valid ? 
nla_total_size_64bit(sizeof(u64)) : 0) + nla_total_size(sizeof(u32)) + /* sample_rate */ nla_total_size(sizeof(u32)) + /* orig_size */ nla_total_size(sizeof(u32)) + /* group_num */ nla_total_size(sizeof(u32)) + /* seq */ nla_total_size_64bit(sizeof(u64)) + /* timestamp */ nla_total_size(sizeof(u16)) + /* protocol */ (md->user_cookie_len ? nla_total_size(md->user_cookie_len) : 0) + /* user cookie */ (md->rate_as_probability ? nla_total_size(0) : 0); /* rate as probability */ #ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); if (tun_info) meta_len += psample_tunnel_meta_len(tun_info); #endif data_len = min(skb->len, trunc_size); if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN - NLA_ALIGNTO; nl_skb = genlmsg_new(meta_len + nla_total_size(data_len), GFP_ATOMIC); if (unlikely(!nl_skb)) return; data = genlmsg_put(nl_skb, 0, 0, &psample_nl_family, 0, PSAMPLE_CMD_SAMPLE); if (unlikely(!data)) goto error; if (in_ifindex) { ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_IIFINDEX, in_ifindex); if (unlikely(ret < 0)) goto error; } if (out_ifindex) { ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OIFINDEX, out_ifindex); if (unlikely(ret < 0)) goto error; } ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_RATE, sample_rate); if (unlikely(ret < 0)) goto error; ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_ORIGSIZE, skb->len); if (unlikely(ret < 0)) goto error; ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num); if (unlikely(ret < 0)) goto error; ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_GROUP_SEQ, group->seq++); if (unlikely(ret < 0)) goto error; if (md->out_tc_valid) { ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OUT_TC, md->out_tc); if (unlikely(ret < 0)) goto error; } if (md->out_tc_occ_valid) { ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_OUT_TC_OCC, md->out_tc_occ, PSAMPLE_ATTR_PAD); if (unlikely(ret < 0)) goto error; } if (md->latency_valid) { ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_LATENCY, md->latency, 
PSAMPLE_ATTR_PAD); if (unlikely(ret < 0)) goto error; } ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_TIMESTAMP, ktime_to_ns(tstamp), PSAMPLE_ATTR_PAD); if (unlikely(ret < 0)) goto error; ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_PROTO, be16_to_cpu(skb->protocol)); if (unlikely(ret < 0)) goto error; if (data_len) { int nla_len = nla_total_size(data_len); struct nlattr *nla; nla = skb_put(nl_skb, nla_len); nla->nla_type = PSAMPLE_ATTR_DATA; nla->nla_len = nla_attr_size(data_len); if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) goto error; } #ifdef CONFIG_INET if (tun_info) { ret = psample_ip_tun_to_nlattr(nl_skb, tun_info); if (unlikely(ret < 0)) goto error; } #endif if (md->user_cookie && md->user_cookie_len && nla_put(nl_skb, PSAMPLE_ATTR_USER_COOKIE, md->user_cookie_len, md->user_cookie)) goto error; if (md->rate_as_probability && nla_put_flag(nl_skb, PSAMPLE_ATTR_SAMPLE_PROBABILITY)) goto error; genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC); return; error: pr_err_ratelimited("Could not create psample log message\n"); nlmsg_free(nl_skb); } EXPORT_SYMBOL_GPL(psample_sample_packet); static int __init psample_module_init(void) { return genl_register_family(&psample_nl_family); } static void __exit psample_module_exit(void) { genl_unregister_family(&psample_nl_family); } module_init(psample_module_init); module_exit(psample_module_exit); MODULE_AUTHOR("Yotam Gigi <yotam.gi@gmail.com>"); MODULE_DESCRIPTION("netlink channel for packet sampling"); MODULE_LICENSE("GPL v2");
250 250 248 1 247 250 250 250 1 117 194 196 151 182 135 1 40 40 40 40 37 3 170 137 137 122 122 137 137 151 135 137 151 151 16 135 137 134 150 35 35 35 35 104 104 59 56 2 2 2 33 52 21 33 248 3 1 1 1 1 1 250 40 40 39 40 40 37 36 250 249 210 40 36 3 37 40 6 34 40 40 40 31 249 18 199 34 3 3 3 3 3 6 216 34 34 34 34 34 224 224 223 223 224 51 52 52 41 12 199 198 199 3 30 9 9 3 191 190 14 191 190 9 2 183 35 150 169 169 170 35 137 170 170 169 170 75 117 85 116 60 116 5 146 118 137 186 41 146 340 312 3 3 18 156 156 137 137 136 137 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 
396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 
896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 
1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 
1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 
2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 
2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 
2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 
3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 
3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 
4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 
4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 
4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/signal.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-11-02  Modified for POSIX.1b signals by Richard Henderson
 *
 *  2003-06-02  Jim Houston - Concurrent Computer Corp.
 *		Changes to use preallocated sigqueue structures
 *		to allow signals to be sent reliably.
*/ #include <linux/slab.h> #include <linux/export.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/user.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/coredump.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/signalfd.h> #include <linux/ratelimit.h> #include <linux/task_work.h> #include <linux/capability.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/uprobes.h> #include <linux/compat.h> #include <linux/cn_proc.h> #include <linux/compiler.h> #include <linux/posix-timers.h> #include <linux/cgroup.h> #include <linux/audit.h> #include <linux/sysctl.h> #include <uapi/linux/pidfd.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> #include <asm/param.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/siginfo.h> #include <asm/cacheflush.h> #include <asm/syscall.h> /* for syscall_get_* */ #include "time/posix-timers.h" /* * SLAB caches for signal bits. */ static struct kmem_cache *sigqueue_cachep; int print_fatal_signals __read_mostly; static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; } static inline bool sig_handler_ignored(void __user *handler, int sig) { /* Is it explicitly or implicitly ignored? 
*/ return handler == SIG_IGN || (handler == SIG_DFL && sig_kernel_ignore(sig)); } static bool sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; handler = sig_handler(t, sig); /* SIGKILL and SIGSTOP may not be sent to the global init */ if (unlikely(is_global_init(t) && sig_kernel_only(sig))) return true; if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL && !(force && sig_kernel_only(sig))) return true; /* Only allow kernel generated signals to this kthread */ if (unlikely((t->flags & PF_KTHREAD) && (handler == SIG_KTHREAD_KERNEL) && !force)) return true; return sig_handler_ignored(handler, sig); } static bool sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the * signal handler may change by the time it is * unblocked. */ if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return false; /* * Tracers may want to know about even ignored signal unless it * is SIGKILL which can't be reported anyway but can be ignored * by SIGNAL_UNKILLABLE task. */ if (t->ptrace && sig != SIGKILL) return false; return sig_task_ignored(t, sig, force); } /* * Re-calculate pending state from the set of locally pending * signals, globally pending signals, and blocked signals. 
*/ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) { unsigned long ready; long i; switch (_NSIG_WORDS) { default: for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) ready |= signal->sig[i] &~ blocked->sig[i]; break; case 4: ready = signal->sig[3] &~ blocked->sig[3]; ready |= signal->sig[2] &~ blocked->sig[2]; ready |= signal->sig[1] &~ blocked->sig[1]; ready |= signal->sig[0] &~ blocked->sig[0]; break; case 2: ready = signal->sig[1] &~ blocked->sig[1]; ready |= signal->sig[0] &~ blocked->sig[0]; break; case 1: ready = signal->sig[0] &~ blocked->sig[0]; } return ready != 0; } #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) static bool recalc_sigpending_tsk(struct task_struct *t) { if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked) || cgroup_task_frozen(t)) { set_tsk_thread_flag(t, TIF_SIGPENDING); return true; } /* * We must never clear the flag in another thread, or in current * when it's possible the current syscall is returning -ERESTART*. * So we don't clear it here, and only callers who know they should do. */ return false; } void recalc_sigpending(void) { if (!recalc_sigpending_tsk(current) && !freezing(current)) { if (unlikely(test_thread_flag(TIF_SIGPENDING))) clear_thread_flag(TIF_SIGPENDING); } } EXPORT_SYMBOL(recalc_sigpending); void calculate_sigpending(void) { /* Have any signals or users of TIF_SIGPENDING been delayed * until after fork? */ spin_lock_irq(&current->sighand->siglock); set_tsk_thread_flag(current, TIF_SIGPENDING); recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); } /* Given the mask, find the first available signal that should be serviced. 
*/ #define SYNCHRONOUS_MASK \ (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; int sig = 0; s = pending->signal.sig; m = mask->sig; /* * Handle the first word specially: it contains the * synchronous signals that need to be dequeued first. */ x = *s &~ *m; if (x) { if (x & SYNCHRONOUS_MASK) x &= SYNCHRONOUS_MASK; sig = ffz(~x) + 1; return sig; } switch (_NSIG_WORDS) { default: for (i = 1; i < _NSIG_WORDS; ++i) { x = *++s &~ *++m; if (!x) continue; sig = ffz(~x) + i*_NSIG_BPW + 1; break; } break; case 2: x = s[1] &~ m[1]; if (!x) break; sig = ffz(~x) + _NSIG_BPW + 1; break; case 1: /* Nothing to do */ break; } return sig; } static inline void print_dropped_signal(int sig) { static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); if (!print_fatal_signals) return; if (!__ratelimit(&ratelimit_state)) return; pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", current->comm, current->pid, sig); } /** * task_set_jobctl_pending - set jobctl pending bits * @task: target task * @mask: pending bits to set * * Clear @mask from @task->jobctl. @mask must be subset of * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK | * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is * cleared. If @task is already being killed or exiting, this function * becomes noop. * * CONTEXT: * Must be called with @task->sighand->siglock held. * * RETURNS: * %true if @mask is set, %false if made noop because @task was dying. 
*/ bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK)); if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING))) return false; if (mask & JOBCTL_STOP_SIGMASK) task->jobctl &= ~JOBCTL_STOP_SIGMASK; task->jobctl |= mask; return true; } /** * task_clear_jobctl_trapping - clear jobctl trapping bit * @task: target task * * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED. * Clear it and wake up the ptracer. Note that we don't need any further * locking. @task->siglock guarantees that @task->parent points to the * ptracer. * * CONTEXT: * Must be called with @task->sighand->siglock held. */ void task_clear_jobctl_trapping(struct task_struct *task) { if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { task->jobctl &= ~JOBCTL_TRAPPING; smp_mb(); /* advised by wake_up_bit() */ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); } } /** * task_clear_jobctl_pending - clear jobctl pending bits * @task: target task * @mask: pending bits to clear * * Clear @mask from @task->jobctl. @mask must be subset of * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other * STOP bits are cleared together. * * If clearing of @mask leaves no stop or trap pending, this function calls * task_clear_jobctl_trapping(). * * CONTEXT: * Must be called with @task->sighand->siglock held. */ void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~JOBCTL_PENDING_MASK); if (mask & JOBCTL_STOP_PENDING) mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED; task->jobctl &= ~mask; if (!(task->jobctl & JOBCTL_PENDING_MASK)) task_clear_jobctl_trapping(task); } /** * task_participate_group_stop - participate in a group stop * @task: task participating in a group stop * * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. 
* Group stop states are cleared and the group stop count is consumed if * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group * stop, the appropriate `SIGNAL_*` flags are set. * * CONTEXT: * Must be called with @task->sighand->siglock held. * * RETURNS: * %true if group stop completion should be notified to the parent, %false * otherwise. */ static bool task_participate_group_stop(struct task_struct *task) { struct signal_struct *sig = task->signal; bool consume = task->jobctl & JOBCTL_STOP_CONSUME; WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)); task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING); if (!consume) return false; if (!WARN_ON_ONCE(sig->group_stop_count == 0)) sig->group_stop_count--; /* * Tell the caller to notify completion iff we are entering into a * fresh group stop. Read comment in do_signal_stop() for details. */ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED); return true; } return false; } void task_join_group_stop(struct task_struct *task) { unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK; struct signal_struct *sig = current->signal; if (sig->group_stop_count) { sig->group_stop_count++; mask |= JOBCTL_STOP_CONSUME; } else if (!(sig->flags & SIGNAL_STOP_STOPPED)) return; /* Have the new thread join an on-going signal group stop */ task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } static struct ucounts *sig_get_ucounts(struct task_struct *t, int sig, int override_rlimit) { struct ucounts *ucounts; long sigpending; /* * Protect access to @t credentials. This can go away when all * callers hold rcu read lock. * * NOTE! A pending signal will hold on to the user refcount, * and we get/put the refcount only when the sigpending count * changes from/to zero. 
*/ rcu_read_lock(); ucounts = task_ucounts(t); sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, override_rlimit); rcu_read_unlock(); if (!sigpending) return NULL; if (unlikely(!override_rlimit && sigpending > task_rlimit(t, RLIMIT_SIGPENDING))) { dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); print_dropped_signal(sig); return NULL; } return ucounts; } static void __sigqueue_init(struct sigqueue *q, struct ucounts *ucounts, const unsigned int sigqueue_flags) { INIT_LIST_HEAD(&q->list); q->flags = sigqueue_flags; q->ucounts = ucounts; } /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue *sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, int override_rlimit) { struct ucounts *ucounts = sig_get_ucounts(t, sig, override_rlimit); struct sigqueue *q; if (!ucounts) return NULL; q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); if (!q) { dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); return NULL; } __sigqueue_init(q, ucounts, 0); return q; } static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) { posixtimer_sigqueue_putref(q); return; } if (q->ucounts) { dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING); q->ucounts = NULL; } kmem_cache_free(sigqueue_cachep, q); } void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; sigemptyset(&queue->signal); while (!list_empty(&queue->list)) { q = list_entry(queue->list.next, struct sigqueue , list); list_del_init(&q->list); __sigqueue_free(q); } } /* * Flush all pending signals for this kthread. 
*/
void flush_signals(struct task_struct *t)
{
	unsigned long flags;

	/*
	 * Under siglock: drop the pending flag, then empty both the
	 * per-thread queue and the queue shared via t->signal.
	 */
	spin_lock_irqsave(&t->sighand->siglock, flags);
	clear_tsk_thread_flag(t, TIF_SIGPENDING);
	flush_sigqueue(&t->pending);
	flush_sigqueue(&t->signal->shared_pending);
	spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
EXPORT_SYMBOL(flush_signals);

/*
 * Set every handler slot of @t to SIG_IGN, then discard whatever is
 * already queued via flush_signals().
 *
 * NOTE(review): the handler table is written here without taking
 * siglock - confirm against callers that this is intended.
 */
void ignore_signals(struct task_struct *t)
{
	int i;

	for (i = 0; i < _NSIG; ++i)
		t->sighand->action[i].sa.sa_handler = SIG_IGN;

	flush_signals(t);
}

/*
 * Flush all handlers for a task.
 *
 * Each of the _NSIG actions gets its flags zeroed, its mask emptied and,
 * where the architecture has one, its restorer cleared.  The handler is
 * reset to SIG_DFL, except that SIG_IGN is kept when @force_default is
 * zero.
 */

void
flush_signal_handlers(struct task_struct *t, int force_default)
{
	int i;
	struct k_sigaction *ka = &t->sighand->action[0];
	for (i = _NSIG ; i != 0 ; i--) {
		if (force_default || ka->sa.sa_handler != SIG_IGN)
			ka->sa.sa_handler = SIG_DFL;
		ka->sa.sa_flags = 0;
#ifdef __ARCH_HAS_SA_RESTORER
		ka->sa.sa_restorer = NULL;
#endif
		sigemptyset(&ka->sa.sa_mask);
		ka++;
	}
}

/*
 * Tell whether @sig has no user-installed handler that will deal with it
 * in @tsk.  Always true for the global init.  False when a handler other
 * than SIG_IGN/SIG_DFL is installed, or when a fatal signal is already
 * pending.  Otherwise true only if the task is not being ptraced.
 */
bool unhandled_signal(struct task_struct *tsk, int sig)
{
	void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
	if (is_global_init(tsk))
		return true;

	if (handler != SIG_IGN && handler != SIG_DFL)
		return false;

	/* If dying, we handle all new signals by ignoring them */
	if (fatal_signal_pending(tsk))
		return false;

	/* if ptraced, let the tracer determine */
	return !tsk->ptrace;
}

/*
 * Take @sig out of the pending set @list and fill in @info for it.
 *
 * The first queued entry whose si_signo matches is unlinked and copied
 * to @info.  The pending bit is cleared only when no second entry for
 * the same signal is queued.  A preallocated SI_TIMER entry is handed
 * back through @timer_sigq instead of being freed.  When nothing is
 * queued for @sig, @info is synthesized as SI_USER with pid and uid 0.
 */
static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info,
			   struct sigqueue **timer_sigq)
{
	struct sigqueue *q, *first = NULL;

	/*
	 * Collect the siginfo appropriate to this signal.  Check if
	 * there is another siginfo for the same signal.
	*/
	list_for_each_entry(q, &list->list, list) {
		if (q->info.si_signo == sig) {
			/* A second match: skip sigdelset() so the bit stays set. */
			if (first)
				goto still_pending;
			first = q;
		}
	}

	sigdelset(&list->signal, sig);

	if (first) {
still_pending:
		list_del_init(&first->list);
		copy_siginfo(info, &first->info);

		/*
		 * posix-timer signals are preallocated and freed when the last
		 * reference count is dropped in posixtimer_deliver_signal() or
		 * immediately on timer deletion when the signal is not pending.
		 * Spare the extra round through __sigqueue_free() which is
		 * ignoring preallocated signals.
		 */
		if (unlikely((first->flags & SIGQUEUE_PREALLOC) && (info->si_code == SI_TIMER)))
			*timer_sigq = first;
		else
			__sigqueue_free(first);
	} else {
		/*
		 * Ok, it wasn't in the queue.  This must be
		 * a fast-pathed signal or we must have been
		 * out of queue space.  So zero out the info.
		 */
		clear_siginfo(info);
		info->si_signo = sig;
		info->si_errno = 0;
		info->si_code = SI_USER;
		info->si_pid = 0;
		info->si_uid = 0;
	}
}

/*
 * Pick the next signal from @pending that is not in @mask and collect
 * its siginfo.  Returns the signal number, or 0 if none is available.
 */
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
			    kernel_siginfo_t *info, struct sigqueue **timer_sigq)
{
	int sig = next_signal(pending, mask);

	if (sig)
		collect_signal(sig, pending, info, timer_sigq);
	return sig;
}

/*
 * Try to dequeue a signal. If a deliverable signal is found fill in the
 * caller provided siginfo and return the signal number. Otherwise return
 * 0.
 *
 * The private queue of current is tried first, then the shared one.
 * *@type is set to PIDTYPE_PID before the private attempt and to
 * PIDTYPE_TGID before the shared attempt.  Must be called with
 * current's siglock held (asserted via lockdep).
 */
int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type)
{
	struct task_struct *tsk = current;
	struct sigqueue *timer_sigq;
	int signr;

	lockdep_assert_held(&tsk->sighand->siglock);

again:
	*type = PIDTYPE_PID;
	timer_sigq = NULL;
	signr = __dequeue_signal(&tsk->pending, mask, info, &timer_sigq);
	if (!signr) {
		*type = PIDTYPE_TGID;
		signr = __dequeue_signal(&tsk->signal->shared_pending,
					 mask, info, &timer_sigq);

		if (unlikely(signr == SIGALRM))
			posixtimer_rearm_itimer(tsk);
	}

	recalc_sigpending();
	if (!signr)
		return 0;

	if (unlikely(sig_kernel_stop(signr))) {
		/*
		 * Set a marker that we have dequeued a stop signal.  Our
		 * caller might release the siglock and then the pending
		 * stop signal it is about to process is no longer in the
		 * pending bitmasks, but must still be cleared by a SIGCONT
		 * (and overruled by a SIGKILL).  So those cases clear this
		 * shared flag after we've set it.  Note that this flag may
		 * remain set after the signal we return is ignored or
		 * handled.  That doesn't matter because its only purpose
		 * is to alert stop-signal processing code when another
		 * processor has come along and cleared the flag.
		 */
		current->jobctl |= JOBCTL_STOP_DEQUEUED;
	}

	/*
	 * A collected timer entry is passed to posixtimer_deliver_signal();
	 * when that returns false the dequeue is retried from the start.
	 */
	if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(timer_sigq)) {
		if (!posixtimer_deliver_signal(info, timer_sigq))
			goto again;
	}

	return signr;
}
EXPORT_SYMBOL_GPL(dequeue_signal);

/*
 * Dequeue the first synchronous signal queued on current's private
 * pending list.  Only unblocked SYNCHRONOUS_MASK signals whose si_code
 * is above SI_USER qualify.  Returns the signal number with @info
 * filled in, or 0 when no such signal is queued.
 */
static int dequeue_synchronous_signal(kernel_siginfo_t *info)
{
	struct task_struct *tsk = current;
	struct sigpending *pending = &tsk->pending;
	struct sigqueue *q, *sync = NULL;

	/*
	 * Might a synchronous signal be in the queue?
	 */
	if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK))
		return 0;

	/*
	 * Return the first synchronous signal in the queue.
	 */
	list_for_each_entry(q, &pending->list, list) {
		/* Synchronous signals have a positive si_code */
		if ((q->info.si_code > SI_USER) &&
		    (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) {
			sync = q;
			goto next;
		}
	}
	return 0;
next:
	/*
	 * Check if there is another siginfo for the same signal.
	 * The scan resumes after the entry found above.
	 */
	list_for_each_entry_continue(q, &pending->list, list) {
		if (q->info.si_signo == sync->info.si_signo)
			goto still_pending;
	}

	/* It was the only entry for this signal: clear the pending bit. */
	sigdelset(&pending->signal, sync->info.si_signo);
	recalc_sigpending();
still_pending:
	list_del_init(&sync->list);
	copy_siginfo(info, &sync->info);
	__sigqueue_free(sync);
	return info->si_signo;
}

/*
 * Tell a process that it has a new active signal.
 *
 * NOTE! we rely on the previous spin_lock to
 * lock interrupts for us! We can only be called with
 * "siglock" held, and the local interrupt must
 * have been disabled when that got acquired!
 *
 * No need to set need_resched since signal event passing
 * goes through ->blocked
 */
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
	lockdep_assert_held(&t->sighand->siglock);

	set_tsk_thread_flag(t, TIF_SIGPENDING);

	/*
	 * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
	 * case. We don't check t->state here because there is a race with it
	 * executing on another processor and just now entering stopped state.
	 * By using wake_up_state, we ensure the process will wake up and
	 * handle its death signal.
	 */
	if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
		kick_process(t);
}

/* Forward declaration; the definition is not part of this section. */
static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q);

/*
 * Dispose of a queue entry that is being dropped.  Entries that are not
 * preallocated SI_TIMER signals are freed; preallocated timer entries go
 * to posixtimer_sig_ignore() instead.
 */
static void sigqueue_free_ignored(struct task_struct *tsk, struct sigqueue *q)
{
	if (likely(!(q->flags & SIGQUEUE_PREALLOC) || q->info.si_code != SI_TIMER))
		__sigqueue_free(q);
	else
		posixtimer_sig_ignore(tsk, q);
}

/* Remove signals in mask from the pending set and queue. */
static void flush_sigqueue_mask(struct task_struct *p, sigset_t *mask, struct sigpending *s)
{
	struct sigqueue *q, *n;
	sigset_t m;

	lockdep_assert_held(&p->sighand->siglock);

	/* Nothing to do when none of the masked signals is pending. */
	sigandsets(&m, mask, &s->signal);
	if (sigisemptyset(&m))
		return;

	sigandnsets(&s->signal, &s->signal, mask);
	list_for_each_entry_safe(q, n, &s->list, list) {
		if (sigismember(mask, q->info.si_signo)) {
			list_del_init(&q->list);
			sigqueue_free_ignored(p, q);
		}
	}
}

/*
 * True when @info does not compare above SEND_SIG_PRIV.
 * NOTE(review): presumably this identifies the SEND_SIG_* marker values
 * passed in place of a real siginfo pointer - confirm against their
 * definitions.
 */
static inline int is_si_special(const struct kernel_siginfo *info)
{
	return info <= SEND_SIG_PRIV;
}

/*
 * True when the signal counts as user-originated: either @info is
 * SEND_SIG_NOINFO, or it is a non-special siginfo for which
 * SI_FROMUSER() holds.
 */
static inline bool si_fromuser(const struct kernel_siginfo *info)
{
	return info == SEND_SIG_NOINFO ||
		(!is_si_special(info) && SI_FROMUSER(info));
}

/*
 * called with RCU read lock from check_kill_permission()
 *
 * The sender qualifies when its euid or uid equals the target's suid or
 * uid, or when it has CAP_KILL in the target's user namespace.
 */
static bool kill_ok_by_cred(struct task_struct *t)
{
	const struct cred *cred = current_cred();
	const struct cred *tcred = __task_cred(t);

	return uid_eq(cred->euid, tcred->suid) ||
	       uid_eq(cred->euid, tcred->uid) ||
	       uid_eq(cred->uid, tcred->suid) ||
	       uid_eq(cred->uid, tcred->uid) ||
	       ns_capable(tcred->user_ns, CAP_KILL);
}

/*
 * Bad permissions for sending the signal
 * - the caller must hold the RCU read lock
 *
 * Returns -EINVAL for an invalid signal number, 0 without further checks
 * when the signal is not user-originated, the audit error if auditing
 * fails, -EPERM when the credential check fails for a task outside the
 * sender's thread group, and otherwise the result of
 * security_task_kill().
 */
static int check_kill_permission(int sig, struct kernel_siginfo *info,
				 struct task_struct *t)
{
	struct pid *sid;
	int error;

	if (!valid_signal(sig))
		return -EINVAL;

	if (!si_fromuser(info))
		return 0;

	error = audit_signal_info(sig, t); /* Let audit system see the signal */
	if (error)
		return error;

	if (!same_thread_group(current, t) &&
	    !kill_ok_by_cred(t)) {
		switch (sig) {
		case SIGCONT:
			sid = task_session(t);
			/*
			 * We don't return the error if sid == NULL. The
			 * task was unhashed, the caller must notice this.
			 */
			if (!sid || sid == task_session(current))
				break;
			fallthrough;
		default:
			return -EPERM;
		}
	}

	return security_task_kill(t, info, sig, NULL);
}

/**
 * ptrace_trap_notify - schedule trap to notify ptracer
 * @t: tracee wanting to notify tracer
 *
 * This function schedules sticky ptrace trap which is cleared on the next
 * TRAP_STOP to notify ptracer of an event.  @t must have been seized by
 * ptracer.
 *
 * If @t is running, STOP trap will be taken.  If trapped for STOP and
 * ptracer is listening for events, tracee is woken up so that it can
 * re-trap for the new event.  If trapped otherwise, STOP trap will be
 * eventually taken without returning to userland after the existing traps
 * are finished by PTRACE_CONT.
 *
 * CONTEXT:
 * Must be called with @t->sighand->siglock held.
 */
static void ptrace_trap_notify(struct task_struct *t)
{
	WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
	lockdep_assert_held(&t->sighand->siglock);

	task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
	ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
}

/*
 * Handle magic process-wide effects of stop/continue signals. Unlike
 * the signal actions, these happen immediately at signal-generation
 * time regardless of blocking, ignoring, or handling. This does the
 * actual continuing for SIGCONT, but not the actual stopping for stop
 * signals. The process stop is done as a signal action for SIG_DFL.
 *
 * Returns true if the signal should be actually delivered, otherwise
 * it should be dropped.
*/
static bool prepare_signal(int sig, struct task_struct *p, bool force)
{
	struct signal_struct *signal = p->signal;
	struct task_struct *t;
	sigset_t flush;

	if (signal->flags & SIGNAL_GROUP_EXIT) {
		/* While ->core_state is set only SIGKILL is let through. */
		if (signal->core_state)
			return sig == SIGKILL;
		/*
		 * The process is in the middle of dying, drop the signal.
		 */
		return false;
	} else if (sig_kernel_stop(sig)) {
		/*
		 * This is a stop signal. Remove SIGCONT from all queues.
		 */
		siginitset(&flush, sigmask(SIGCONT));
		flush_sigqueue_mask(p, &flush, &signal->shared_pending);
		for_each_thread(p, t)
			flush_sigqueue_mask(p, &flush, &t->pending);
	} else if (sig == SIGCONT) {
		unsigned int why;
		/*
		 * Remove all stop signals from all queues, wake all threads.
		 */
		siginitset(&flush, SIG_KERNEL_STOP_MASK);
		flush_sigqueue_mask(p, &flush, &signal->shared_pending);
		for_each_thread(p, t) {
			flush_sigqueue_mask(p, &flush, &t->pending);
			task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
			/*
			 * Threads that are not PT_SEIZED are woken from
			 * __TASK_STOPPED directly; seized ones get a
			 * ptrace trap notification instead.
			 */
			if (likely(!(t->ptrace & PT_SEIZED))) {
				t->jobctl &= ~JOBCTL_STOPPED;
				wake_up_state(t, __TASK_STOPPED);
			} else
				ptrace_trap_notify(t);
		}

		/*
		 * Notify the parent with CLD_CONTINUED if we were stopped.
		 *
		 * If we were in the middle of a group stop, we pretend it
		 * was already finished, and then continued. Since SIGCHLD
		 * doesn't queue we report only CLD_STOPPED, as if the next
		 * CLD_CONTINUED was dropped.
		 */
		why = 0;
		if (signal->flags & SIGNAL_STOP_STOPPED)
			why |= SIGNAL_CLD_CONTINUED;
		else if (signal->group_stop_count)
			why |= SIGNAL_CLD_STOPPED;

		if (why) {
			/*
			 * The first thread which returns from do_signal_stop()
			 * will take ->siglock, notice SIGNAL_CLD_MASK, and
			 * notify its parent. See get_signal().
			 */
			signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED);
			signal->group_stop_count = 0;
			signal->group_exit_code = 0;
		}
	}

	/* Deliver unless sig_ignored() reports the signal as ignored. */
	return !sig_ignored(p, sig, force);
}

/*
 * Test if P wants to take SIG. After we've checked all threads with this,
 * it's equivalent to finding no threads not blocking SIG.
Any threads not
 * blocking SIG were ruled out because they are not running and already
 * have pending signals. Such threads will dequeue from the shared queue
 * as soon as they're available, so putting the signal on the shared queue
 * will be equivalent to sending it to one such thread.
 */
static inline bool wants_signal(int sig, struct task_struct *p)
{
	if (sigismember(&p->blocked, sig))
		return false;

	if (p->flags & PF_EXITING)
		return false;

	/* SIGKILL is wanted even by stopped or traced tasks. */
	if (sig == SIGKILL)
		return true;

	if (task_is_stopped_or_traced(p))
		return false;

	return task_curr(p) || !task_sigpending(p);
}

/*
 * Finish generation of @sig, which has already been added to the pending
 * set selected by @type: pick a thread that wants_signal() and wake it.
 * When the signal will be fatal and is not a coredump signal, the whole
 * thread group is put into group exit here and every thread gets SIGKILL.
 */
static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
{
	struct signal_struct *signal = p->signal;
	struct task_struct *t;

	/*
	 * Now find a thread we can wake up to take the signal off the queue.
	 *
	 * Try the suggested task first (may or may not be the main thread).
	 */
	if (wants_signal(sig, p))
		t = p;
	else if ((type == PIDTYPE_PID) || thread_group_empty(p))
		/*
		 * There is just one thread and it does not need to be woken.
		 * It will dequeue unblocked signals before it runs again.
		 */
		return;
	else {
		/*
		 * Otherwise try to find a suitable thread.
		 */
		t = signal->curr_target;
		while (!wants_signal(sig, t)) {
			t = next_thread(t);
			if (t == signal->curr_target)
				/*
				 * No thread needs to be woken.
				 * Any eligible threads will see
				 * the signal in the queue soon.
				 */
				return;
		}
		/* Remember where the search stopped for the next signal. */
		signal->curr_target = t;
	}

	/*
	 * Found a killable thread. If the signal will be fatal,
	 * then start taking the whole group down immediately.
	 */
	if (sig_fatal(p, sig) &&
	    !sigismember(&t->real_blocked, sig) &&
	    (sig == SIGKILL || !p->ptrace)) {
		/*
		 * This signal will be fatal to the whole group.
		 */
		if (!sig_kernel_coredump(sig)) {
			/*
			 * Start a group exit and wake everybody up.
			 * This way we don't have other threads
			 * running and doing things after a slower
			 * thread has the fatal signal pending.
			 */
			signal->flags = SIGNAL_GROUP_EXIT;
			signal->group_exit_code = sig;
			signal->group_stop_count = 0;
			__for_each_thread(signal, t) {
				task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
				sigaddset(&t->pending.signal, SIGKILL);
				signal_wake_up(t, 1);
			}
			return;
		}
	}

	/*
	 * The signal is already in the shared-pending queue.
	 * Tell the chosen thread to wake up and dequeue it.
	 */
	signal_wake_up(t, sig == SIGKILL);
	return;
}

/*
 * True when @sig is below SIGRTMIN and already set in @signals->signal,
 * i.e. a second instance would not be queued.
 */
static inline bool legacy_queue(struct sigpending *signals, int sig)
{
	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}

/*
 * Generate @sig for @t with siglock held.
 *
 * @type selects the queue: PIDTYPE_PID uses @t's private pending queue,
 * anything else the shared one. @force is passed on to prepare_signal().
 *
 * Returns 0, including when the signal was dropped as ignored or was
 * already pending, or -EAGAIN when no sigqueue could be allocated for a
 * realtime signal carrying a real siginfo whose si_code is not SI_USER.
 * The outcome is reported through trace_signal_generate() in all cases.
 */
static int __send_signal_locked(int sig, struct kernel_siginfo *info,
				struct task_struct *t, enum pid_type type, bool force)
{
	struct sigpending *pending;
	struct sigqueue *q;
	int override_rlimit;
	int ret = 0, result;

	lockdep_assert_held(&t->sighand->siglock);

	result = TRACE_SIGNAL_IGNORED;
	if (!prepare_signal(sig, t, force))
		goto ret;

	pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
	/*
	 * Short-circuit ignored signals and support queuing
	 * exactly one non-rt signal, so that we can get more
	 * detailed information about the cause of the signal.
	 */
	result = TRACE_SIGNAL_ALREADY_PENDING;
	if (legacy_queue(pending, sig))
		goto ret;

	result = TRACE_SIGNAL_DELIVERED;
	/*
	 * Skip useless siginfo allocation for SIGKILL and kernel threads.
	 */
	if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
		goto out_set;

	/*
	 * Real-time signals must be queued if sent by sigqueue, or
	 * some other real-time mechanism. It is implementation
	 * defined whether kill() does so. We attempt to do so, on
	 * the principle of least surprise, but since kill is not
	 * allowed to fail with EAGAIN when low on memory we just
	 * make sure at least one signal gets delivered and don't
	 * pass on the info struct.
	 */
	if (sig < SIGRTMIN)
		override_rlimit = (is_si_special(info) || info->si_code >= 0);
	else
		override_rlimit = 0;

	q = sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);

	if (q) {
		list_add_tail(&q->list, &pending->list);
		/* Synthesize the siginfo for the two special marker values. */
		switch ((unsigned long) info) {
		case (unsigned long) SEND_SIG_NOINFO:
			clear_siginfo(&q->info);
			q->info.si_signo = sig;
			q->info.si_errno = 0;
			q->info.si_code = SI_USER;
			q->info.si_pid = task_tgid_nr_ns(current,
							task_active_pid_ns(t));
			rcu_read_lock();
			q->info.si_uid =
				from_kuid_munged(task_cred_xxx(t, user_ns),
						 current_uid());
			rcu_read_unlock();
			break;
		case (unsigned long) SEND_SIG_PRIV:
			clear_siginfo(&q->info);
			q->info.si_signo = sig;
			q->info.si_errno = 0;
			q->info.si_code = SI_KERNEL;
			q->info.si_pid = 0;
			q->info.si_uid = 0;
			break;
		default:
			copy_siginfo(&q->info, info);
			break;
		}
	} else if (!is_si_special(info) &&
		   sig >= SIGRTMIN && info->si_code != SI_USER) {
		/*
		 * Queue overflow, abort. We may abort if the
		 * signal was rt and sent by user using something
		 * other than kill().
		 */
		result = TRACE_SIGNAL_OVERFLOW_FAIL;
		ret = -EAGAIN;
		goto ret;
	} else {
		/*
		 * This is a silent loss of information. We still
		 * send the signal, but the *info bits are lost.
		 */
		result = TRACE_SIGNAL_LOSE_INFO;
	}

out_set:
	signalfd_notify(t, sig);
	sigaddset(&pending->signal, sig);

	/* Let multiprocess signals appear after on-going forks */
	if (type > PIDTYPE_TGID) {
		struct multiprocess_signals *delayed;
		hlist_for_each_entry(delayed, &t->signal->multiprocess, node) {
			sigset_t *signal = &delayed->signal;
			/* Can't queue both a stop and a continue signal */
			if (sig == SIGCONT)
				sigdelsetmask(signal, SIG_KERNEL_STOP_MASK);
			else if (sig_kernel_stop(sig))
				sigdelset(signal, SIGCONT);
			sigaddset(signal, sig);
		}
	}

	complete_signal(sig, t, type);
ret:
	trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result);
	return ret;
}

/*
 * True when the siginfo layout of @info is SIL_KILL, SIL_CHLD or SIL_RT;
 * false for every other layout listed below.
 */
static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
{
	bool ret = false;
	switch (siginfo_layout(info->si_signo, info->si_code)) {
	case SIL_KILL:
	case SIL_CHLD:
	case SIL_RT:
		ret = true;
		break;
	case SIL_TIMER:
	case SIL_POLL:
	case SIL_FAULT:
	case SIL_FAULT_TRAPNO:
	case SIL_FAULT_MCEERR:
	case SIL_FAULT_BNDERR:
	case SIL_FAULT_PKUERR:
	case SIL_FAULT_PERF_EVENT:
	case SIL_SYS:
		ret = false;
		break;
	}
	return ret;
}

/*
 * Work out the @force argument for __send_signal_locked() and, for
 * siginfo layouts that carry si_pid/si_uid, translate those ids for the
 * receiver before sending. Note that @info may be modified in place.
 */
int send_signal_locked(int sig, struct kernel_siginfo *info,
		       struct task_struct *t, enum pid_type type)
{
	/* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
	bool force = false;

	if (info == SEND_SIG_NOINFO) {
		/* Force if sent from an ancestor pid namespace */
		force = !task_pid_nr_ns(current, task_active_pid_ns(t));
	} else if (info == SEND_SIG_PRIV) {
		/* Don't ignore kernel generated signals */
		force = true;
	} else if (has_si_pid_and_uid(info)) {
		/* SIGKILL and SIGSTOP is special or has ids */
		struct user_namespace *t_user_ns;

		rcu_read_lock();
		t_user_ns = task_cred_xxx(t, user_ns);
		if (current_user_ns() != t_user_ns) {
			kuid_t uid = make_kuid(current_user_ns(), info->si_uid);
			info->si_uid = from_kuid_munged(t_user_ns, uid);
		}
		rcu_read_unlock();

		/* A kernel generated signal? */
		force = (info->si_code == SI_KERNEL);

		/* From an ancestor pid namespace? */
		if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
			info->si_pid = 0;
			force = true;
		}
	}
	return __send_signal_locked(sig, info, t, type, force);
}

/*
 * Log that current received fatal signal @signr, naming the executable
 * when get_task_exe_file() provides one, then dump the registers. On
 * i386 (non-UML) builds up to 16 bytes at regs->ip are printed as well.
 */
static void print_fatal_signal(int signr)
{
	struct pt_regs *regs = task_pt_regs(current);
	struct file *exe_file;

	exe_file = get_task_exe_file(current);
	if (exe_file) {
		pr_info("%pD: %s: potentially unexpected fatal signal %d.\n",
			exe_file, current->comm, signr);
		fput(exe_file);
	} else {
		pr_info("%s: potentially unexpected fatal signal %d.\n",
			current->comm, signr);
	}

#if defined(__i386__) && !defined(__arch_um__)
	pr_info("code at %08lx: ", regs->ip);
	{
		int i;
		for (i = 0; i < 16; i++) {
			unsigned char insn;

			/* Stop at the first byte that cannot be read. */
			if (get_user(insn, (unsigned char *)(regs->ip + i)))
				break;
			pr_cont("%02x ", insn);
		}
	}
	pr_cont("\n");
#endif
	preempt_disable();
	show_regs(regs);
	preempt_enable();
}

/* Parse the "print-fatal-signals=" option into print_fatal_signals. */
static int __init setup_print_fatal_signals(char *str)
{
	get_option (&str, &print_fatal_signals);

	return 1;
}

__setup("print-fatal-signals=", setup_print_fatal_signals);

/*
 * Take @p's sighand lock and send @sig via send_signal_locked().
 * Returns -ESRCH when lock_task_sighand() fails.
 */
int
do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p,
			enum pid_type type)
{
	unsigned long flags;
	int ret = -ESRCH;

	if (lock_task_sighand(p, &flags)) {
		ret = send_signal_locked(sig, info, p, type);
		unlock_task_sighand(p, &flags);
	}

	return ret;
}

/* How force_sig_info_to_task() treats the currently installed handler. */
enum sig_handler {
	HANDLER_CURRENT, /* If reachable use the current handler */
	HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */
	HANDLER_EXIT,	 /* Only visible as the process exit code */
};

/*
 * Force a signal that the process can't ignore: if necessary
 * we unblock the signal and change any SIG_IGN to SIG_DFL.
 *
 * Note: If we unblock the signal, we always reset it to SIG_DFL,
 * since we do not want to have a signal handler that was blocked
 * be invoked when user space had explicitly blocked it.
 *
 * We don't want to have recursive SIGSEGV's etc, for example,
 * that is why we also clear SIGNAL_UNKILLABLE.
*/
static int
force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
	enum sig_handler handler)
{
	unsigned long int flags;
	int ret, blocked, ignored;
	struct k_sigaction *action;
	int sig = info->si_signo;

	spin_lock_irqsave(&t->sighand->siglock, flags);
	action = &t->sighand->action[sig-1];
	ignored = action->sa.sa_handler == SIG_IGN;
	blocked = sigismember(&t->blocked, sig);
	/*
	 * Reset the disposition to SIG_DFL when the signal could not be
	 * delivered as is (blocked or ignored) or when the caller asked
	 * for anything other than the current handler.
	 */
	if (blocked || ignored || (handler != HANDLER_CURRENT)) {
		action->sa.sa_handler = SIG_DFL;
		if (handler == HANDLER_EXIT)
			action->sa.sa_flags |= SA_IMMUTABLE;
		if (blocked)
			sigdelset(&t->blocked, sig);
	}
	/*
	 * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
	 * debugging to leave init killable. But HANDLER_EXIT is always fatal.
	 */
	if (action->sa.sa_handler == SIG_DFL &&
	    (!t->ptrace || (handler == HANDLER_EXIT)))
		t->signal->flags &= ~SIGNAL_UNKILLABLE;
	ret = send_signal_locked(sig, info, t, PIDTYPE_PID);
	/* This can happen if the signal was already pending and blocked */
	if (!task_sigpending(t))
		signal_wake_up(t, 0);
	spin_unlock_irqrestore(&t->sighand->siglock, flags);

	return ret;
}

/* Force @info on current, keeping the current handler where possible. */
int force_sig_info(struct kernel_siginfo *info)
{
	return force_sig_info_to_task(info, current, HANDLER_CURRENT);
}

/*
 * Nuke all other threads in the group.
*/
int zap_other_threads(struct task_struct *p)
{
	struct task_struct *t;
	int count = 0;

	p->signal->group_stop_count = 0;

	for_other_threads(p, t) {
		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
		/* Counted before the exit_state check: dead threads count too. */
		count++;

		/* Don't bother with already dead threads */
		if (t->exit_state)
			continue;
		sigaddset(&t->pending.signal, SIGKILL);
		signal_wake_up(t, 1);
	}

	return count;
}

/*
 * Lock @tsk's sighand with interrupts disabled, saving the irq state in
 * *@flags. Returns the locked sighand, or NULL (nothing locked) when
 * @tsk->sighand is NULL.
 */
struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
					 unsigned long *flags)
{
	struct sighand_struct *sighand;

	rcu_read_lock();
	for (;;) {
		sighand = rcu_dereference(tsk->sighand);
		if (unlikely(sighand == NULL))
			break;

		/*
		 * This sighand can be already freed and even reused, but
		 * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
		 * initializes ->siglock: this slab can't go away, it has
		 * the same object type, ->siglock can't be reinitialized.
		 *
		 * We need to ensure that tsk->sighand is still the same
		 * after we take the lock, we can race with de_thread() or
		 * __exit_signal(). In the latter case the next iteration
		 * must see ->sighand == NULL.
		 */
		spin_lock_irqsave(&sighand->siglock, *flags);
		if (likely(sighand == rcu_access_pointer(tsk->sighand)))
			break;
		/* The sighand changed under us: drop the lock and retry. */
		spin_unlock_irqrestore(&sighand->siglock, *flags);
	}
	rcu_read_unlock();

	return sighand;
}

#ifdef CONFIG_LOCKDEP
/*
 * Assert that @task's sighand->siglock is held; warns once when the
 * task has no sighand at all.
 */
void lockdep_assert_task_sighand_held(struct task_struct *task)
{
	struct sighand_struct *sighand;

	rcu_read_lock();
	sighand = rcu_dereference(task->sighand);
	if (sighand)
		lockdep_assert_held(&sighand->siglock);
	else
		WARN_ON_ONCE(1);
	rcu_read_unlock();
}
#endif

/*
 * send signal info to all the members of a thread group or to the
 * individual thread if type == PIDTYPE_PID.
*/
int group_send_sig_info(int sig, struct kernel_siginfo *info,
			struct task_struct *p, enum pid_type type)
{
	int ret;

	rcu_read_lock();
	ret = check_kill_permission(sig, info, p);
	rcu_read_unlock();

	/* With sig == 0 only the permission check is performed. */
	if (!ret && sig)
		ret = do_send_sig_info(sig, info, p, type);

	return ret;
}

/*
 * __kill_pgrp_info() sends a signal to a process group: this is what the tty
 * control characters do (^C, ^Z etc)
 * - the caller must hold at least a readlock on tasklist_lock
 */
int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
	struct task_struct *p = NULL;
	int ret = -ESRCH;

	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
		int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
		/*
		 * If group_send_sig_info() succeeds at least once ret
		 * becomes 0 and after that the code below has no effect.
		 * Otherwise we return the last err or -ESRCH if this
		 * process group is empty.
		 */
		if (ret)
			ret = err;
	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

	return ret;
}

/*
 * Send @sig to the task that @pid currently resolves to, using @type for
 * the delivery. Returns -ESRCH when @pid resolves to no task. A send that
 * itself fails with -ESRCH is retried with a fresh lookup.
 */
static int kill_pid_info_type(int sig, struct kernel_siginfo *info,
				struct pid *pid, enum pid_type type)
{
	int error = -ESRCH;
	struct task_struct *p;

	for (;;) {
		rcu_read_lock();
		p = pid_task(pid, PIDTYPE_PID);
		if (p)
			error = group_send_sig_info(sig, info, p, type);
		rcu_read_unlock();
		if (likely(!p || error != -ESRCH))
			return error;
		/*
		 * The task was unhashed in between, try again. If it
		 * is dead, pid_task() will return NULL, if we race with
		 * de_thread() it will find the new leader.
		 */
	}
}

/* Send @sig to the thread group identified by @pid (PIDTYPE_TGID). */
int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
{
	return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID);
}

/* Like kill_pid_info(), but looks the target up by number via find_vpid(). */
static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
	int error;
	rcu_read_lock();
	error = kill_pid_info(sig, info, find_vpid(pid));
	rcu_read_unlock();
	return error;
}

/*
 * True when @cred's euid or uid equals @target's suid or uid. Unlike
 * kill_ok_by_cred() this checks an explicitly supplied credential and
 * has no capability fallback.
 */
static inline bool kill_as_cred_perm(const struct cred *cred,
				     struct task_struct *target)
{
	const struct cred *pcred = __task_cred(target);

	return uid_eq(cred->euid, pcred->suid) ||
	       uid_eq(cred->euid, pcred->uid) ||
	       uid_eq(cred->uid, pcred->suid) ||
	       uid_eq(cred->uid, pcred->uid);
}

/*
 * The usb asyncio usage of siginfo is wrong. The glibc support
 * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT.
 * AKA after the generic fields:
 *	kernel_pid_t	si_pid;
 *	kernel_uid32_t	si_uid;
 *	sigval_t	si_value;
 *
 * Unfortunately when usb generates SI_ASYNCIO it assumes the layout
 * after the generic fields is:
 *	void __user	*si_addr;
 *
 * This is a practical problem when there is a 64bit big endian kernel
 * and a 32bit userspace. As the 32bit address will be encoded in the low
 * 32bits of the pointer. Those low 32bits will be stored at higher
 * address than appear in a 32 bit pointer. So userspace will not
 * see the address it was expecting for its completions.
 *
 * There is nothing in the encoding that can allow
 * copy_siginfo_to_user32 to detect this confusion of formats, so
 * handle this by requiring the caller of kill_pid_usb_asyncio to
 * notice when this situation takes place and to store the 32bit
 * pointer in sival_int, instead of sival_addr of the sigval_t addr
 * parameter.
*/
int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
			 struct pid *pid, const struct cred *cred)
{
	struct kernel_siginfo info;
	struct task_struct *p;
	unsigned long flags;
	int ret = -EINVAL;

	if (!valid_signal(sig))
		return ret;

	clear_siginfo(&info);
	info.si_signo = sig;
	info.si_errno = errno;
	info.si_code = SI_ASYNCIO;
	/* Deliberately store @addr where si_pid lives; see the comment above. */
	*((sigval_t *)&info.si_pid) = addr;

	rcu_read_lock();
	p = pid_task(pid, PIDTYPE_PID);
	if (!p) {
		ret = -ESRCH;
		goto out_unlock;
	}
	/* Permission is checked against the supplied @cred, not current's. */
	if (!kill_as_cred_perm(cred, p)) {
		ret = -EPERM;
		goto out_unlock;
	}
	ret = security_task_kill(p, &info, sig, cred);
	if (ret)
		goto out_unlock;

	/* sig == 0 stops here: only the checks above are performed. */
	if (sig) {
		if (lock_task_sighand(p, &flags)) {
			ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false);
			unlock_task_sighand(p, &flags);
		} else
			ret = -ESRCH;
	}
out_unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio);

/*
 * kill_something_info() interprets pid in interesting ways just like kill(2).
 *
 * POSIX specifies that kill(-1,sig) is unspecified, but what we have
 * is probably wrong. Should make it like BSD or SYSV.
 *
 * As implemented below:
 *   pid > 0   signal the process with that pid
 *   pid == 0  signal current's process group
 *   pid == -1 signal every process whose task_pid_vnr() is above 1 and
 *             which is not in current's thread group
 *   pid < -1  signal the process group -pid
 */

static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
	int ret;

	if (pid > 0)
		return kill_proc_info(sig, info, pid);

	/* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */
	if (pid == INT_MIN)
		return -ESRCH;

	read_lock(&tasklist_lock);
	if (pid != -1) {
		ret = __kill_pgrp_info(sig, info,
				pid ? find_vpid(-pid) : task_pgrp(current));
	} else {
		int retval = 0, count = 0;
		struct task_struct * p;

		for_each_process(p) {
			if (task_pid_vnr(p) > 1 &&
					!same_thread_group(p, current)) {
				int err = group_send_sig_info(sig, info, p,
							      PIDTYPE_MAX);
				++count;
				/* -EPERM from one target never becomes the result. */
				if (err != -EPERM)
					retval = err;
			}
		}
		/* -ESRCH only when no process was a candidate at all. */
		ret = count ? retval : -ESRCH;
	}
	read_unlock(&tasklist_lock);

	return ret;
}

/*
 * These are for backward compatibility with the rest of the kernel source.
*/ int send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) { /* * Make sure legacy kernel users don't send in bad values * (normal paths check this in check_kill_permission). */ if (!valid_signal(sig)) return -EINVAL; return do_send_sig_info(sig, info, p, PIDTYPE_PID); } EXPORT_SYMBOL(send_sig_info); #define __si_special(priv) \ ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) int send_sig(int sig, struct task_struct *p, int priv) { return send_sig_info(sig, __si_special(priv), p); } EXPORT_SYMBOL(send_sig); void force_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info(&info); } EXPORT_SYMBOL(force_sig); void force_fatal_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info_to_task(&info, current, HANDLER_SIG_DFL); } void force_exit_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info_to_task(&info, current, HANDLER_EXIT); } /* * When things go south during signal handling, we * will force a SIGSEGV. And if the signal that caused * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. 
*/ void force_sigsegv(int sig) { if (sig == SIGSEGV) force_fatal_sig(SIGSEGV); else force_sig(SIGSEGV); } int force_sig_fault_to_task(int sig, int code, void __user *addr, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; return force_sig_info_to_task(&info, t, HANDLER_CURRENT); } int force_sig_fault(int sig, int code, void __user *addr) { return force_sig_fault_to_task(sig, code, addr, current); } int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; return send_sig_info(info.si_signo, &info, t); } int force_sig_mceerr(int code, void __user *addr, short lsb) { struct kernel_siginfo info; WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); clear_siginfo(&info); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; return force_sig_info(&info); } int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) { struct kernel_siginfo info; WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); clear_siginfo(&info); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; return send_sig_info(info.si_signo, &info, t); } EXPORT_SYMBOL(send_sig_mceerr); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_BNDERR; info.si_addr = addr; info.si_lower = lower; info.si_upper = upper; return force_sig_info(&info); } #ifdef SEGV_PKUERR int force_sig_pkuerr(void __user *addr, u32 pkey) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_PKUERR; info.si_addr = addr; 
info.si_pkey = pkey; return force_sig_info(&info); } #endif int send_sig_perf(void __user *addr, u32 type, u64 sig_data) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_PERF; info.si_addr = addr; info.si_perf_data = sig_data; info.si_perf_type = type; /* * Signals generated by perf events should not terminate the whole * process if SIGTRAP is blocked, however, delivering the signal * asynchronously is better than not delivering at all. But tell user * space if the signal was asynchronous, so it can clearly be * distinguished from normal synchronous ones. */ info.si_perf_flags = sigismember(&current->blocked, info.si_signo) ? TRAP_PERF_FLAG_ASYNC : 0; return send_sig_info(info.si_signo, &info, current); } /** * force_sig_seccomp - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland * @reason: filter-supplied reason code to send to userland (via si_errno) * @force_coredump: true to trigger a coredump * * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. */ int force_sig_seccomp(int syscall, int reason, bool force_coredump) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSYS; info.si_code = SYS_SECCOMP; info.si_call_addr = (void __user *)KSTK_EIP(current); info.si_errno = reason; info.si_arch = syscall_get_arch(current); info.si_syscall = syscall; return force_sig_info_to_task(&info, current, force_coredump ? HANDLER_EXIT : HANDLER_CURRENT); } /* For the crazy architectures that include trap information in * the errno field, instead of an actual errno value. */ int force_sig_ptrace_errno_trap(int errno, void __user *addr) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = errno; info.si_code = TRAP_HWBKPT; info.si_addr = addr; return force_sig_info(&info); } /* For the rare architectures that include trap information using * si_trapno. 
*/ int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_trapno = trapno; return force_sig_info(&info); } /* For the rare architectures that include trap information using * si_trapno. */ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_trapno = trapno; return send_sig_info(info.si_signo, &info, t); } static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { int ret; read_lock(&tasklist_lock); ret = __kill_pgrp_info(sig, info, pgrp); read_unlock(&tasklist_lock); return ret; } int kill_pgrp(struct pid *pid, int sig, int priv) { return kill_pgrp_info(sig, __si_special(priv), pid); } EXPORT_SYMBOL(kill_pgrp); int kill_pid(struct pid *pid, int sig, int priv) { return kill_pid_info(sig, __si_special(priv), pid); } EXPORT_SYMBOL(kill_pid); #ifdef CONFIG_POSIX_TIMERS /* * These functions handle POSIX timer signals. POSIX timers use * preallocated sigqueue structs for sending signals. 
*/

/*
 * Remove every queued SI_TIMER entry from @pending and free it.
 *
 * The pending bit of a signal is cleared when one of its timer entries
 * was removed, unless a non-timer entry for the same signal is queued
 * as well: those signals are collected in @retain and OR-ed back in at
 * the end. Bits of signals without any timer entry are left untouched.
 */
static void __flush_itimer_signals(struct sigpending *pending)
{
	sigset_t signal, retain;
	struct sigqueue *q, *n;

	signal = pending->signal;
	sigemptyset(&retain);

	list_for_each_entry_safe(q, n, &pending->list, list) {
		int sig = q->info.si_signo;

		if (likely(q->info.si_code != SI_TIMER)) {
			sigaddset(&retain, sig);
		} else {
			sigdelset(&signal, sig);
			list_del_init(&q->list);
			__sigqueue_free(q);
		}
	}

	sigorsets(&pending->signal, &signal, &retain);
}

/*
 * Flush timer signals from both the private and the shared pending queue
 * of current. siglock is taken through guard(), which presumably drops
 * it when the function returns (scope-based cleanup) - confirm against
 * the guard() definition.
 */
void flush_itimer_signals(void)
{
	struct task_struct *tsk = current;

	guard(spinlock_irqsave)(&tsk->sighand->siglock);
	__flush_itimer_signals(&tsk->pending);
	__flush_itimer_signals(&tsk->signal->shared_pending);
}

/*
 * Initialize the preallocated sigqueue @q of a POSIX timer. Returns
 * false when sig_get_ucounts() yields no ucounts for current. Otherwise
 * the siginfo is cleared, @q is set up with SIGQUEUE_PREALLOC and true
 * is returned.
 */
bool posixtimer_init_sigqueue(struct sigqueue *q)
{
	struct ucounts *ucounts = sig_get_ucounts(current, -1, 0);

	if (!ucounts)
		return false;
	clear_siginfo(&q->info);
	__sigqueue_init(q, ucounts, SIGQUEUE_PREALLOC);
	return true;
}

/*
 * Queue the timer sigqueue @q for @t: on the shared pending queue unless
 * @type is PIDTYPE_PID, in which case @t's private queue is used. Marks
 * the signal pending, notifies signalfd and lets complete_signal() pick
 * a thread to wake.
 */
static void posixtimer_queue_sigqueue(struct sigqueue *q, struct task_struct *t, enum pid_type type)
{
	struct sigpending *pending;
	int sig = q->info.si_signo;

	signalfd_notify(t, sig);
	pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
	list_add_tail(&q->list, &pending->list);
	sigaddset(&pending->signal, sig);
	complete_signal(sig, t, type);
}

/*
 * This function is used by POSIX timers to deliver a timer signal.
 * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID
 * set), the signal must be delivered to the specific thread (queues
 * into t->pending).
 *
 * Where type is not PIDTYPE_PID, signals must be delivered to the
 * process. In this case, prefer to deliver to current if it is in
 * the same thread group as the target process and its sighand is
 * stable, which avoids unnecessarily waking up a potentially idle task.
*/
static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr)
{
	struct task_struct *t = pid_task(tmr->it_pid, tmr->it_pid_type);

	/*
	 * Only timers whose type is not PIDTYPE_PID are redirected to
	 * current, and only when current shares the thread group with the
	 * looked-up task and has no exit_state set.
	 */
	if (t && tmr->it_pid_type != PIDTYPE_PID &&
	    same_thread_group(t, current) && !current->exit_state)
		t = current;

	return t;
}

/*
 * Handle the sigqueue embedded in @tmr (tmr->sigq) for the task selected
 * by posixtimer_get_target(). Returns without doing anything when there
 * is no target or its sighand cannot be locked. Otherwise the sigqueue is
 * queued on a pending list, left where it already is, or tracked on the
 * ignored list, and the outcome is reported via trace_signal_generate().
 */
void posixtimer_send_sigqueue(struct k_itimer *tmr)
{
	struct sigqueue *q = &tmr->sigq;
	int sig = q->info.si_signo;
	struct task_struct *t;
	unsigned long flags;
	int result;

	guard(rcu)();

	t = posixtimer_get_target(tmr);
	if (!t)
		return;

	if (!likely(lock_task_sighand(t, &flags)))
		return;

	/*
	 * Update @tmr::sigqueue_seq for posix timer signals with sighand
	 * locked to prevent a race against dequeue_signal().
	 */
	tmr->it_sigqueue_seq = tmr->it_signal_seq;

	/*
	 * Set the signal delivery status under sighand lock, so that the
	 * ignored signal handling can distinguish between a periodic and a
	 * non-periodic timer.
	 */
	tmr->it_sig_periodic = tmr->it_status == POSIX_TIMER_REQUEUE_PENDING;

	if (!prepare_signal(sig, t, false)) {
		result = TRACE_SIGNAL_IGNORED;

		if (!list_empty(&q->list)) {
			/*
			 * The signal was ignored and blocked. The timer
			 * expiry queued it because blocked signals are
			 * queued independent of the ignored state.
			 *
			 * The unblocking set SIGPENDING, but the signal
			 * was not yet dequeued from the pending list.
			 * So prepare_signal() sees unblocked and ignored,
			 * which ends up here. Leave it queued like a
			 * regular signal.
			 *
			 * The same happens when the task group is exiting
			 * and the signal is already queued.
			 * prepare_signal() treats SIGNAL_GROUP_EXIT as
			 * ignored independent of its queued state. This
			 * gets cleaned up in __exit_signal().
			 */
			goto out;
		}

		/* Periodic timers with SIG_IGN are queued on the ignored list */
		if (tmr->it_sig_periodic) {
			/*
			 * Already queued means the timer was rearmed after
			 * the previous expiry got it on the ignore list.
			 * Nothing to do for that case.
			 */
			if (hlist_unhashed(&tmr->ignored_list)) {
				/*
				 * Take a signal reference and queue it on
				 * the ignored list.
				 */
				posixtimer_sigqueue_getref(q);
				posixtimer_sig_ignore(t, q);
			}
		} else if (!hlist_unhashed(&tmr->ignored_list)) {
			/*
			 * Covers the case where a timer was periodic and
			 * then the signal was ignored. Later it was rearmed
			 * as oneshot timer. The previous signal is invalid
			 * now, and this oneshot signal has to be dropped.
			 * Remove it from the ignored list and drop the
			 * reference count as the signal is no longer
			 * queued.
			 */
			hlist_del_init(&tmr->ignored_list);
			posixtimer_putref(tmr);
		}
		goto out;
	}

	if (unlikely(!list_empty(&q->list))) {
		/* This holds a reference count already */
		result = TRACE_SIGNAL_ALREADY_PENDING;
		goto out;
	}

	/*
	 * If the signal is on the ignore list, it got blocked after it was
	 * ignored earlier. But nothing lifted the ignore. Move it back to
	 * the pending list to be consistent with the regular signal
	 * handling. This already holds a reference count.
	 *
	 * If it's not on the ignore list acquire a reference count.
	 */
	if (likely(hlist_unhashed(&tmr->ignored_list)))
		posixtimer_sigqueue_getref(q);
	else
		hlist_del_init(&tmr->ignored_list);

	posixtimer_queue_sigqueue(q, t, tmr->it_pid_type);
	result = TRACE_SIGNAL_DELIVERED;
out:
	/* Every path that took the sighand lock reports its result and unlocks here. */
	trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result);
	unlock_task_sighand(t, &flags);
}

/*
 * Called for a posix timer sigqueue @q whose signal is ignored by @tsk:
 * either park the owning timer on @tsk's ignored list or drop a timer
 * reference.
 */
static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q)
{
	struct k_itimer *tmr = container_of(q, struct k_itimer, sigq);

	/*
	 * If the timer is marked deleted already or the signal originates
	 * from a non-periodic timer, then just drop the reference
	 * count. Otherwise queue it on the ignored list.
	 */
	if (posixtimer_valid(tmr) && tmr->it_sig_periodic)
		hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers);
	else
		posixtimer_putref(tmr);
}

/*
 * Walk @tsk's ignored posix timer list and, for every timer whose signal
 * number equals @sig, remove it from that list and queue its sigqueue
 * again (or drop the reference when no target task is found).
 */
static void posixtimer_sig_unignore(struct task_struct *tsk, int sig)
{
	struct hlist_head *head = &tsk->signal->ignored_posix_timers;
	struct hlist_node *tmp;
	struct k_itimer *tmr;

	if (likely(hlist_empty(head)))
		return;

	/*
	 * Rearming a timer with sighand lock held is not possible due to
	 * lock ordering vs. tmr::it_lock. Just stick the sigqueue back and
	 * let the signal delivery path deal with it whether it needs to be
	 * rearmed or not. This cannot be decided here w/o dropping sighand
	 * lock and creating a loop retry horror show.
	 */
	hlist_for_each_entry_safe(tmr, tmp , head, ignored_list) {
		struct task_struct *target;

		/*
		 * tmr::sigq.info.si_signo is immutable, so accessing it
		 * without holding tmr::it_lock is safe.
		 */
		if (tmr->sigq.info.si_signo != sig)
			continue;

		hlist_del_init(&tmr->ignored_list);

		/* This should never happen and leaks a reference count */
		if (WARN_ON_ONCE(!list_empty(&tmr->sigq.list)))
			continue;

		/*
		 * Get the target for the signal. If target is a thread and
		 * has exited by now, drop the reference count.
		 */
		guard(rcu)();
		target = posixtimer_get_target(tmr);
		if (target)
			posixtimer_queue_sigqueue(&tmr->sigq, target, tmr->it_pid_type);
		else
			posixtimer_putref(tmr);
	}
}
#else /* CONFIG_POSIX_TIMERS */
/* No-op stubs used when POSIX timer support is not built in. */
static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { }
static inline void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { }
#endif /* !CONFIG_POSIX_TIMERS */

/*
 * Wake the waiters on the wait_pidfd queue of @task's struct pid with an
 * EPOLLIN | EPOLLRDNORM key. Warns if @task->exit_state is still 0.
 */
void do_notify_pidfd(struct task_struct *task)
{
	struct pid *pid = task_pid(task);

	WARN_ON(task->exit_state == 0);

	__wake_up(&pid->wait_pidfd, TASK_NORMAL, 0,
			poll_to_key(EPOLLIN | EPOLLRDNORM));
}

/*
 * Let a parent know about the death of a child.
 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
*
 * Returns true if our parent ignored us and so we've switched to
 * self-reaping.
 */
bool do_notify_parent(struct task_struct *tsk, int sig)
{
	struct kernel_siginfo info;
	unsigned long flags;
	struct sighand_struct *psig;
	bool autoreap = false;
	u64 utime, stime;

	if (WARN_ON_ONCE(!valid_signal(sig)))
		return false;

	/* do_notify_parent_cldstop should have been called instead.  */
	WARN_ON_ONCE(task_is_stopped_or_traced(tsk));

	WARN_ON_ONCE(!tsk->ptrace && !thread_group_empty(tsk));

	/* ptraced, or group-leader without sub-threads */
	do_notify_pidfd(tsk);

	if (sig != SIGCHLD) {
		/*
		 * This is only possible if parent == real_parent.
		 * Check if it has changed security domain.
		 */
		if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id))
			sig = SIGCHLD;
	}

	clear_siginfo(&info);
	info.si_signo = sig;
	info.si_errno = 0;
	/*
	 * We are under tasklist_lock here so our parent is tied to
	 * us and cannot change.
	 *
	 * task_active_pid_ns will always return the same pid namespace
	 * until a task passes through release_task.
	 *
	 * write_lock() currently calls preempt_disable() which is the
	 * same as rcu_read_lock(), but according to Oleg, this is not
	 * correct to rely on this
	 */
	rcu_read_lock();
	info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
	info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
				       task_uid(tsk));
	rcu_read_unlock();

	task_cputime(tsk, &utime, &stime);
	info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime);
	info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime);

	/*
	 * Decode exit_code: bit 0x80 selects CLD_DUMPED, a non-zero low
	 * 7-bit value selects CLD_KILLED, otherwise the status is taken
	 * from the bits above the low byte.
	 */
	info.si_status = tsk->exit_code & 0x7f;
	if (tsk->exit_code & 0x80)
		info.si_code = CLD_DUMPED;
	else if (tsk->exit_code & 0x7f)
		info.si_code = CLD_KILLED;
	else {
		info.si_code = CLD_EXITED;
		info.si_status = tsk->exit_code >> 8;
	}

	psig = tsk->parent->sighand;
	spin_lock_irqsave(&psig->siglock, flags);
	if (!tsk->ptrace && sig == SIGCHLD &&
	    (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
	     (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
		/*
		 * We are exiting and our parent doesn't care.  POSIX.1
		 * defines special semantics for setting SIGCHLD to SIG_IGN
		 * or setting the SA_NOCLDWAIT flag: we should be reaped
		 * automatically and not left for our parent's wait4 call.
		 * Rather than having the parent do it as a magic kind of
		 * signal handler, we just set this to tell do_exit that we
		 * can be cleaned up without becoming a zombie.  Note that
		 * we still call __wake_up_parent in this case, because a
		 * blocked sys_wait4 might now return -ECHILD.
		 *
		 * Whether we send SIGCHLD or not for SA_NOCLDWAIT
		 * is implementation-defined: we do (if you don't want
		 * it, just use SIG_IGN instead).
		 */
		autoreap = true;
		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
			sig = 0;
	}
	/* A non-ptraced task with signal->autoreap set is reaped without a signal. */
	if (!tsk->ptrace && tsk->signal->autoreap) {
		autoreap = true;
		sig = 0;
	}

	/*
	 * Send with __send_signal as si_pid and si_uid are in the
	 * parent's namespaces.
	 */
	if (sig)
		__send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false);
	__wake_up_parent(tsk, tsk->parent);
	spin_unlock_irqrestore(&psig->siglock, flags);

	return autoreap;
}

/**
 * do_notify_parent_cldstop - notify parent of stopped/continued state change
 * @tsk: task reporting the state change
 * @for_ptracer: the notification is for ptracer
 * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
 *
 * Notify @tsk's parent that the stopped/continued state has changed.  If
 * @for_ptracer is %false, @tsk's group leader notifies to its real parent.
 * If %true, @tsk reports to @tsk->parent which should be the ptracer.
 *
 * CONTEXT:
 * Must be called with tasklist_lock at least read locked.
 */
static void do_notify_parent_cldstop(struct task_struct *tsk,
				     bool for_ptracer, int why)
{
	struct kernel_siginfo info;
	unsigned long flags;
	struct task_struct *parent;
	struct sighand_struct *sighand;
	u64 utime, stime;

	if (for_ptracer) {
		parent = tsk->parent;
	} else {
		tsk = tsk->group_leader;
		parent = tsk->real_parent;
	}

	clear_siginfo(&info);
	info.si_signo = SIGCHLD;
	info.si_errno = 0;
	/*
	 * see comment in do_notify_parent() about the following 4 lines
	 */
	rcu_read_lock();
	info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
	info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
	rcu_read_unlock();

	task_cputime(tsk, &utime, &stime);
	info.si_utime = nsec_to_clock_t(utime);
	info.si_stime = nsec_to_clock_t(stime);

	info.si_code = why;
	switch (why) {
	case CLD_CONTINUED:
		info.si_status = SIGCONT;
		break;
	case CLD_STOPPED:
		info.si_status = tsk->signal->group_exit_code & 0x7f;
		break;
	case CLD_TRAPPED:
		info.si_status = tsk->exit_code & 0x7f;
		break;
	default:
		BUG();
	}

	sighand = parent->sighand;
	spin_lock_irqsave(&sighand->siglock, flags);
	if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
	    !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
		send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID);
	/*
	 * Even if SIGCHLD is not generated, we must wake up wait4 calls.
	 */
	__wake_up_parent(tsk, parent);
	spin_unlock_irqrestore(&sighand->siglock, flags);
}

/*
 * This must be called with current->sighand->siglock held.
 *
 * This should be the path for all ptrace stops.
 * We always set current->last_siginfo while stopped here.
 * That makes it a way to test a stopped process for
 * being ptrace-stopped vs being job-control-stopped.
 *
 * Returns the signal the ptracer requested the code resume
 * with.  If the code did not stop because the tracer is gone,
 * the stop signal remains unchanged unless clear_code.
 */
static int ptrace_stop(int exit_code, int why, unsigned long message,
		       kernel_siginfo_t *info)
	__releases(&current->sighand->siglock)
	__acquires(&current->sighand->siglock)
{
	bool gstop_done = false;

	if (arch_ptrace_stop_needed()) {
		/*
		 * The arch code has something special to do before a
		 * ptrace stop.  This is allowed to block, e.g. for faults
		 * on user stack pages.  We can't keep the siglock while
		 * calling arch_ptrace_stop, so we must release it now.
		 * To preserve proper semantics, we must do this before
		 * any signal bookkeeping like checking group_stop_count.
		 */
		spin_unlock_irq(&current->sighand->siglock);
		arch_ptrace_stop();
		spin_lock_irq(&current->sighand->siglock);
	}

	/*
	 * After this point ptrace_signal_wake_up or signal_wake_up
	 * will clear TASK_TRACED if ptrace_unlink happens or a fatal
	 * signal comes in.  Handle previous ptrace_unlinks and fatal
	 * signals here to prevent ptrace_stop sleeping in schedule.
	 */
	if (!current->ptrace || __fatal_signal_pending(current))
		return exit_code;

	set_special_state(TASK_TRACED);
	current->jobctl |= JOBCTL_TRACED;

	/*
	 * We're committing to trapping.  TRACED should be visible before
	 * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
	 * Also, transition to TRACED and updates to ->jobctl should be
	 * atomic with respect to siglock and should be done after the arch
	 * hook as siglock is released and regrabbed across it.
	 *
	 *     TRACER                               TRACEE
	 *
	 *     ptrace_attach()
	 * [L]   wait_on_bit(JOBCTL_TRAPPING)   [S] set_special_state(TRACED)
	 *     do_wait()
	 *       set_current_state()                smp_wmb();
	 *       ptrace_do_wait()
	 *         wait_task_stopped()
	 *           task_stopped_code()
	 * [L]         task_is_traced()         [S] task_clear_jobctl_trapping();
	 */
	smp_wmb();

	current->ptrace_message = message;
	current->last_siginfo = info;
	current->exit_code = exit_code;

	/*
	 * If @why is CLD_STOPPED, we're trapping to participate in a group
	 * stop.  Do the bookkeeping.  Note that if SIGCONT was delivered
	 * across siglock relocks since INTERRUPT was scheduled, PENDING
	 * could be clear now.  We act as if SIGCONT is received after
	 * TASK_TRACED is entered - ignore it.
	 */
	if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
		gstop_done = task_participate_group_stop(current);

	/* any trap clears pending STOP trap, STOP trap clears NOTIFY */
	task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
	if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
		task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);

	/* entering a trap, clear TRAPPING */
	task_clear_jobctl_trapping(current);

	spin_unlock_irq(&current->sighand->siglock);
	read_lock(&tasklist_lock);
	/*
	 * Notify parents of the stop.
	 *
	 * While ptraced, there are two parents - the ptracer and
	 * the real_parent of the group_leader.  The ptracer should
	 * know about every stop while the real parent is only
	 * interested in the completion of group stop.  The states
	 * for the two don't interact with each other.  Notify
	 * separately unless they're gonna be duplicates.
	 */
	if (current->ptrace)
		do_notify_parent_cldstop(current, true, why);
	if (gstop_done && (!current->ptrace || ptrace_reparented(current)))
		do_notify_parent_cldstop(current, false, why);

	/*
	 * The previous do_notify_parent_cldstop() invocation woke ptracer.
	 * On a PREEMPTION kernel this can result in preemption requirement
	 * which will be fulfilled after read_unlock() and the ptracer will be
	 * put on the CPU.
	 * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
	 * this task to wait in schedule(). If this task gets preempted then it
	 * remains enqueued on the runqueue. The ptracer will observe this and
	 * then sleep for a delay of one HZ tick. In the meantime this task
	 * gets scheduled, enters schedule() and will wait for the ptracer.
	 *
	 * This preemption point is not bad from a correctness point of
	 * view but extends the runtime by one HZ tick time due to the
	 * ptracer's sleep.  The preempt-disable section ensures that there
	 * will be no preemption between unlock and schedule() and so
	 * improving the performance since the ptracer will observe that
	 * the tracee is scheduled out once it gets on the CPU.
	 *
	 * On PREEMPT_RT locking tasklist_lock does not disable preemption.
	 * Therefore the task can be preempted after do_notify_parent_cldstop()
	 * before unlocking tasklist_lock so there is no benefit in doing this.
	 *
	 * In fact disabling preemption is harmful on PREEMPT_RT because
	 * the spinlock_t in cgroup_enter_frozen() must not be acquired
	 * with preemption disabled due to the 'sleeping' spinlock
	 * substitution of RT.
	 */
	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		preempt_disable();
	read_unlock(&tasklist_lock);
	cgroup_enter_frozen();
	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		preempt_enable_no_resched();
	schedule();
	cgroup_leave_frozen(true);

	/*
	 * We are back.  Now reacquire the siglock before touching
	 * last_siginfo, so that we are sure to have synchronized with
	 * any signal-sending on another CPU that wants to examine it.
	 */
	spin_lock_irq(&current->sighand->siglock);
	exit_code = current->exit_code;
	current->last_siginfo = NULL;
	current->ptrace_message = 0;
	current->exit_code = 0;

	/* LISTENING can be set only during STOP traps, clear it */
	current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN);

	/*
	 * Queued signals ignored us while we were stopped for tracing.
	 * So check for any that we should take before resuming user mode.
	 * This sets TIF_SIGPENDING, but never clears it.
	 */
	recalc_sigpending_tsk(current);
	return exit_code;
}

/*
 * Build a siginfo for @signr describing current (si_code = @exit_code,
 * si_pid/si_uid of current) and enter ptrace_stop() with it. The siginfo
 * lives on this function's stack for the duration of the stop.
 */
static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
{
	kernel_siginfo_t info;

	clear_siginfo(&info);
	info.si_signo = signr;
	info.si_code = exit_code;
	info.si_pid = task_pid_vnr(current);
	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());

	/* Let the debugger run.  */
	return ptrace_stop(exit_code, why, message, &info);
}

/*
 * Report a SIGTRAP-based ptrace event for current. The BUG_ON requires
 * the low 7 bits of @exit_code to be SIGTRAP and nothing to be set above
 * bit 15. Runs pending task work first, then performs the stop with
 * current's siglock held. Returns what ptrace_do_notify() returns.
 */
int ptrace_notify(int exit_code, unsigned long message)
{
	int signr;

	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
	if (unlikely(task_work_pending(current)))
		task_work_run();

	spin_lock_irq(&current->sighand->siglock);
	signr = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
	spin_unlock_irq(&current->sighand->siglock);
	return signr;
}

/**
 * do_signal_stop - handle group stop for SIGSTOP and other stop signals
 * @signr: signr causing group stop if initiating
 *
 * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
 * and participate in it.  If already set, participate in the existing
 * group stop.  If participated in a group stop (and thus slept), %true is
 * returned with siglock released.
 *
 * If ptraced, this function doesn't handle stop itself.  Instead,
 * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
 * untouched.  The caller must ensure that INTERRUPT trap handling takes
 * places afterwards.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held, which is released
 * on %true return.
 *
 * RETURNS:
 * %false if group stop is already cancelled or ptrace trap is scheduled.
 * %true if participated in group stop.
*/
static bool do_signal_stop(int signr)
	__releases(&current->sighand->siglock)
{
	struct signal_struct *sig = current->signal;

	if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
		unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
		struct task_struct *t;

		/* signr will be recorded in task->jobctl for retries */
		WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);

		/*
		 * Do not initiate a stop when the stop signal's DEQUEUED
		 * marker is gone, or when a group exit or exec is underway.
		 */
		if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
		    unlikely(sig->flags & SIGNAL_GROUP_EXIT) ||
		    unlikely(sig->group_exec_task))
			return false;
		/*
		 * There is no group stop already in progress.  We must
		 * initiate one now.
		 *
		 * While ptraced, a task may be resumed while group stop is
		 * still in effect and then receive a stop signal and
		 * initiate another group stop.  This deviates from the
		 * usual behavior as two consecutive stop signals can't
		 * cause two group stops when !ptraced.  That is why we
		 * also check !task_is_stopped(t) below.
		 *
		 * The condition can be distinguished by testing whether
		 * SIGNAL_STOP_STOPPED is already set.  Don't generate
		 * group_exit_code in such case.
		 *
		 * This is not necessary for SIGNAL_STOP_CONTINUED because
		 * an intervening stop signal is required to cause two
		 * continued events regardless of ptrace.
		 */
		if (!(sig->flags & SIGNAL_STOP_STOPPED))
			sig->group_exit_code = signr;

		/* Recount: one per thread that newly got the stop marked pending. */
		sig->group_stop_count = 0;
		if (task_set_jobctl_pending(current, signr | gstop))
			sig->group_stop_count++;

		for_other_threads(current, t) {
			/*
			 * Setting state to TASK_STOPPED for a group
			 * stop is always done with the siglock held,
			 * so this check has no races.
			 */
			if (!task_is_stopped(t) &&
			    task_set_jobctl_pending(t, signr | gstop)) {
				sig->group_stop_count++;
				if (likely(!(t->ptrace & PT_SEIZED)))
					signal_wake_up(t, 0);
				else
					ptrace_trap_notify(t);
			}
		}
	}

	if (likely(!current->ptrace)) {
		int notify = 0;

		/*
		 * If there are no other threads in the group, or if there
		 * is a group stop in progress and we are the last to stop,
		 * report to the parent.
		 */
		if (task_participate_group_stop(current))
			notify = CLD_STOPPED;

		current->jobctl |= JOBCTL_STOPPED;
		set_special_state(TASK_STOPPED);
		spin_unlock_irq(&current->sighand->siglock);

		/*
		 * Notify the parent of the group stop completion.  Because
		 * we're not holding either the siglock or tasklist_lock
		 * here, ptracer may attach inbetween; however, this is for
		 * group stop and should always be delivered to the real
		 * parent of the group leader.  The new ptracer will get
		 * its notification when this task transitions into
		 * TASK_TRACED.
		 */
		if (notify) {
			read_lock(&tasklist_lock);
			do_notify_parent_cldstop(current, false, notify);
			read_unlock(&tasklist_lock);
		}

		/* Now we don't run again until woken by SIGCONT or SIGKILL */
		cgroup_enter_frozen();
		schedule();
		return true;
	} else {
		/*
		 * While ptraced, group stop is handled by STOP trap.
		 * Schedule it and let the caller deal with it.
		 */
		task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
		return false;
	}
}

/**
 * do_jobctl_trap - take care of ptrace jobctl traps
 *
 * When PT_SEIZED, it's used for both group stop and explicit
 * SEIZE/INTERRUPT traps.  Both generate PTRACE_EVENT_STOP trap with
 * accompanying siginfo.  If stopped, lower eight bits of exit_code contain
 * the stop signal; otherwise, %SIGTRAP.
 *
 * When !PT_SEIZED, it's used only for group stop trap with stop signal
 * number as exit_code and no siginfo.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held, which may be
 * released and re-acquired before returning with intervening sleep.
*/ static void do_jobctl_trap(void) { struct signal_struct *signal = current->signal; int signr = current->jobctl & JOBCTL_STOP_SIGMASK; if (current->ptrace & PT_SEIZED) { if (!signal->group_stop_count && !(signal->flags & SIGNAL_STOP_STOPPED)) signr = SIGTRAP; WARN_ON_ONCE(!signr); ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), CLD_STOPPED, 0); } else { WARN_ON_ONCE(!signr); ptrace_stop(signr, CLD_STOPPED, 0, NULL); } } /** * do_freezer_trap - handle the freezer jobctl trap * * Puts the task into frozen state, if only the task is not about to quit. * In this case it drops JOBCTL_TRAP_FREEZE. * * CONTEXT: * Must be called with @current->sighand->siglock held, * which is always released before returning. */ static void do_freezer_trap(void) __releases(&current->sighand->siglock) { /* * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, * let's make another loop to give it a chance to be handled. * In any case, we'll return back. */ if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != JOBCTL_TRAP_FREEZE) { spin_unlock_irq(&current->sighand->siglock); return; } /* * Now we're sure that there is no pending fatal signal and no * pending traps. Clear TIF_SIGPENDING to not get out of schedule() * immediately (if there is a non-fatal signal pending), and * put the task into sleep. */ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); clear_thread_flag(TIF_SIGPENDING); spin_unlock_irq(&current->sighand->siglock); cgroup_enter_frozen(); schedule(); /* * We could've been woken by task_work, run it to clear * TIF_NOTIFY_SIGNAL. The caller will retry if necessary. */ clear_notify_signal(); if (unlikely(task_work_pending(current))) task_work_run(); } static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) { /* * We do not check sig_kernel_stop(signr) but set this marker * unconditionally because we do not know whether debugger will * change signr. 
This flag has no meaning unless we are going * to stop after return from ptrace_stop(). In this case it will * be checked in do_signal_stop(), we should only stop if it was * not cleared by SIGCONT while we were sleeping. See also the * comment in dequeue_signal(). */ current->jobctl |= JOBCTL_STOP_DEQUEUED; signr = ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ if (signr == 0) return signr; /* * Update the siginfo structure if the signal has * changed. If the debugger wanted something * specific in the siginfo structure then it should * have updated *info via PTRACE_SETSIGINFO. */ if (signr != info->si_signo) { clear_siginfo(info); info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); info->si_uid = from_kuid_munged(current_user_ns(), task_uid(current->parent)); rcu_read_unlock(); } /* If the (new) signal is now blocked, requeue it. */ if (sigismember(&current->blocked, signr) || fatal_signal_pending(current)) { send_signal_locked(signr, info, current, type); signr = 0; } return signr; } static void hide_si_addr_tag_bits(struct ksignal *ksig) { switch (siginfo_layout(ksig->sig, ksig->info.si_code)) { case SIL_FAULT: case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: case SIL_FAULT_PERF_EVENT: ksig->info.si_addr = arch_untagged_si_addr( ksig->info.si_addr, ksig->sig, ksig->info.si_code); break; case SIL_KILL: case SIL_TIMER: case SIL_POLL: case SIL_CHLD: case SIL_RT: case SIL_SYS: break; } } bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; int signr; clear_notify_signal(); if (unlikely(task_work_pending(current))) task_work_run(); if (!task_sigpending(current)) return false; if (unlikely(uprobe_deny_signal())) return false; /* * Do this once, we can't return to user-mode if freezing() == T. 
* do_signal_stop() and ptrace_stop() set TASK_STOPPED/TASK_TRACED * and the freezer handles those states via TASK_FROZEN, thus they * do not need another check after return. */ try_to_freeze(); relock: spin_lock_irq(&sighand->siglock); /* * Every stopped thread goes here after wakeup. Check to see if * we should notify the parent, prepare_signal(SIGCONT) encodes * the CLD_ si_code into SIGNAL_CLD_MASK bits. */ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { int why; if (signal->flags & SIGNAL_CLD_CONTINUED) why = CLD_CONTINUED; else why = CLD_STOPPED; signal->flags &= ~SIGNAL_CLD_MASK; spin_unlock_irq(&sighand->siglock); /* * Notify the parent that we're continuing. This event is * always per-process and doesn't make whole lot of sense * for ptracers, who shouldn't consume the state via * wait(2) either, but, for backward compatibility, notify * the ptracer of the group leader too unless it's gonna be * a duplicate. */ read_lock(&tasklist_lock); do_notify_parent_cldstop(current, false, why); if (ptrace_reparented(current->group_leader)) do_notify_parent_cldstop(current->group_leader, true, why); read_unlock(&tasklist_lock); goto relock; } for (;;) { struct k_sigaction *ka; enum pid_type type; /* Has this task already been marked for death? */ if ((signal->flags & SIGNAL_GROUP_EXIT) || signal->group_exec_task) { signr = SIGKILL; sigdelset(&current->pending.signal, SIGKILL); trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, &sighand->action[SIGKILL-1]); recalc_sigpending(); /* * implies do_group_exit() or return to PF_USER_WORKER, * no need to initialize ksig->info/etc. 
*/ goto fatal; } if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && do_signal_stop(0)) goto relock; if (unlikely(current->jobctl & (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { if (current->jobctl & JOBCTL_TRAP_MASK) { do_jobctl_trap(); spin_unlock_irq(&sighand->siglock); } else if (current->jobctl & JOBCTL_TRAP_FREEZE) do_freezer_trap(); goto relock; } /* * If the task is leaving the frozen state, let's update * cgroup counters and reset the frozen bit. */ if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); cgroup_leave_frozen(false); goto relock; } /* * Signals generated by the execution of an instruction * need to be delivered before any other pending signals * so that the instruction pointer in the signal stack * frame points to the faulting instruction. */ type = PIDTYPE_PID; signr = dequeue_synchronous_signal(&ksig->info); if (!signr) signr = dequeue_signal(&current->blocked, &ksig->info, &type); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && (signr != SIGKILL) && !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) { signr = ptrace_signal(signr, &ksig->info, type); if (!signr) continue; } ka = &sighand->action[signr-1]; /* Trace actually delivered signals. */ trace_signal_deliver(signr, &ksig->info, ka); if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { /* Run the handler. */ ksig->ka = *ka; if (ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; break; /* will return non-zero "signr" value */ } /* * Now we are doing the default action for this signal. */ if (sig_kernel_ignore(signr)) /* Default is nothing. */ continue; /* * Global init gets no signals it doesn't want. * Container-init gets no signals it doesn't want from same * container. * * Note that if global/container-init sees a sig_kernel_only() * signal here, the signal must have been generated internally * or must have come from an ancestor namespace. 
In either * case, the signal cannot be dropped. */ if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && !sig_kernel_only(signr)) continue; if (sig_kernel_stop(signr)) { /* * The default action is to stop all threads in * the thread group. The job control signals * do nothing in an orphaned pgrp, but SIGSTOP * always works. Note that siglock needs to be * dropped during the call to is_orphaned_pgrp() * because of lock ordering with tasklist_lock. * This allows an intervening SIGCONT to be posted. * We need to check for that and bail out if necessary. */ if (signr != SIGSTOP) { spin_unlock_irq(&sighand->siglock); /* signals can be posted during this window */ if (is_current_pgrp_orphaned()) goto relock; spin_lock_irq(&sighand->siglock); } if (likely(do_signal_stop(signr))) { /* It released the siglock. */ goto relock; } /* * We didn't actually stop, due to a race * with SIGCONT or something like that. */ continue; } fatal: spin_unlock_irq(&sighand->siglock); if (unlikely(cgroup_task_frozen(current))) cgroup_leave_frozen(true); /* * Anything else is fatal, maybe with a core dump. */ current->flags |= PF_SIGNALED; if (sig_kernel_coredump(signr)) { if (print_fatal_signals) print_fatal_signal(signr); proc_coredump_connector(current); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with * their demise. If we lost the race with another * thread getting here, it set group_exit_code * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ vfs_coredump(&ksig->info); } /* * PF_USER_WORKER threads will catch and exit on fatal signals * themselves. They have cleanup that must be performed, so we * cannot call do_exit() on their behalf. Note that ksig won't * be properly initialized, PF_USER_WORKER's shouldn't use it. */ if (current->flags & PF_USER_WORKER) goto out; /* * Death signals, no core dump. 
*/ do_group_exit(signr); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); ksig->sig = signr; if (signr && !(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) hide_si_addr_tag_bits(ksig); out: return signr > 0; } /** * signal_delivered - called after signal delivery to update blocked signals * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask * is always blocked), and the signal itself is blocked unless %SA_NODEFER * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ static void signal_delivered(struct ksignal *ksig, int stepping) { sigset_t blocked; /* A signal was successfully delivered, and the saved sigmask was stored on the signal frame, and will be restored by sigreturn. So we can simply clear the restore sigmask flag. */ clear_restore_sigmask(); sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask); if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) sigaddset(&blocked, ksig->sig); set_current_blocked(&blocked); if (current->sas_ss_flags & SS_AUTODISARM) sas_ss_reset(current); if (stepping) ptrace_notify(SIGTRAP, 0); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) { if (failed) force_sigsegv(ksig->sig); else signal_delivered(ksig, stepping); } /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take * the shared signals in @which since we will not. */ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) { sigset_t retarget; struct task_struct *t; sigandsets(&retarget, &tsk->signal->shared_pending.signal, which); if (sigisemptyset(&retarget)) return; for_other_threads(tsk, t) { if (t->flags & PF_EXITING) continue; if (!has_pending_signals(&retarget, &t->blocked)) continue; /* Remove the signals this thread can handle. 
*/ sigandsets(&retarget, &retarget, &t->blocked); if (!task_sigpending(t)) signal_wake_up(t, 0); if (sigisemptyset(&retarget)) break; } } void exit_signals(struct task_struct *tsk) { int group_stop = 0; sigset_t unblocked; /* * @tsk is about to have PF_EXITING set - lock out users which * expect stable threadgroup. */ cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; } spin_lock_irq(&tsk->sighand->siglock); /* * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); if (!task_sigpending(tsk)) goto out; unblocked = tsk->blocked; signotset(&unblocked); retarget_shared_pending(tsk, &unblocked); if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) && task_participate_group_stop(tsk)) group_stop = CLD_STOPPED; out: spin_unlock_irq(&tsk->sighand->siglock); /* * If group stop has completed, deliver the notification. This * should always go to the real parent of the group leader. */ if (unlikely(group_stop)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(tsk, false, group_stop); read_unlock(&tasklist_lock); } } /* * System call entry points. */ /** * sys_restart_syscall - restart a system call */ SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = &current->restart_block; return restart->fn(restart); } long do_no_restart_syscall(struct restart_block *param) { return -EINTR; } static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) { if (task_sigpending(tsk) && !thread_group_empty(tsk)) { sigset_t newblocked; /* A set of now blocked but previously unblocked signals. 
*/ sigandnsets(&newblocked, newset, &current->blocked);
		retarget_shared_pending(tsk, &newblocked);
	}
	tsk->blocked = *newset;
	recalc_sigpending();
}

/**
 * set_current_blocked - change current->blocked mask
 * @newset: new mask
 *
 * It is wrong to change ->blocked directly, this helper should be used
 * to ensure the process can't miss a shared signal we are going to block.
 *
 * SIGKILL and SIGSTOP are removed from @newset first; note that this
 * modifies the caller's set in place.
 */
void set_current_blocked(sigset_t *newset)
{
	sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
	__set_current_blocked(newset);
}

/*
 * Install @newset as current's blocked mask as-is: unlike
 * set_current_blocked(), SIGKILL and SIGSTOP are not filtered out here.
 */
void __set_current_blocked(const sigset_t *newset)
{
	struct task_struct *tsk = current;

	/*
	 * In case the signal mask hasn't changed, there is nothing we need
	 * to do. The current->blocked shouldn't be modified by other task.
	 */
	if (sigequalsets(&tsk->blocked, newset))
		return;

	spin_lock_irq(&tsk->sighand->siglock);
	__set_task_blocked(tsk, newset);
	spin_unlock_irq(&tsk->sighand->siglock);
}

/*
 * This is also useful for kernel threads that want to temporarily
 * (or permanently) block certain signals.
 *
 * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel
 * interface happily blocks "unblockable" signals like SIGKILL
 * and friends.
 *
 * @how selects SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK; any other value
 * returns -EINVAL.  @oldset, if non-NULL, receives the previous mask; it
 * is written before @how is validated.  Returns 0 on success.
 */
int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
{
	struct task_struct *tsk = current;
	sigset_t newset;

	/* Lockless, only current can change ->blocked, never from irq */
	if (oldset)
		*oldset = tsk->blocked;

	switch (how) {
	case SIG_BLOCK:
		/* newset = blocked | set */
		sigorsets(&newset, &tsk->blocked, set);
		break;
	case SIG_UNBLOCK:
		/* newset = blocked & ~set */
		sigandnsets(&newset, &tsk->blocked, set);
		break;
	case SIG_SETMASK:
		newset = *set;
		break;
	default:
		return -EINVAL;
	}

	__set_current_blocked(&newset);
	return 0;
}
EXPORT_SYMBOL(sigprocmask);

/*
 * The api helps set app-provided sigmasks.
 *
 * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
 * epoll_pwait where a new sigmask is passed from userland for the syscalls.
*
 * Note that it does set_restore_sigmask() in advance, so it must be always
 * paired with restore_saved_sigmask_unless() before return from syscall.
 */
int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
	sigset_t kmask;

	/* No mask supplied: leave everything unchanged. */
	if (!umask)
		return 0;
	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;
	if (copy_from_user(&kmask, umask, sizeof(sigset_t)))
		return -EFAULT;

	/* Save the current mask before installing the user-provided one. */
	set_restore_sigmask();
	current->saved_sigmask = current->blocked;
	set_current_blocked(&kmask);

	return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of set_user_sigmask(): the same steps, but the size
 * is checked against compat_sigset_t and the mask is fetched with
 * get_compat_sigset().
 */
int set_compat_user_sigmask(const compat_sigset_t __user *umask,
			    size_t sigsetsize)
{
	sigset_t kmask;

	if (!umask)
		return 0;
	if (sigsetsize != sizeof(compat_sigset_t))
		return -EINVAL;
	if (get_compat_sigset(&kmask, umask))
		return -EFAULT;

	set_restore_sigmask();
	current->saved_sigmask = current->blocked;
	set_current_blocked(&kmask);

	return 0;
}
#endif

/**
 *  sys_rt_sigprocmask - change the list of currently blocked signals
 *  @how: whether to add, remove, or set signals
 *  @nset: signals to add, remove, or set (if non-null)
 *  @oset: previous value of signal mask if non-null
 *  @sigsetsize: size of sigset_t type
 *
 *  Return: 0 on success, -EINVAL for a wrong @sigsetsize or an unknown
 *  @how, -EFAULT if a user copy fails.
 */
SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
		sigset_t __user *, oset, size_t, sigsetsize)
{
	sigset_t old_set, new_set;
	int error;

	/* XXX: Don't preclude handling different sized sigset_t's.  */
	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	/* Snapshot the mask before it is possibly changed below. */
	old_set = current->blocked;

	if (nset) {
		if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
			return -EFAULT;
		sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));

		error = sigprocmask(how, &new_set, NULL);
		if (error)
			return error;
	}

	if (oset) {
		if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
			return -EFAULT;
	}

	return 0;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
		compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
	sigset_t old_set = current->blocked;

	/* XXX: Don't preclude handling different sized sigset_t's.
*/ if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	if (nset) {
		sigset_t new_set;
		int error;
		if (get_compat_sigset(&new_set, nset))
			return -EFAULT;
		sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));

		error = sigprocmask(how, &new_set, NULL);
		if (error)
			return error;
	}
	return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0;
}
#endif

/*
 * Store in @set the signals that are pending for the current task
 * (privately or shared with its thread group) and that are also in its
 * blocked mask.
 */
static void do_sigpending(sigset_t *set)
{
	spin_lock_irq(&current->sighand->siglock);
	sigorsets(set, &current->pending.signal,
		  &current->signal->shared_pending.signal);
	spin_unlock_irq(&current->sighand->siglock);

	/* Outside the lock because only this thread touches it.  */
	sigandsets(set, &current->blocked, set);
}

/**
 *  sys_rt_sigpending - examine a pending signal that has been raised
 *			while blocked
 *  @uset: stores pending signals
 *  @sigsetsize: size of sigset_t type or larger
 *
 *  Return: 0 on success, -EINVAL if @sigsetsize exceeds sizeof(*@uset),
 *  -EFAULT if the copy to user space fails.
 */
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
	sigset_t set;

	if (sigsetsize > sizeof(*uset))
		return -EINVAL;

	do_sigpending(&set);

	/* Only the first @sigsetsize bytes are copied out. */
	if (copy_to_user(uset, &set, sigsetsize))
		return -EFAULT;

	return 0;
}

#ifdef CONFIG_COMPAT
/* Compat variant: same checks, result written with put_compat_sigset(). */
COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
		compat_size_t, sigsetsize)
{
	sigset_t set;

	if (sigsetsize > sizeof(*uset))
		return -EINVAL;

	do_sigpending(&set);

	return put_compat_sigset(uset, &set, sigsetsize);
}
#endif

/*
 * Per-signal table indexed by signal number.  The lookups in this file
 * compare si_code against .limit and use .layout as the siginfo layout
 * for codes that are within the limit.
 */
static const struct {
	unsigned char limit, layout;
} sig_sicodes[] = {
	[SIGILL]  = { NSIGILL,  SIL_FAULT },
	[SIGFPE]  = { NSIGFPE,  SIL_FAULT },
	[SIGSEGV] = { NSIGSEGV, SIL_FAULT },
	[SIGBUS]  = { NSIGBUS,  SIL_FAULT },
	[SIGTRAP] = { NSIGTRAP, SIL_FAULT },
#if defined(SIGEMT)
	[SIGEMT]  = { NSIGEMT,  SIL_FAULT },
#endif
	[SIGCHLD] = { NSIGCHLD, SIL_CHLD },
	[SIGPOLL] = { NSIGPOLL, SIL_POLL },
	[SIGSYS]  = { NSIGSYS,  SIL_SYS },
};

/*
 * Decide whether @si_code is a known code for @sig.
 * post_copy_siginfo_from_user() only inspects the expansion bytes of a
 * user siginfo when this returns false.
 */
static bool known_siginfo_layout(unsigned sig, int si_code)
{
	if (si_code == SI_KERNEL)
		return true;
	else if ((si_code > SI_USER)) {
		if (sig_specific_sicodes(sig)) {
			if (si_code <= sig_sicodes[sig].limit)
				return true;
		}
		else if (si_code <=
NSIGPOLL) return true; } else if (si_code >= SI_DETHREAD) return true; else if (si_code == SI_ASYNCNL) return true; return false; } enum siginfo_layout siginfo_layout(unsigned sig, int si_code) { enum siginfo_layout layout = SIL_KILL; if ((si_code > SI_USER) && (si_code < SI_KERNEL)) { if ((sig < ARRAY_SIZE(sig_sicodes)) && (si_code <= sig_sicodes[sig].limit)) { layout = sig_sicodes[sig].layout; /* Handle the exceptions */ if ((sig == SIGBUS) && (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO)) layout = SIL_FAULT_MCEERR; else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR)) layout = SIL_FAULT_BNDERR; #ifdef SEGV_PKUERR else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR)) layout = SIL_FAULT_PKUERR; #endif else if ((sig == SIGTRAP) && (si_code == TRAP_PERF)) layout = SIL_FAULT_PERF_EVENT; else if (IS_ENABLED(CONFIG_SPARC) && (sig == SIGILL) && (si_code == ILL_ILLTRP)) layout = SIL_FAULT_TRAPNO; else if (IS_ENABLED(CONFIG_ALPHA) && ((sig == SIGFPE) || ((sig == SIGTRAP) && (si_code == TRAP_UNK)))) layout = SIL_FAULT_TRAPNO; } else if (si_code <= NSIGPOLL) layout = SIL_POLL; } else { if (si_code == SI_TIMER) layout = SIL_TIMER; else if (si_code == SI_SIGIO) layout = SIL_POLL; else if (si_code < 0) layout = SIL_RT; } return layout; } static inline char __user *si_expansion(const siginfo_t __user *info) { return ((char __user *)info) + sizeof(struct kernel_siginfo); } int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from) { char __user *expansion = si_expansion(to); if (copy_to_user(to, from , sizeof(struct kernel_siginfo))) return -EFAULT; if (clear_user(expansion, SI_EXPANSION_SIZE)) return -EFAULT; return 0; } static int post_copy_siginfo_from_user(kernel_siginfo_t *info, const siginfo_t __user *from) { if (unlikely(!known_siginfo_layout(info->si_signo, info->si_code))) { char __user *expansion = si_expansion(from); char buf[SI_EXPANSION_SIZE]; int i; /* * An unknown si_code might need more than * sizeof(struct kernel_siginfo) bytes. 
Verify all of the
		 * extra bytes are 0.  This guarantees copy_siginfo_to_user
		 * will return this data to userspace exactly.
		 */
		if (copy_from_user(&buf, expansion, SI_EXPANSION_SIZE))
			return -EFAULT;
		for (i = 0; i < SI_EXPANSION_SIZE; i++) {
			/* Any nonzero expansion byte is rejected. */
			if (buf[i] != 0)
				return -E2BIG;
		}
	}
	return 0;
}

/*
 * Like copy_siginfo_from_user(), but si_signo of the copied siginfo is
 * overwritten with @signo before the post-copy check runs.
 */
static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to,
				    const siginfo_t __user *from)
{
	if (copy_from_user(to, from, sizeof(struct kernel_siginfo)))
		return -EFAULT;
	to->si_signo = signo;
	return post_copy_siginfo_from_user(to, from);
}

/*
 * Copy sizeof(struct kernel_siginfo) bytes from the user siginfo @from
 * into @to, then run post_copy_siginfo_from_user().
 * Returns 0 on success, -EFAULT if a user copy fails, or -E2BIG from the
 * post-copy check.
 */
int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from)
{
	if (copy_from_user(to, from, sizeof(struct kernel_siginfo)))
		return -EFAULT;
	return post_copy_siginfo_from_user(to, from);
}

#ifdef CONFIG_COMPAT
/**
 * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo
 * @to: compat siginfo destination
 * @from: kernel siginfo source
 *
 * Note: This function does not work properly for the SIGCHLD on x32, but
 * fortunately it doesn't have to.  The only valid callers for this function are
 * copy_siginfo_to_user32, which is overriden for x32 and the coredump code.
 * The latter does not care because SIGCHLD will never cause a coredump.
*/
void copy_siginfo_to_external32(struct compat_siginfo *to,
		const struct kernel_siginfo *from)
{
	/* Start from all zeroes; only the fields set below are non-zero. */
	memset(to, 0, sizeof(*to));

	/* The three header fields are copied for every layout. */
	to->si_signo = from->si_signo;
	to->si_errno = from->si_errno;
	to->si_code  = from->si_code;
	/* The layout decides which further fields are copied. */
	switch(siginfo_layout(from->si_signo, from->si_code)) {
	case SIL_KILL:
		to->si_pid = from->si_pid;
		to->si_uid = from->si_uid;
		break;
	case SIL_TIMER:
		to->si_tid     = from->si_tid;
		to->si_overrun = from->si_overrun;
		to->si_int     = from->si_int;
		break;
	case SIL_POLL:
		to->si_band = from->si_band;
		to->si_fd   = from->si_fd;
		break;
	case SIL_FAULT:
		/* Pointer-valued fields go through ptr_to_compat(). */
		to->si_addr = ptr_to_compat(from->si_addr);
		break;
	case SIL_FAULT_TRAPNO:
		to->si_addr = ptr_to_compat(from->si_addr);
		to->si_trapno = from->si_trapno;
		break;
	case SIL_FAULT_MCEERR:
		to->si_addr = ptr_to_compat(from->si_addr);
		to->si_addr_lsb = from->si_addr_lsb;
		break;
	case SIL_FAULT_BNDERR:
		to->si_addr = ptr_to_compat(from->si_addr);
		to->si_lower = ptr_to_compat(from->si_lower);
		to->si_upper = ptr_to_compat(from->si_upper);
		break;
	case SIL_FAULT_PKUERR:
		to->si_addr = ptr_to_compat(from->si_addr);
		to->si_pkey = from->si_pkey;
		break;
	case SIL_FAULT_PERF_EVENT:
		to->si_addr = ptr_to_compat(from->si_addr);
		to->si_perf_data = from->si_perf_data;
		to->si_perf_type = from->si_perf_type;
		to->si_perf_flags = from->si_perf_flags;
		break;
	case SIL_CHLD:
		to->si_pid = from->si_pid;
		to->si_uid = from->si_uid;
		to->si_status = from->si_status;
		to->si_utime = from->si_utime;
		to->si_stime = from->si_stime;
		break;
	case SIL_RT:
		to->si_pid = from->si_pid;
		to->si_uid = from->si_uid;
		to->si_int = from->si_int;
		break;
	case SIL_SYS:
		to->si_call_addr = ptr_to_compat(from->si_call_addr);
		to->si_syscall   = from->si_syscall;
		to->si_arch      = from->si_arch;
		break;
	}
}

/*
 * Convert @from into a compat siginfo on the stack and copy it to the
 * user buffer @to.  Returns 0 on success or -EFAULT if the copy fails.
 */
int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
			   const struct kernel_siginfo *from)
{
	struct compat_siginfo new;

	copy_siginfo_to_external32(&new, from);
	if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
		return -EFAULT;
	return 0;
}

static int
post_copy_siginfo_from_user32(kernel_siginfo_t *to,
			      const struct compat_siginfo *from)
{
	/*
	 * Inverse direction of copy_siginfo_to_external32(): fill the kernel
	 * siginfo @to from the compat siginfo @from (already in kernel
	 * memory).  Always returns 0.
	 */
	clear_siginfo(to);
	to->si_signo = from->si_signo;
	to->si_errno = from->si_errno;
	to->si_code  = from->si_code;
	/* The layout decides which further fields are copied. */
	switch(siginfo_layout(from->si_signo, from->si_code)) {
	case SIL_KILL:
		to->si_pid = from->si_pid;
		to->si_uid = from->si_uid;
		break;
	case SIL_TIMER:
		to->si_tid     = from->si_tid;
		to->si_overrun = from->si_overrun;
		to->si_int     = from->si_int;
		break;
	case SIL_POLL:
		to->si_band = from->si_band;
		to->si_fd   = from->si_fd;
		break;
	case SIL_FAULT:
		/* Pointer-valued fields go through compat_ptr(). */
		to->si_addr = compat_ptr(from->si_addr);
		break;
	case SIL_FAULT_TRAPNO:
		to->si_addr = compat_ptr(from->si_addr);
		to->si_trapno = from->si_trapno;
		break;
	case SIL_FAULT_MCEERR:
		to->si_addr = compat_ptr(from->si_addr);
		to->si_addr_lsb = from->si_addr_lsb;
		break;
	case SIL_FAULT_BNDERR:
		to->si_addr = compat_ptr(from->si_addr);
		to->si_lower = compat_ptr(from->si_lower);
		to->si_upper = compat_ptr(from->si_upper);
		break;
	case SIL_FAULT_PKUERR:
		to->si_addr = compat_ptr(from->si_addr);
		to->si_pkey = from->si_pkey;
		break;
	case SIL_FAULT_PERF_EVENT:
		to->si_addr = compat_ptr(from->si_addr);
		to->si_perf_data = from->si_perf_data;
		to->si_perf_type = from->si_perf_type;
		to->si_perf_flags = from->si_perf_flags;
		break;
	case SIL_CHLD:
		to->si_pid    = from->si_pid;
		to->si_uid    = from->si_uid;
		to->si_status = from->si_status;
#ifdef CONFIG_X86_X32_ABI
		/* In an x32 syscall the times are read from _sigchld_x32. */
		if (in_x32_syscall()) {
			to->si_utime = from->_sifields._sigchld_x32._utime;
			to->si_stime = from->_sifields._sigchld_x32._stime;
		} else
#endif
		{
			to->si_utime = from->si_utime;
			to->si_stime = from->si_stime;
		}
		break;
	case SIL_RT:
		to->si_pid = from->si_pid;
		to->si_uid = from->si_uid;
		to->si_int = from->si_int;
		break;
	case SIL_SYS:
		to->si_call_addr = compat_ptr(from->si_call_addr);
		to->si_syscall   = from->si_syscall;
		to->si_arch      = from->si_arch;
		break;
	}
	return 0;
}

static int __copy_siginfo_from_user32(int signo, struct kernel_siginfo *to,
				      const struct compat_siginfo __user *ufrom)
{
	struct compat_siginfo from;

	if
(copy_from_user(&from, ufrom, sizeof(struct compat_siginfo)))
		return -EFAULT;

	/* The caller-supplied signal number replaces the one from user space. */
	from.si_signo = signo;
	return post_copy_siginfo_from_user32(to, &from);
}

/*
 * Read a compat siginfo from user space and convert it into @to.
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
int copy_siginfo_from_user32(struct kernel_siginfo *to,
			     const struct compat_siginfo __user *ufrom)
{
	struct compat_siginfo from;

	if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo)))
		return -EFAULT;

	return post_copy_siginfo_from_user32(to, &from);
}
#endif /* CONFIG_COMPAT */

/**
 *  do_sigtimedwait - wait for queued signals specified in @which
 *  @which: queued signals to wait for
 *  @info: if non-null, the signal's siginfo is returned here
 *  @ts: upper bound on process time suspension
 *
 *  Return: the number of the dequeued signal; -EINVAL if @ts is not a
 *  valid timespec; otherwise -EINTR or -EAGAIN, depending on the value
 *  returned by the timed sleep.
 */
static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
		    const struct timespec64 *ts)
{
	ktime_t *to = NULL, timeout = KTIME_MAX;
	struct task_struct *tsk = current;
	sigset_t mask = *which;
	enum pid_type type;
	int sig, ret = 0;

	/* Without @ts, @to stays NULL and @timeout stays KTIME_MAX. */
	if (ts) {
		if (!timespec64_valid(ts))
			return -EINVAL;
		timeout = timespec64_to_ktime(*ts);
		to = &timeout;
	}

	/*
	 * Invert the set of allowed signals to get those we want to block.
	 */
	sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
	signotset(&mask);

	spin_lock_irq(&tsk->sighand->siglock);
	sig = dequeue_signal(&mask, info, &type);
	/* A zero timeout means: do not sleep at all. */
	if (!sig && timeout) {
		/*
		 * None ready, temporarily unblock those we're interested
		 * while we are sleeping in so that we'll be awakened when
		 * they arrive. Unblocking is always fine, we can avoid
		 * set_current_blocked().
		 */
		tsk->real_blocked = tsk->blocked;
		sigandsets(&tsk->blocked, &tsk->blocked, &mask);
		recalc_sigpending();
		spin_unlock_irq(&tsk->sighand->siglock);

		__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
		ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns,
					       HRTIMER_MODE_REL);
		spin_lock_irq(&tsk->sighand->siglock);
		/* Restore the mask saved above, then try once more. */
		__set_task_blocked(tsk, &tsk->real_blocked);
		sigemptyset(&tsk->real_blocked);
		sig = dequeue_signal(&mask, info, &type);
	}
	spin_unlock_irq(&tsk->sighand->siglock);

	if (sig)
		return sig;
	return ret ?
-EINTR : -EAGAIN;
}

/**
 *  sys_rt_sigtimedwait - synchronously wait for queued signals specified
 *			in @uthese
 *  @uthese: queued signals to wait for
 *  @uinfo: if non-null, the signal's siginfo is returned here
 *  @uts: upper bound on process time suspension
 *  @sigsetsize: size of sigset_t type
 *
 *  Return: the result of do_sigtimedwait(); -EINVAL for a wrong
 *  @sigsetsize; -EFAULT if a user copy fails.
 */
SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
		siginfo_t __user *, uinfo,
		const struct __kernel_timespec __user *, uts,
		size_t, sigsetsize)
{
	sigset_t these;
	struct timespec64 ts;
	kernel_siginfo_t info;
	int ret;

	/* XXX: Don't preclude handling different sized sigset_t's.  */
	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	if (copy_from_user(&these, uthese, sizeof(these)))
		return -EFAULT;

	if (uts) {
		if (get_timespec64(&ts, uts))
			return -EFAULT;
	}

	/* A NULL @uts is passed on as "no timeout". */
	ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);

	/* The siginfo is copied out only when a signal was dequeued. */
	if (ret > 0 && uinfo) {
		if (copy_siginfo_to_user(uinfo, &info))
			ret = -EFAULT;
	}

	return ret;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Same as rt_sigtimedwait above, except that the timeout is read as a
 * struct old_timespec32 via get_old_timespec32().
 */
SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese,
		siginfo_t __user *, uinfo,
		const struct old_timespec32 __user *, uts,
		size_t, sigsetsize)
{
	sigset_t these;
	struct timespec64 ts;
	kernel_siginfo_t info;
	int ret;

	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	if (copy_from_user(&these, uthese, sizeof(these)))
		return -EFAULT;

	if (uts) {
		if (get_old_timespec32(&ts, uts))
			return -EFAULT;
	}

	ret = do_sigtimedwait(&these, &info, uts ?
&ts : NULL);

	if (ret > 0 && uinfo) {
		if (copy_siginfo_to_user(uinfo, &info))
			ret = -EFAULT;
	}

	return ret;
}
#endif

#ifdef CONFIG_COMPAT
/*
 * Compat entry point taking a 64-bit timespec: the signal set is read
 * with get_compat_sigset() and the siginfo is written back with
 * copy_siginfo_to_user32().
 */
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,
		struct compat_siginfo __user *, uinfo,
		struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize)
{
	sigset_t s;
	struct timespec64 t;
	kernel_siginfo_t info;
	long ret;

	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	if (get_compat_sigset(&s, uthese))
		return -EFAULT;

	if (uts) {
		if (get_timespec64(&t, uts))
			return -EFAULT;
	}

	ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);

	/* The siginfo is copied out only when a signal was dequeued. */
	if (ret > 0 && uinfo) {
		if (copy_siginfo_to_user32(uinfo, &info))
			ret = -EFAULT;
	}

	return ret;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Compat entry point taking a struct old_timespec32 timeout, read with
 * get_old_timespec32(); otherwise identical to the time64 variant above.
 */
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
		struct compat_siginfo __user *, uinfo,
		struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)
{
	sigset_t s;
	struct timespec64 t;
	kernel_siginfo_t info;
	long ret;

	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	if (get_compat_sigset(&s, uthese))
		return -EFAULT;

	if (uts) {
		if (get_old_timespec32(&t, uts))
			return -EFAULT;
	}

	ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);

	if (ret > 0 && uinfo) {
		if (copy_siginfo_to_user32(uinfo, &info))
			ret = -EFAULT;
	}

	return ret;
}
#endif
#endif

/*
 * Fill @info for signal @sig sent by the current task: si_code is
 * SI_TKILL when @type is PIDTYPE_PID and SI_USER otherwise, and
 * si_pid/si_uid are taken from the current task.
 */
static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info,
				 enum pid_type type)
{
	clear_siginfo(info);
	info->si_signo = sig;
	info->si_errno = 0;
	info->si_code = (type == PIDTYPE_PID) ?
SI_TKILL : SI_USER; info->si_pid = task_tgid_vnr(current); info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); } /** * sys_kill - send a signal to a process * @pid: the PID of the process * @sig: signal to be sent */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; prepare_kill_siginfo(sig, &info, PIDTYPE_TGID); return kill_something_info(sig, &info, pid); } /* * Verify that the signaler and signalee either are in the same pid namespace * or that the signaler's pid namespace is an ancestor of the signalee's pid * namespace. */ static bool access_pidfd_pidns(struct pid *pid) { struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *p = ns_of_pid(pid); for (;;) { if (!p) return false; if (p == active) break; p = p->parent; } return true; } static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t __user *info) { #ifdef CONFIG_COMPAT /* * Avoid hooking up compat syscalls and instead handle necessary * conversions here. Note, this is a stop-gap measure and should not be * considered a generic solution. 
*/ if (in_compat_syscall())
		return copy_siginfo_from_user32(
			kinfo, (struct compat_siginfo __user *)info);
#endif
	return copy_siginfo_from_user(kinfo, info);
}

/*
 * Resolve @file to a struct pid: try pidfd_pid() first and, if that
 * yields an error pointer, return whatever tgid_pidfd_to_pid() gives.
 */
static struct pid *pidfd_to_pid(const struct file *file)
{
	struct pid *pid;

	pid = pidfd_pid(file);
	if (!IS_ERR(pid))
		return pid;

	return tgid_pidfd_to_pid(file);
}

/* All flag bits accepted by pidfd_send_signal(). */
#define PIDFD_SEND_SIGNAL_FLAGS                            \
	(PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \
	 PIDFD_SIGNAL_PROCESS_GROUP)

/*
 * Send @sig to @pid with scope @type.
 *
 * A scope flag in @flags overrides @type.  With a user-supplied @info,
 * its si_signo must equal @sig (-EINVAL otherwise), and si_code values
 * that are >= 0 or SI_TKILL are only accepted when the target is the
 * caller itself with a scope no wider than PIDTYPE_TGID (-EPERM
 * otherwise).  Without @info, a default siginfo is prepared.
 */
static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type,
				siginfo_t __user *info, unsigned int flags)
{
	kernel_siginfo_t kinfo;

	/* An explicit scope flag replaces the scope passed in by the caller. */
	switch (flags) {
	case PIDFD_SIGNAL_THREAD:
		type = PIDTYPE_PID;
		break;
	case PIDFD_SIGNAL_THREAD_GROUP:
		type = PIDTYPE_TGID;
		break;
	case PIDFD_SIGNAL_PROCESS_GROUP:
		type = PIDTYPE_PGID;
		break;
	}

	if (info) {
		int ret;

		ret = copy_siginfo_from_user_any(&kinfo, info);
		if (unlikely(ret))
			return ret;

		if (unlikely(sig != kinfo.si_signo))
			return -EINVAL;

		/* Only allow sending arbitrary signals to yourself. */
		if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
		    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
			return -EPERM;
	} else {
		prepare_kill_siginfo(sig, &kinfo, type);
	}

	/* Process-group scope uses a different delivery helper. */
	if (type == PIDTYPE_PGID)
		return kill_pgrp_info(sig, &kinfo, pid);

	return kill_pid_info_type(sig, &kinfo, pid, type);
}

/**
 * sys_pidfd_send_signal - Signal a process through a pidfd
 * @pidfd:  file descriptor of the process
 * @sig:    signal to send
 * @info:   signal info
 * @flags:  0, or exactly one of PIDFD_SIGNAL_THREAD,
 *          PIDFD_SIGNAL_THREAD_GROUP and PIDFD_SIGNAL_PROCESS_GROUP
 *
 * Send the signal to the thread group or to the individual thread depending
 * on PIDFD_THREAD.
 * A scope flag in @flags overrides the default scope of @pidfd.
 *
 * Return: 0 on success, negative errno on failure
 */
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
		siginfo_t __user *, info, unsigned int, flags)
{
	struct pid *pid;
	enum pid_type type;
	int ret;

	/* Enforce flags be set to 0 until we add an extension.
*/ if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
		return -EINVAL;

	/* Ensure that only a single signal scope determining flag is set. */
	if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
		return -EINVAL;

	switch (pidfd) {
	case PIDFD_SELF_THREAD:
		/* Reference taken here is dropped by put_pid() below. */
		pid = get_task_pid(current, PIDTYPE_PID);
		type = PIDTYPE_PID;
		break;
	case PIDFD_SELF_THREAD_GROUP:
		pid = get_task_pid(current, PIDTYPE_TGID);
		type = PIDTYPE_TGID;
		break;
	default: {
		CLASS(fd, f)(pidfd);
		if (fd_empty(f))
			return -EBADF;

		/* Is this a pidfd? */
		pid = pidfd_to_pid(fd_file(f));
		if (IS_ERR(pid))
			return PTR_ERR(pid);

		if (!access_pidfd_pidns(pid))
			return -EINVAL;

		/* Infer scope from the type of pidfd. */
		if (fd_file(f)->f_flags & PIDFD_THREAD)
			type = PIDTYPE_PID;
		else
			type = PIDTYPE_TGID;

		/* This path returns directly and does not call put_pid(). */
		return do_pidfd_send_signal(pid, sig, type, info, flags);
	}
	}

	ret = do_pidfd_send_signal(pid, sig, type, info, flags);
	put_pid(pid);

	return ret;
}

/*
 * Send @sig with @info to the single task whose pid is @pid.  When @tgid
 * is positive the task must also belong to that thread group.  Returns
 * -ESRCH when no matching task is found.
 */
static int
do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
{
	struct task_struct *p;
	int error = -ESRCH;

	rcu_read_lock();
	p = find_task_by_vpid(pid);
	if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
		error = check_kill_permission(sig, info, p);
		/*
		 * The null signal is a permissions and process existence
		 * probe.  No signal is actually delivered.
		 */
		if (!error && sig) {
			error = do_send_sig_info(sig, info, p, PIDTYPE_PID);
			/*
			 * If lock_task_sighand() failed we pretend the task
			 * dies after receiving the signal. The window is tiny,
			 * and the signal is private anyway.
*/ if (unlikely(error == -ESRCH))
				error = 0;
		}
	}
	rcu_read_unlock();

	return error;
}

/*
 * Common helper for tkill and tgkill: prepare a PIDTYPE_PID siginfo and
 * send @sig to the task @pid via do_send_specific().
 */
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
	struct kernel_siginfo info;

	prepare_kill_siginfo(sig, &info, PIDTYPE_PID);

	return do_send_specific(tgid, pid, sig, &info);
}

/**
 *  sys_tgkill - send signal to one specific thread
 *  @tgid: the thread group ID of the thread
 *  @pid: the PID of the thread
 *  @sig: signal to be sent
 *
 *  This syscall also checks the @tgid and returns -ESRCH even if the PID
 *  exists but it's not belonging to the target process anymore. This
 *  method solves the problem of threads exiting and PIDs getting reused.
 */
SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
{
	/* This is only valid for single tasks */
	if (pid <= 0 || tgid <= 0)
		return -EINVAL;

	return do_tkill(tgid, pid, sig);
}

/**
 *  sys_tkill - send signal to one specific task
 *  @pid: the PID of the task
 *  @sig: signal to be sent
 *
 *  Send a signal to only one task, even if it's a CLONE_THREAD task.
 */
SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
{
	/* This is only valid for single tasks */
	if (pid <= 0)
		return -EINVAL;

	/* A tgid of 0 disables the thread-group check in do_send_specific(). */
	return do_tkill(0, pid, sig);
}

/*
 * Validate the user-supplied siginfo @info and send @sig to process @pid.
 * Returns -EPERM when si_code is >= 0 or SI_TKILL and the target is not
 * the calling task.
 */
static int do_rt_sigqueueinfo(pid_t pid, int sig, kernel_siginfo_t *info)
{
	/* Not even root can pretend to send signals from the kernel.
	 * Nor can they impersonate a kill()/tgkill(), which adds source info.
	 */
	if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
	    (task_pid_vnr(current) != pid))
		return -EPERM;

	/* POSIX.1b doesn't mention process groups.
*/ return kill_proc_info(sig, info, pid);
}

/**
 *  sys_rt_sigqueueinfo - send a signal with signal information to a process
 *  @pid: the PID of the target
 *  @sig: signal to be sent
 *  @uinfo: signal info to be sent
 *
 *  The siginfo is copied from user space with its si_signo forced to
 *  @sig, then handed to do_rt_sigqueueinfo().
 */
SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
		siginfo_t __user *, uinfo)
{
	kernel_siginfo_t info;
	int ret = __copy_siginfo_from_user(sig, &info, uinfo);
	if (unlikely(ret))
		return ret;
	return do_rt_sigqueueinfo(pid, sig, &info);
}

#ifdef CONFIG_COMPAT
/* Compat variant: the siginfo is read with __copy_siginfo_from_user32(). */
COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
			compat_pid_t, pid,
			int, sig,
			struct compat_siginfo __user *, uinfo)
{
	kernel_siginfo_t info;
	int ret = __copy_siginfo_from_user32(sig, &info, uinfo);
	if (unlikely(ret))
		return ret;
	return do_rt_sigqueueinfo(pid, sig, &info);
}
#endif

/*
 * Thread-directed counterpart of do_rt_sigqueueinfo(): both @pid and
 * @tgid must be positive, and delivery goes through do_send_specific().
 */
static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, kernel_siginfo_t *info)
{
	/* This is only valid for single tasks */
	if (pid <= 0 || tgid <= 0)
		return -EINVAL;

	/* Not even root can pretend to send signals from the kernel.
	 * Nor can they impersonate a kill()/tgkill(), which adds source info.
*/ if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
	    (task_pid_vnr(current) != pid))
		return -EPERM;

	return do_send_specific(tgid, pid, sig, info);
}

/*
 * rt_tgsigqueueinfo: copy the siginfo from user space (si_signo forced to
 * @sig) and queue it to thread @pid of thread group @tgid.
 */
SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
		siginfo_t __user *, uinfo)
{
	kernel_siginfo_t info;
	int ret = __copy_siginfo_from_user(sig, &info, uinfo);
	if (unlikely(ret))
		return ret;
	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}

#ifdef CONFIG_COMPAT
/* Compat variant: the siginfo is read with __copy_siginfo_from_user32(). */
COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
			compat_pid_t, tgid,
			compat_pid_t, pid,
			int, sig,
			struct compat_siginfo __user *, uinfo)
{
	kernel_siginfo_t info;
	int ret = __copy_siginfo_from_user32(sig, &info, uinfo);
	if (unlikely(ret))
		return ret;
	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
#endif

/*
 * For kthreads only, must not be used if cloned with CLONE_SIGHAND
 *
 * Sets the handler of @sig for the current task to @action.  When the new
 * action is SIG_IGN, already queued instances of @sig are flushed from
 * both the shared and the private pending queues.  @sig is used to index
 * the action table without validation.
 */
void kernel_sigaction(int sig, __sighandler_t action)
{
	spin_lock_irq(&current->sighand->siglock);
	current->sighand->action[sig - 1].sa.sa_handler = action;
	if (action == SIG_IGN) {
		sigset_t mask;

		sigemptyset(&mask);
		sigaddset(&mask, sig);

		flush_sigqueue_mask(current, &mask, &current->signal->shared_pending);
		flush_sigqueue_mask(current, &mask, &current->pending);
		recalc_sigpending();
	}
	spin_unlock_irq(&current->sighand->siglock);
}
EXPORT_SYMBOL(kernel_sigaction);

/* Weak default that does nothing; do_sigaction() calls it for every request. */
void __weak sigaction_compat_abi(struct k_sigaction *act,
		struct k_sigaction *oact)
{
}

/*
 * Examine and/or change the action for @sig of the current task.
 * @act, if non-NULL, is the new action; @oact, if non-NULL, receives the
 * previous one.  Returns -EINVAL for an invalid @sig, for an attempt to
 * change a kernel-only signal, or when the current action is marked
 * SA_IMMUTABLE.
 */
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
	struct task_struct *p = current, *t;
	struct k_sigaction *k;
	sigset_t mask;

	if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
		return -EINVAL;

	k = &p->sighand->action[sig-1];

	spin_lock_irq(&p->sighand->siglock);
	if (k->sa.sa_flags & SA_IMMUTABLE) {
		spin_unlock_irq(&p->sighand->siglock);
		return -EINVAL;
	}
	if (oact)
		*oact = *k;

	/*
	 * Make sure that we never accidentally claim to support SA_UNSUPPORTED,
	 * e.g. by having an architecture use the bit in their uapi.
*/ BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED);

	/*
	 * Clear unknown flag bits in order to allow userspace to detect missing
	 * support for flag bits and to allow the kernel to use non-uapi bits
	 * internally.
	 */
	if (act)
		act->sa.sa_flags &= UAPI_SA_FLAGS;
	if (oact)
		oact->sa.sa_flags &= UAPI_SA_FLAGS;

	sigaction_compat_abi(act, oact);

	if (act) {
		/* Remember the old disposition before it is overwritten. */
		bool was_ignored = k->sa.sa_handler == SIG_IGN;

		sigdelsetmask(&act->sa.sa_mask,
			      sigmask(SIGKILL) | sigmask(SIGSTOP));
		*k = *act;
		/*
		 * POSIX 3.3.1.3:
		 *  "Setting a signal action to SIG_IGN for a signal that is
		 *   pending shall cause the pending signal to be discarded,
		 *   whether or not it is blocked."
		 *
		 *  "Setting a signal action to SIG_DFL for a signal that is
		 *   pending and whose default action is to ignore the signal
		 *   (for example, SIGCHLD), shall cause the pending signal to
		 *   be discarded, whether or not it is blocked"
		 */
		if (sig_handler_ignored(sig_handler(p, sig), sig)) {
			sigemptyset(&mask);
			sigaddset(&mask, sig);
			/* Flush the shared queue and every thread's private queue. */
			flush_sigqueue_mask(p, &mask, &p->signal->shared_pending);
			for_each_thread(p, t)
				flush_sigqueue_mask(p, &mask, &t->pending);
		} else if (was_ignored) {
			posixtimer_sig_unignore(p, sig);
		}
	}

	spin_unlock_irq(&p->sighand->siglock);
	return 0;
}

/*
 * With CONFIG_DYNAMIC_SIGFRAME, sigaltstack_lock()/sigaltstack_unlock()
 * take and release the current task's siglock; otherwise both are empty.
 */
#ifdef CONFIG_DYNAMIC_SIGFRAME
static inline void sigaltstack_lock(void)
	__acquires(&current->sighand->siglock)
{
	spin_lock_irq(&current->sighand->siglock);
}

static inline void sigaltstack_unlock(void)
	__releases(&current->sighand->siglock)
{
	spin_unlock_irq(&current->sighand->siglock);
}
#else
static inline void sigaltstack_lock(void) { }
static inline void sigaltstack_unlock(void) { }
#endif

/*
 * Query and/or change the alternate signal stack of the current task.
 * @ss, if non-NULL, is the new setting; @oss, if non-NULL, is filled with
 * the previous one.  @sp is the stack pointer used for the on-stack
 * checks and @min_ss_size the smallest accepted stack size.
 */
static int
do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp,
		size_t min_ss_size)
{
	struct task_struct *t = current;
	int ret = 0;

	if (oss) {
		/* Zero first so that no unset bytes are reported. */
		memset(oss, 0, sizeof(stack_t));
		oss->ss_sp = (void __user *) t->sas_ss_sp;
		oss->ss_size = t->sas_ss_size;
		oss->ss_flags = sas_ss_flags(sp) |
			(current->sas_ss_flags & SS_FLAG_BITS);
	}

	if (ss) {
		void __user *ss_sp = ss->ss_sp;
		size_t
ss_size = ss->ss_size;
		unsigned ss_flags = ss->ss_flags;
		int ss_mode;

		/* Changing the stack while running on it is refused. */
		if (unlikely(on_sig_stack(sp)))
			return -EPERM;

		/* The mode is what remains after masking out SS_FLAG_BITS. */
		ss_mode = ss_flags & ~SS_FLAG_BITS;
		if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
				ss_mode != 0))
			return -EINVAL;

		/*
		 * Return before taking any locks if no actual
		 * sigaltstack changes were requested.
		 */
		if (t->sas_ss_sp == (unsigned long)ss_sp &&
		    t->sas_ss_size == ss_size &&
		    t->sas_ss_flags == ss_flags)
			return 0;

		sigaltstack_lock();
		if (ss_mode == SS_DISABLE) {
			/* Disabling: the stored address and size become zero. */
			ss_size = 0;
			ss_sp = NULL;
		} else {
			/* Either size check failing yields -ENOMEM. */
			if (unlikely(ss_size < min_ss_size))
				ret = -ENOMEM;
			if (!sigaltstack_size_valid(ss_size))
				ret = -ENOMEM;
		}
		/* The task's settings are only updated when validation passed. */
		if (!ret) {
			t->sas_ss_sp = (unsigned long) ss_sp;
			t->sas_ss_size = ss_size;
			t->sas_ss_flags = ss_flags;
		}
		sigaltstack_unlock();
	}
	return ret;
}

/*
 * sigaltstack system call: @uss, if non-NULL, is the new alternate stack
 * setting; @uoss, if non-NULL, receives the previous one.
 */
SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
{
	stack_t new, old;
	int err;
	if (uss && copy_from_user(&new, uss, sizeof(stack_t)))
		return -EFAULT;
	err = do_sigaltstack(uss ? &new : NULL, uoss ?
&old : NULL,
			      current_user_stack_pointer(),
			      MINSIGSTKSZ);
	/* The old setting is copied out only if the call itself succeeded. */
	if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t)))
		err = -EFAULT;
	return err;
}

/*
 * Re-install the alternate stack setting stored at @uss.  Only a failed
 * user copy is reported (-EFAULT); the result of do_sigaltstack() is
 * deliberately discarded.
 */
int restore_altstack(const stack_t __user *uss)
{
	stack_t new;
	if (copy_from_user(&new, uss, sizeof(stack_t)))
		return -EFAULT;
	(void)do_sigaltstack(&new, NULL, current_user_stack_pointer(),
			     MINSIGSTKSZ);
	/* squash all but EFAULT for now */
	return 0;
}

/*
 * Write the current task's alternate stack address, flags and size to
 * @uss.  The three __put_user() results are OR-ed together, so the
 * return value is nonzero if any store failed.  @sp is not used.
 */
int __save_altstack(stack_t __user *uss, unsigned long sp)
{
	struct task_struct *t = current;
	int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
		__put_user(t->sas_ss_flags, &uss->ss_flags) |
		__put_user(t->sas_ss_size, &uss->ss_size);
	return err;
}

#ifdef CONFIG_COMPAT
/*
 * Compat wrapper around do_sigaltstack(): converts the compat_stack_t at
 * @uss_ptr (if non-NULL) to a native stack_t, and converts the previous
 * setting back to compat form for @uoss_ptr (if non-NULL).
 */
static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr,
				 compat_stack_t __user *uoss_ptr)
{
	stack_t uss, uoss;
	int ret;

	if (uss_ptr) {
		compat_stack_t uss32;
		if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
			return -EFAULT;
		uss.ss_sp = compat_ptr(uss32.ss_sp);
		uss.ss_flags = uss32.ss_flags;
		uss.ss_size = uss32.ss_size;
	}
	/* The old setting is always requested, even if @uoss_ptr is NULL. */
	ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss,
			     compat_user_stack_pointer(),
			     COMPAT_MINSIGSTKSZ);
	if (ret >= 0 && uoss_ptr)  {
		compat_stack_t old;
		/* Zero first so that no unset bytes are copied out. */
		memset(&old, 0, sizeof(old));
		old.ss_sp = ptr_to_compat(uoss.ss_sp);
		old.ss_flags = uoss.ss_flags;
		old.ss_size = uoss.ss_size;
		if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t)))
			ret = -EFAULT;
	}
	return ret;
}

/* Compat sigaltstack system call: thin wrapper around the helper above. */
COMPAT_SYSCALL_DEFINE2(sigaltstack,
			const compat_stack_t __user *, uss_ptr,
			compat_stack_t __user *, uoss_ptr)
{
	return do_compat_sigaltstack(uss_ptr, uoss_ptr);
}

/*
 * Compat counterpart of restore_altstack(): every error other than
 * -EFAULT is turned into success.
 */
int compat_restore_altstack(const compat_stack_t __user *uss)
{
	int err = do_compat_sigaltstack(uss, NULL);
	/* squash all but -EFAULT for now */
	return err == -EFAULT ?
err : 0;
}

/*
 * Compat counterpart of __save_altstack(): the stack address is converted
 * with ptr_to_compat() before being stored.  The result is nonzero if any
 * of the three __put_user() stores failed.  @sp is not used.
 */
int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
{
	int err;
	struct task_struct *t = current;
	err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp),
			 &uss->ss_sp) |
		__put_user(t->sas_ss_flags, &uss->ss_flags) |
		__put_user(t->sas_ss_size, &uss->ss_size);
	return err;
}
#endif

#ifdef __ARCH_WANT_SYS_SIGPENDING

/**
 *  sys_sigpending - examine pending signals
 *  @uset: where mask of pending signal is returned
 *
 *  Only the first sizeof(old_sigset_t) bytes of the pending set are
 *  copied to @uset.
 *
 *  Return: 0 on success, -EFAULT if the copy to user space fails.
 */
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
{
	sigset_t set;

	/*
	 * Both operands are compile-time sizes; as @uset points to an
	 * old_sigset_t they are equal, so this branch is never taken.
	 */
	if (sizeof(old_sigset_t) > sizeof(*uset))
		return -EINVAL;

	do_sigpending(&set);

	if (copy_to_user(uset, &set, sizeof(old_sigset_t)))
		return -EFAULT;

	return 0;
}

#ifdef CONFIG_COMPAT
/* Compat variant: only the first word of the pending set is returned. */
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
	sigset_t set;

	do_sigpending(&set);

	return put_user(set.sig[0], set32);
}
#endif

#endif

#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/**
 *  sys_sigprocmask - examine and change blocked signals
 *  @how: whether to add, remove, or set signals
 *  @nset: signals to add or remove (if non-null)
 *  @oset: previous value of signal mask if non-null
 *
 * Some platforms have their own version with special arguments;
 * others support only sys_rt_sigprocmask.
*/ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, old_sigset_t __user *, oset) { old_sigset_t old_set, new_set; sigset_t new_blocked; old_set = current->blocked.sig[0]; if (nset) { if (copy_from_user(&new_set, nset, sizeof(*nset))) return -EFAULT; new_blocked = current->blocked; switch (how) { case SIG_BLOCK: sigaddsetmask(&new_blocked, new_set); break; case SIG_UNBLOCK: sigdelsetmask(&new_blocked, new_set); break; case SIG_SETMASK: new_blocked.sig[0] = new_set; break; default: return -EINVAL; } set_current_blocked(&new_blocked); } if (oset) { if (copy_to_user(oset, &old_set, sizeof(*oset))) return -EFAULT; } return 0; } #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifndef CONFIG_ODD_RT_SIGACTION /** * sys_rt_sigaction - alter an action taken by a process * @sig: signal to be sent * @act: new sigaction * @oact: used to save the previous sigaction * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct sigaction __user *, act, struct sigaction __user *, oact, size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) return -EFAULT; ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); if (ret) return ret; if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct compat_sigaction __user *, act, struct compat_sigaction __user *, oact, compat_size_t, sigsetsize) { struct k_sigaction new_ka, old_ka; #ifdef __ARCH_HAS_SA_RESTORER compat_uptr_t restorer; #endif int ret; /* XXX: Don't preclude handling different sized sigset_t's. 
*/ if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; if (act) { compat_uptr_t handler; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER ret |= get_user(restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(restorer); #endif ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask); ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask, sizeof(oact->sa_mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); #endif } return ret; } #endif #endif /* !CONFIG_ODD_RT_SIGACTION */ #ifdef CONFIG_OLD_SIGACTION SYSCALL_DEFINE3(sigaction, int, sig, const struct old_sigaction __user *, act, struct old_sigaction __user *, oact) { struct k_sigaction new_ka, old_ka; int ret; if (act) { old_sigset_t mask; if (!access_ok(act, sizeof(*act)) || __get_user(new_ka.sa.sa_handler, &act->sa_handler) || __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || __get_user(mask, &act->sa_mask)) return -EFAULT; #ifdef __ARCH_HAS_KA_RESTORER new_ka.ka_restorer = NULL; #endif siginitset(&new_ka.sa.sa_mask, mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? 
&old_ka : NULL); if (!ret && oact) { if (!access_ok(oact, sizeof(*oact)) || __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; } return ret; } #endif #ifdef CONFIG_COMPAT_OLD_SIGACTION COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, const struct compat_old_sigaction __user *, act, struct compat_old_sigaction __user *, oact) { struct k_sigaction new_ka, old_ka; int ret; compat_old_sigset_t mask; compat_uptr_t handler, restorer; if (act) { if (!access_ok(act, sizeof(*act)) || __get_user(handler, &act->sa_handler) || __get_user(restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || __get_user(mask, &act->sa_mask)) return -EFAULT; #ifdef __ARCH_HAS_KA_RESTORER new_ka.ka_restorer = NULL; #endif new_ka.sa.sa_handler = compat_ptr(handler); new_ka.sa.sa_restorer = compat_ptr(restorer); siginitset(&new_ka.sa.sa_mask, mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { if (!access_ok(oact, sizeof(*oact)) || __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; } return ret; } #endif #ifdef CONFIG_SGETMASK_SYSCALL /* * For backwards compatibility. Functionality superseded by sigprocmask. */ SYSCALL_DEFINE0(sgetmask) { /* SMP safe */ return current->blocked.sig[0]; } SYSCALL_DEFINE1(ssetmask, int, newmask) { int old = current->blocked.sig[0]; sigset_t newset; siginitset(&newset, newmask); set_current_blocked(&newset); return old; } #endif /* CONFIG_SGETMASK_SYSCALL */ #ifdef __ARCH_WANT_SYS_SIGNAL /* * For backwards compatibility. Functionality superseded by sigaction. 
*/ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) { struct k_sigaction new_sa, old_sa; int ret; new_sa.sa.sa_handler = handler; new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; sigemptyset(&new_sa.sa.sa_mask); ret = do_sigaction(sig, &new_sa, &old_sa); return ret ? ret : (unsigned long)old_sa.sa.sa_handler; } #endif /* __ARCH_WANT_SYS_SIGNAL */ #ifdef __ARCH_WANT_SYS_PAUSE SYSCALL_DEFINE0(pause) { while (!signal_pending(current)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); } return -ERESTARTNOHAND; } #endif static int sigsuspend(sigset_t *set) { current->saved_sigmask = current->blocked; set_current_blocked(set); while (!signal_pending(current)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); } set_restore_sigmask(); return -ERESTARTNOHAND; } /** * sys_rt_sigsuspend - replace the signal mask for a value with the * @unewset value until a signal is received * @unewset: new signal mask value * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&newset, unewset, sizeof(newset))) return -EFAULT; return sigsuspend(&newset); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) { sigset_t newset; /* XXX: Don't preclude handling different sized sigset_t's. 
*/ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&newset, unewset)) return -EFAULT; return sigsuspend(&newset); } #endif #ifdef CONFIG_OLD_SIGSUSPEND SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) { sigset_t blocked; siginitset(&blocked, mask); return sigsuspend(&blocked); } #endif #ifdef CONFIG_OLD_SIGSUSPEND3 SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) { sigset_t blocked; siginitset(&blocked, mask); return sigsuspend(&blocked); } #endif __weak const char *arch_vma_name(struct vm_area_struct *vma) { return NULL; } static inline void siginfo_buildtime_checks(void) { BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE); /* Verify the offsets in the two siginfos match */ #define CHECK_OFFSET(field) \ BUILD_BUG_ON(offsetof(siginfo_t, field) != offsetof(kernel_siginfo_t, field)) /* kill */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); /* timer */ CHECK_OFFSET(si_tid); CHECK_OFFSET(si_overrun); CHECK_OFFSET(si_value); /* rt */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); CHECK_OFFSET(si_value); /* sigchld */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); CHECK_OFFSET(si_status); CHECK_OFFSET(si_utime); CHECK_OFFSET(si_stime); /* sigfault */ CHECK_OFFSET(si_addr); CHECK_OFFSET(si_trapno); CHECK_OFFSET(si_addr_lsb); CHECK_OFFSET(si_lower); CHECK_OFFSET(si_upper); CHECK_OFFSET(si_pkey); CHECK_OFFSET(si_perf_data); CHECK_OFFSET(si_perf_type); CHECK_OFFSET(si_perf_flags); /* sigpoll */ CHECK_OFFSET(si_band); CHECK_OFFSET(si_fd); /* sigsys */ CHECK_OFFSET(si_call_addr); CHECK_OFFSET(si_syscall); CHECK_OFFSET(si_arch); #undef CHECK_OFFSET /* usb asyncio */ BUILD_BUG_ON(offsetof(struct siginfo, si_pid) != offsetof(struct siginfo, si_addr)); if (sizeof(int) == sizeof(void __user *)) { BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) != sizeof(void __user *)); } else { BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) + sizeof_field(struct siginfo, si_uid)) != sizeof(void __user *)); BUILD_BUG_ON(offsetofend(struct siginfo, 
si_pid) != offsetof(struct siginfo, si_uid)); } #ifdef CONFIG_COMPAT BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) != offsetof(struct compat_siginfo, si_addr)); BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != sizeof(compat_uptr_t)); BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != sizeof_field(struct siginfo, si_pid)); #endif } #if defined(CONFIG_SYSCTL) static const struct ctl_table signal_debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", .data = &show_unhandled_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, #endif }; static const struct ctl_table signal_table[] = { { .procname = "print-fatal-signals", .data = &print_fatal_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static int __init init_signal_sysctls(void) { register_sysctl_init("debug", signal_debug_table); register_sysctl_init("kernel", signal_table); return 0; } early_initcall(init_signal_sysctls); #endif /* CONFIG_SYSCTL */ void __init signals_init(void) { siginfo_buildtime_checks(); sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT); } #ifdef CONFIG_KGDB_KDB #include <linux/kdb.h> /* * kdb_send_sig - Allows kdb to send signals without exposing * signal internals. This function checks if the required locks are * available before calling the main signal code, to avoid kdb * deadlocks. */ void kdb_send_sig(struct task_struct *t, int sig) { static struct task_struct *kdb_prev_t; int new_t, ret; if (!spin_trylock(&t->sighand->siglock)) { kdb_printf("Can't do kill command now.\n" "The sigmask lock is held somewhere else in " "kernel, try again later\n"); return; } new_t = kdb_prev_t != t; kdb_prev_t = t; if (!task_is_running(t) && new_t) { spin_unlock(&t->sighand->siglock); kdb_printf("Process is not RUNNING, sending a signal from " "kdb risks deadlock\n" "on the run queue locks. 
" "The signal has _not_ been sent.\n" "Reissue the kill command if you want to risk " "the deadlock.\n"); return; } ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); spin_unlock(&t->sighand->siglock); if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", sig, t->pid); else kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid); } #endif /* CONFIG_KGDB_KDB */
535 461 461 323 325 7 7 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __NET_PSP_HELPERS_H #define __NET_PSP_HELPERS_H #include <linux/skbuff.h> #include <linux/rcupdate.h> #include <linux/udp.h> #include <net/sock.h> #include <net/tcp.h> #include <net/psp/types.h> struct inet_timewait_sock; /* Driver-facing API */ struct psp_dev * psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, void *priv_ptr); void psp_dev_unregister(struct psp_dev *psd); bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, u8 ver, __be16 sport); int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv); /* Kernel-facing API */ void psp_assoc_put(struct psp_assoc *pas); static inline void *psp_assoc_drv_data(struct psp_assoc *pas) { return pas->drv_data; } #if IS_ENABLED(CONFIG_INET_PSP) unsigned int psp_key_size(u32 version); void psp_sk_assoc_free(struct sock *sk); void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); void psp_twsk_assoc_free(struct inet_timewait_sock *tw); void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb); static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) { return rcu_dereference_check(sk->psp_assoc, 
lockdep_sock_is_held(sk)); } static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { struct psp_assoc *pas; pas = psp_sk_assoc(sk); if (pas && pas->tx.spi) skb->decrypted = 1; } static inline unsigned long __psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, unsigned long diffs) { struct psp_skb_ext *a, *b; a = skb_ext_find(one, SKB_EXT_PSP); b = skb_ext_find(two, SKB_EXT_PSP); diffs |= (!!a) ^ (!!b); if (!diffs && unlikely(a)) diffs |= memcmp(a, b, sizeof(*a)); return diffs; } static inline bool psp_is_allowed_nondata(struct sk_buff *skb, struct psp_assoc *pas) { bool fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); u32 end_seq = TCP_SKB_CB(skb)->end_seq; u32 seq = TCP_SKB_CB(skb)->seq; bool pure_fin; pure_fin = fin && end_seq - seq == 1; return seq == end_seq || (pure_fin && seq == pas->upgrade_seq); } static inline bool psp_pse_matches_pas(struct psp_skb_ext *pse, struct psp_assoc *pas) { return pse && pas->rx.spi == pse->spi && pas->generation == pse->generation && pas->version == pse->version && pas->dev_id == pse->dev_id; } static inline enum skb_drop_reason __psp_sk_rx_policy_check(struct sk_buff *skb, struct psp_assoc *pas) { struct psp_skb_ext *pse = skb_ext_find(skb, SKB_EXT_PSP); if (!pas) return pse ? 
SKB_DROP_REASON_PSP_INPUT : 0; if (likely(psp_pse_matches_pas(pse, pas))) { if (unlikely(!pas->peer_tx)) pas->peer_tx = 1; return 0; } if (!pse) { if (!pas->tx.spi || (!pas->peer_tx && psp_is_allowed_nondata(skb, pas))) return 0; } return SKB_DROP_REASON_PSP_INPUT; } static inline enum skb_drop_reason psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) { return __psp_sk_rx_policy_check(skb, psp_sk_assoc(sk)); } static inline enum skb_drop_reason psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) { return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc)); } static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) { struct psp_assoc *pas; int state; state = READ_ONCE(sk->sk_state); if (!sk_is_inet(sk) || state == TCP_NEW_SYN_RECV) return NULL; pas = state == TCP_TIME_WAIT ? rcu_dereference(inet_twsk(sk)->psp_assoc) : rcu_dereference(sk->psp_assoc); return pas; } static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { if (!skb->decrypted || !skb->sk) return NULL; return psp_sk_get_assoc_rcu(skb->sk); } static inline unsigned int psp_sk_overhead(const struct sock *sk) { int psp_encap = sizeof(struct udphdr) + PSP_HDR_SIZE + PSP_TRL_SIZE; bool has_psp = rcu_access_pointer(sk->psp_assoc); return has_psp ? 
psp_encap : 0; } #else static inline void psp_sk_assoc_free(struct sock *sk) { } static inline void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } static inline void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) { } static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) { return NULL; } static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } static inline unsigned long __psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, unsigned long diffs) { return diffs; } static inline enum skb_drop_reason psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) { return 0; } static inline enum skb_drop_reason psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) { return 0; } static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { return NULL; } static inline unsigned int psp_sk_overhead(const struct sock *sk) { return 0; } #endif static inline unsigned long psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two) { return __psp_skb_coalesce_diff(one, two, 0); } #endif /* __NET_PSP_HELPERS_H */
12 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Mellanox Technologies. 
All rights reserved */ #include <linux/debugfs.h> #include <linux/err.h> #include <linux/etherdevice.h> #include <linux/inet.h> #include <linux/kernel.h> #include <linux/random.h> #include <linux/slab.h> #include <net/devlink.h> #include <net/ip.h> #include <net/psample.h> #include <uapi/linux/ip.h> #include <uapi/linux/udp.h> #include "netdevsim.h" #define NSIM_PSAMPLE_REPORT_INTERVAL_MS 100 #define NSIM_PSAMPLE_INVALID_TC 0xFFFF #define NSIM_PSAMPLE_L4_DATA_LEN 100 struct nsim_dev_psample { struct delayed_work psample_dw; struct dentry *ddir; struct psample_group *group; u32 rate; u32 group_num; u32 trunc_size; int in_ifindex; int out_ifindex; u16 out_tc; u64 out_tc_occ_max; u64 latency_max; bool is_active; }; static struct sk_buff *nsim_dev_psample_skb_build(void) { int tot_len, data_len = NSIM_PSAMPLE_L4_DATA_LEN; struct sk_buff *skb; struct udphdr *udph; struct ethhdr *eth; struct iphdr *iph; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return NULL; tot_len = sizeof(struct iphdr) + sizeof(struct udphdr) + data_len; skb_reset_mac_header(skb); eth = skb_put(skb, sizeof(struct ethhdr)); eth_random_addr(eth->h_dest); eth_random_addr(eth->h_source); eth->h_proto = htons(ETH_P_IP); skb->protocol = htons(ETH_P_IP); skb_set_network_header(skb, skb->len); iph = skb_put(skb, sizeof(struct iphdr)); iph->protocol = IPPROTO_UDP; iph->saddr = in_aton("192.0.2.1"); iph->daddr = in_aton("198.51.100.1"); iph->version = 0x4; iph->frag_off = 0; iph->ihl = 0x5; iph->tot_len = htons(tot_len); iph->id = 0; iph->ttl = 100; iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); skb_set_transport_header(skb, skb->len); udph = skb_put_zero(skb, sizeof(struct udphdr) + data_len); get_random_bytes(&udph->source, sizeof(u16)); get_random_bytes(&udph->dest, sizeof(u16)); udph->len = htons(sizeof(struct udphdr) + data_len); return skb; } static void nsim_dev_psample_md_prepare(const struct nsim_dev_psample *psample, struct psample_metadata *md, unsigned int 
len) { md->trunc_size = psample->trunc_size ? psample->trunc_size : len; md->in_ifindex = psample->in_ifindex; md->out_ifindex = psample->out_ifindex; if (psample->out_tc != NSIM_PSAMPLE_INVALID_TC) { md->out_tc = psample->out_tc; md->out_tc_valid = 1; } if (psample->out_tc_occ_max) { u64 out_tc_occ; out_tc_occ = get_random_u64(); md->out_tc_occ = out_tc_occ & (psample->out_tc_occ_max - 1); md->out_tc_occ_valid = 1; } if (psample->latency_max) { u64 latency; latency = get_random_u64(); md->latency = latency & (psample->latency_max - 1); md->latency_valid = 1; } } static void nsim_dev_psample_report_work(struct work_struct *work) { struct nsim_dev_psample *psample; struct psample_metadata md = {}; struct sk_buff *skb; unsigned long delay; psample = container_of(work, struct nsim_dev_psample, psample_dw.work); skb = nsim_dev_psample_skb_build(); if (!skb) goto out; nsim_dev_psample_md_prepare(psample, &md, skb->len); psample_sample_packet(psample->group, skb, psample->rate, &md); consume_skb(skb); out: delay = msecs_to_jiffies(NSIM_PSAMPLE_REPORT_INTERVAL_MS); schedule_delayed_work(&psample->psample_dw, delay); } static int nsim_dev_psample_enable(struct nsim_dev *nsim_dev) { struct nsim_dev_psample *psample = nsim_dev->psample; struct devlink *devlink; unsigned long delay; if (psample->is_active) return -EBUSY; devlink = priv_to_devlink(nsim_dev); psample->group = psample_group_get(devlink_net(devlink), psample->group_num); if (!psample->group) return -EINVAL; delay = msecs_to_jiffies(NSIM_PSAMPLE_REPORT_INTERVAL_MS); schedule_delayed_work(&psample->psample_dw, delay); psample->is_active = true; return 0; } static int nsim_dev_psample_disable(struct nsim_dev *nsim_dev) { struct nsim_dev_psample *psample = nsim_dev->psample; if (!psample->is_active) return -EINVAL; psample->is_active = false; cancel_delayed_work_sync(&psample->psample_dw); psample_group_put(psample->group); return 0; } static ssize_t nsim_dev_psample_enable_write(struct file *file, const char __user 
*data, size_t count, loff_t *ppos) { struct nsim_dev *nsim_dev = file->private_data; bool enable; int err; err = kstrtobool_from_user(data, count, &enable); if (err) return err; if (enable) err = nsim_dev_psample_enable(nsim_dev); else err = nsim_dev_psample_disable(nsim_dev); return err ? err : count; } static const struct file_operations nsim_psample_enable_fops = { .open = simple_open, .write = nsim_dev_psample_enable_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; int nsim_dev_psample_init(struct nsim_dev *nsim_dev) { struct nsim_dev_psample *psample; int err; psample = kzalloc_obj(*psample); if (!psample) return -ENOMEM; nsim_dev->psample = psample; INIT_DELAYED_WORK(&psample->psample_dw, nsim_dev_psample_report_work); psample->ddir = debugfs_create_dir("psample", nsim_dev->ddir); if (IS_ERR(psample->ddir)) { err = PTR_ERR(psample->ddir); goto err_psample_free; } /* Populate sampling parameters with sane defaults. */ psample->rate = 100; debugfs_create_u32("rate", 0600, psample->ddir, &psample->rate); psample->group_num = 10; debugfs_create_u32("group_num", 0600, psample->ddir, &psample->group_num); psample->trunc_size = 0; debugfs_create_u32("trunc_size", 0600, psample->ddir, &psample->trunc_size); psample->in_ifindex = 1; debugfs_create_u32("in_ifindex", 0600, psample->ddir, &psample->in_ifindex); psample->out_ifindex = 2; debugfs_create_u32("out_ifindex", 0600, psample->ddir, &psample->out_ifindex); psample->out_tc = 0; debugfs_create_u16("out_tc", 0600, psample->ddir, &psample->out_tc); psample->out_tc_occ_max = 10000; debugfs_create_u64("out_tc_occ_max", 0600, psample->ddir, &psample->out_tc_occ_max); psample->latency_max = 50; debugfs_create_u64("latency_max", 0600, psample->ddir, &psample->latency_max); debugfs_create_file("enable", 0200, psample->ddir, nsim_dev, &nsim_psample_enable_fops); return 0; err_psample_free: kfree(nsim_dev->psample); return err; } void nsim_dev_psample_exit(struct nsim_dev *nsim_dev) { 
debugfs_remove_recursive(nsim_dev->psample->ddir); if (nsim_dev->psample->is_active) { cancel_delayed_work_sync(&nsim_dev->psample->psample_dw); psample_group_put(nsim_dev->psample->group); } kfree(nsim_dev->psample); }
26 2469 34 36 1477 1163 73 2415 2437 1502 1213 2421 200 1601 1610 1504 1515 6 6 772 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 // SPDX-License-Identifier: GPL-2.0-or-later /* * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IP/TCP/UDP checksumming routines * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Tom May, <ftom@netcom.com> * Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de> * Lots of code moved from tcp.c and ip.c; see those files * for more names. * * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek: * Fixed some nasty bugs, causing some horrible crashes. * A: At some points, the sum (%0) was used as * length-counter instead of the length counter * (%1). Thanks to Roman Hodek for pointing this out. * B: GCC seems to mess up if one uses too many * data-registers to hold input values and one tries to * specify d0 and d1 as scratch registers. Letting gcc * choose these registers itself solves the problem. */ /* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access kills, so most of the assembly has to go. 
*/ #include <linux/export.h> #include <net/checksum.h> #include <asm/byteorder.h> #ifndef do_csum static unsigned int do_csum(const unsigned char *buff, int len) { int odd; unsigned int result = 0; if (len <= 0) goto out; odd = 1 & (unsigned long) buff; if (odd) { #ifdef __LITTLE_ENDIAN result += (*buff << 8); #else result = *buff; #endif len--; buff++; } if (len >= 2) { if (2 & (unsigned long) buff) { result += *(unsigned short *) buff; len -= 2; buff += 2; } if (len >= 4) { const unsigned char *end = buff + ((unsigned)len & ~3); unsigned int carry = 0; do { unsigned int w = *(unsigned int *) buff; buff += 4; result += carry; result += w; carry = (w > result); } while (buff < end); result += carry; result = (result & 0xffff) + (result >> 16); } if (len & 2) { result += *(unsigned short *) buff; buff += 2; } } if (len & 1) #ifdef __LITTLE_ENDIAN result += *buff; #else result += (*buff << 8); #endif result = csum_from32to16(result); if (odd) result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); out: return result; } #endif #ifndef ip_fast_csum /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. */ __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { return (__force __sum16)~do_csum(iph, ihl*4); } EXPORT_SYMBOL(ip_fast_csum); #endif /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) * * returns a 32-bit number suitable for feeding into itself * or csum_tcpudp_magic * * this function must be called with even lengths, except * for the last fragment, which may be odd * * it's best to have buff aligned on a 32-bit boundary */ __wsum csum_partial(const void *buff, int len, __wsum wsum) { unsigned int sum = (__force unsigned int)wsum; unsigned int result = do_csum(buff, len); /* add in old sum, and carry.. 
*/ result += sum; if (sum > result) result += 1; return (__force __wsum)result; } EXPORT_SYMBOL(csum_partial); /* * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ __sum16 ip_compute_csum(const void *buff, int len) { return (__force __sum16)~do_csum(buff, len); } EXPORT_SYMBOL(ip_compute_csum); #ifndef csum_tcpudp_nofold static inline u32 from64to32(u64 x) { /* add up 32-bit and 32-bit for 32+c bit */ x = (x & 0xffffffff) + (x >> 32); /* add up carry.. */ x = (x & 0xffffffff) + (x >> 32); return (u32)x; } __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { unsigned long long s = (__force u32)sum; s += (__force u32)saddr; s += (__force u32)daddr; #ifdef __BIG_ENDIAN s += proto + len; #else s += (proto + len) << 8; #endif return (__force __wsum)from64to32(s); } EXPORT_SYMBOL(csum_tcpudp_nofold); #endif
4 3 162 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
 */
#ifndef __IPVLAN_H
#define __IPVLAN_H

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/if_arp.h>
#include <linux/if_link.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/inetdevice.h>
#include <linux/netfilter.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/route.h>
#include <net/addrconf.h>
#include <net/l3mdev.h>

/* Driver name and version strings. */
#define IPVLAN_DRV	"ipvlan"
#define IPV_DRV_VER	"0.1"

/* Number of buckets in ipvl_port::hlhead, and the matching index mask. */
#define IPVLAN_HASH_SIZE	(1 << BITS_PER_BYTE)
#define IPVLAN_HASH_MASK	(IPVLAN_HASH_SIZE - 1)

/* Size in bits of ipvl_dev::mac_filters, and the matching index mask. */
#define IPVLAN_MAC_FILTER_BITS	8
#define IPVLAN_MAC_FILTER_SIZE	(1 << IPVLAN_MAC_FILTER_BITS)
#define IPVLAN_MAC_FILTER_MASK	(IPVLAN_MAC_FILTER_SIZE - 1)

/* NOTE(review): presumably the cap on ipvl_port::backlog length - confirm. */
#define IPVLAN_QBACKLOG_LIMIT	1000

/* Header/address kinds; stored in ipvl_addr::atype. */
typedef enum {
	IPVL_IPV6 = 0,
	IPVL_ICMPV6,
	IPVL_IPV4,
	IPVL_ARP,
} ipvl_hdr_type;

/*
 * Traffic counters, referenced through the __percpu pointer in ipvl_dev.
 * The u64_stats_t members sit next to a u64_stats_sync; the two error/drop
 * counters are plain u32.
 */
struct ipvl_pcpu_stats {
	u64_stats_t		rx_pkts;
	u64_stats_t		rx_bytes;
	u64_stats_t		rx_mcast;
	u64_stats_t		tx_pkts;
	u64_stats_t		tx_bytes;
	struct u64_stats_sync	syncp;
	u32			rx_errs;
	u32			tx_drps;
};

struct ipvl_port;

/* Per-ipvlan-device state; links back to its port and the lower device. */
struct ipvl_dev {
	struct net_device	*dev;
	struct list_head	pnode;
	struct ipvl_port	*port;
	struct net_device	*phy_dev;
	struct list_head	addrs;
	struct ipvl_pcpu_stats	__percpu *pcpu_stats;
	DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE);
	netdev_features_t	sfeatures;
	u32			msg_enable;
};

/*
 * One IPv4 or IPv6 address; atype says which member of the union is in use
 * (NOTE(review): inferred from the field names - confirm against users).
 */
struct ipvl_addr {
	struct ipvl_dev		*master; /* Back pointer to master */
	union {
		struct in6_addr	ip6;	 /* IPv6 address on logical interface */
		struct in_addr	ip4;	 /* IPv4 address on logical interface */
	} ipu;
/* Shorthand accessors for the union members above. */
#define ip6addr	ipu.ip6
#define ip4addr ipu.ip4
	struct hlist_node	hlnode;  /* Hash-table linkage */
	struct list_head	anode;   /* logical-interface linkage */
	ipvl_hdr_type		atype;
	struct rcu_head		rcu;
};

/* Per-port state; owns the address hash table and the list of ipvlans. */
struct ipvl_port {
	struct net_device	*dev;
	possible_net_t		pnet;
	struct hlist_head	hlhead[IPVLAN_HASH_SIZE];
	spinlock_t		addrs_lock; /* guards hash-table and addrs */
	struct list_head	ipvlans;
	u16			mode;
	u16			flags;
	u16			dev_id_start;
	struct work_struct	wq;
	struct sk_buff_head	backlog;
	int			count;
	struct ida		ida;
	netdevice_tracker	dev_tracker;
};

/* Private per-skb data, overlaid on the start of skb->cb[] by IPVL_SKB_CB(). */
struct ipvl_skb_cb {
	bool tx_pkt;
};
#define IPVL_SKB_CB(_skb) ((struct ipvl_skb_cb *)&((_skb)->cb[0]))

/* Fetch the port stored in @d->rx_handler_data via rcu_dereference(). */
static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d)
{
	return rcu_dereference(d->rx_handler_data);
}

/* Same lookup as ipvlan_port_get_rcu(), using rcu_dereference_bh(). */
static inline struct ipvl_port *ipvlan_port_get_rcu_bh(const struct net_device *d)
{
	return rcu_dereference_bh(d->rx_handler_data);
}

/* Same lookup as ipvlan_port_get_rcu(), using rtnl_dereference(). */
static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d)
{
	return rtnl_dereference(d->rx_handler_data);
}

/* True when IPVLAN_F_PRIVATE is set in @port->flags. */
static inline bool ipvlan_is_private(const struct ipvl_port *port)
{
	return !!(port->flags & IPVLAN_F_PRIVATE);
}

/* Set IPVLAN_F_PRIVATE in @port->flags. */
static inline void ipvlan_mark_private(struct ipvl_port *port)
{
	port->flags |= IPVLAN_F_PRIVATE;
}

/* Clear IPVLAN_F_PRIVATE in @port->flags. */
static inline void ipvlan_clear_private(struct ipvl_port *port)
{
	port->flags &= ~IPVLAN_F_PRIVATE;
}

/* True when IPVLAN_F_VEPA is set in @port->flags. */
static inline bool ipvlan_is_vepa(const struct ipvl_port *port)
{
	return !!(port->flags & IPVLAN_F_VEPA);
}

/* Set IPVLAN_F_VEPA in @port->flags. */
static inline void ipvlan_mark_vepa(struct ipvl_port *port)
{
	port->flags |= IPVLAN_F_VEPA;
}

/* Clear IPVLAN_F_VEPA in @port->flags. */
static inline void ipvlan_clear_vepa(struct ipvl_port *port)
{
	port->flags &= ~IPVLAN_F_VEPA;
}

/* Functions declared here; their definitions are not part of this header. */
void ipvlan_init_secret(void);
unsigned int ipvlan_mac_hash(const unsigned char *addr);
rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
void ipvlan_process_multicast(struct work_struct *work);
int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev);
void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr);
struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
				   const void *iaddr, bool is_v6);
bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6);
void ipvlan_ht_addr_del(struct ipvl_addr *addr);
struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h,
				     int addr_type, bool use_dest);
void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type);
void ipvlan_count_rx(const struct ipvl_dev *ipvlan,
		     unsigned int len, bool success, bool mcast);
int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params,
		    struct netlink_ext_ack *extack);
void ipvlan_link_delete(struct net_device *dev, struct list_head *head);
void ipvlan_link_setup(struct net_device *dev);
int ipvlan_link_register(struct rtnl_link_ops *ops);
/*
 * L3S entry points: real declarations when CONFIG_IPVLAN_L3S is defined,
 * inline stubs otherwise.  The register stub fails with -ENOTSUPP, the init
 * stub returns 0, and the remaining stubs do nothing.
 */
#ifdef CONFIG_IPVLAN_L3S
int ipvlan_l3s_register(struct ipvl_port *port);
void ipvlan_l3s_unregister(struct ipvl_port *port);
void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet);
int ipvlan_l3s_init(void);
void ipvlan_l3s_cleanup(void);
#else
static inline int ipvlan_l3s_register(struct ipvl_port *port)
{
	return -ENOTSUPP;
}

static inline void ipvlan_l3s_unregister(struct ipvl_port *port)
{
}

static inline void ipvlan_migrate_l3s_hook(struct net *oldnet,
					   struct net *newnet)
{
}

static inline int ipvlan_l3s_init(void)
{
	return 0;
}

static inline void ipvlan_l3s_cleanup(void)
{
}
#endif /*
CONFIG_IPVLAN_L3S */ static inline bool netif_is_ipvlan_port(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == ipvlan_handle_frame; } #endif /* __IPVLAN_H */
2856 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KTHREAD_H #define _LINUX_KTHREAD_H /* Simple interface for creating and stopping kernel threads without mess. */ #include <linux/err.h> #include <linux/sched.h> struct mm_struct; /* opaque kthread data */ struct kthread; /* * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will * always remain a kthread. For kthreads p->worker_private always * points to a struct kthread. For tasks that are not kthreads * p->worker_private is used to point to other things. * * Return NULL for any task that is not a kthread. 
*/ static inline struct kthread *tsk_is_kthread(struct task_struct *p) { if (p->flags & PF_KTHREAD) return p->worker_private; return NULL; } __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...); /** * kthread_create - create a kthread on the current node * @threadfn: the function to run in the thread * @data: data pointer for @threadfn() * @namefmt: printf-style format string for the thread name * @arg: arguments for @namefmt. * * This macro will create a kthread on the current node, leaving it in * the stopped state. This is just a helper for kthread_create_on_node(); * see the documentation there for more details. */ #define kthread_create(threadfn, data, namefmt, arg...) \ kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg) struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt); void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk); bool set_kthread_struct(struct task_struct *p); void kthread_set_per_cpu(struct task_struct *k, int cpu); bool kthread_is_per_cpu(struct task_struct *k); /** * kthread_run - create and wake a thread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @namefmt: printf-style name for the thread. * * Description: Convenient wrapper for kthread_create() followed by * wake_up_process(). Returns the kthread or ERR_PTR(-ENOMEM). */ #define kthread_run(threadfn, data, namefmt, ...) \ ({ \ struct task_struct *__k \ = kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \ if (!IS_ERR(__k)) \ wake_up_process(__k); \ __k; \ }) /** * kthread_run_on_cpu - create and wake a cpu bound thread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. 
Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: Convenient wrapper for kthread_create_on_cpu() * followed by wake_up_process(). Returns the kthread or * ERR_PTR(-ENOMEM). */ static inline struct task_struct * kthread_run_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_cpu(threadfn, data, cpu, namefmt); if (!IS_ERR(p)) wake_up_process(p); return p; } void free_kthread_struct(struct task_struct *k); void kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask); int kthread_stop(struct task_struct *k); int kthread_stop_put(struct task_struct *k); bool kthread_should_stop(void); bool kthread_should_park(void); bool kthread_should_stop_or_park(void); bool kthread_freezable_should_stop(bool *was_frozen); void *kthread_func(struct task_struct *k); void *kthread_data(struct task_struct *k); void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); #define kthread_exit(result) do_exit(result) void kthread_complete_and_exit(struct completion *, long) __noreturn; int kthreads_update_housekeeping(void); void kthread_do_exit(struct kthread *, long); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; extern int tsk_fork_get_node(struct task_struct *tsk); /* * Simple work processor based on kthread. * * This provides easier way to make use of kthreads. A kthread_work * can be queued and flushed using queue/kthread_flush_work() * respectively. Queued kthread_works are processed by a kthread * running kthread_worker_fn(). 
*/ struct kthread_work; typedef void (*kthread_work_func_t)(struct kthread_work *work); void kthread_delayed_work_timer_fn(struct timer_list *t); enum { KTW_FREEZABLE = 1 << 0, /* freeze during suspend */ }; struct kthread_worker { unsigned int flags; raw_spinlock_t lock; struct list_head work_list; struct list_head delayed_work_list; struct task_struct *task; struct kthread_work *current_work; }; struct kthread_work { struct list_head node; kthread_work_func_t func; struct kthread_worker *worker; /* Number of canceling calls that are running at the moment. */ int canceling; }; struct kthread_delayed_work { struct kthread_work work; struct timer_list timer; }; #define KTHREAD_WORK_INIT(work, fn) { \ .node = LIST_HEAD_INIT((work).node), \ .func = (fn), \ } #define KTHREAD_DELAYED_WORK_INIT(dwork, fn) { \ .work = KTHREAD_WORK_INIT((dwork).work, (fn)), \ .timer = __TIMER_INITIALIZER(kthread_delayed_work_timer_fn,\ TIMER_IRQSAFE), \ } #define DEFINE_KTHREAD_WORK(work, fn) \ struct kthread_work work = KTHREAD_WORK_INIT(work, fn) #define DEFINE_KTHREAD_DELAYED_WORK(dwork, fn) \ struct kthread_delayed_work dwork = \ KTHREAD_DELAYED_WORK_INIT(dwork, fn) extern void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key); #define kthread_init_worker(worker) \ do { \ static struct lock_class_key __key; \ __kthread_init_worker((worker), "("#worker")->lock", &__key); \ } while (0) #define kthread_init_work(work, fn) \ do { \ memset((work), 0, sizeof(struct kthread_work)); \ INIT_LIST_HEAD(&(work)->node); \ (work)->func = (fn); \ } while (0) #define kthread_init_delayed_work(dwork, fn) \ do { \ kthread_init_work(&(dwork)->work, (fn)); \ timer_setup(&(dwork)->timer, \ kthread_delayed_work_timer_fn, \ TIMER_IRQSAFE); \ } while (0) int kthread_worker_fn(void *worker_ptr); __printf(3, 4) struct kthread_worker *kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...); #define kthread_create_worker(flags, 
namefmt, ...) \ kthread_create_worker_on_node(flags, NUMA_NO_NODE, namefmt, ## __VA_ARGS__); /** * kthread_run_worker - create and wake a kthread worker. * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. * * Description: Convenient wrapper for kthread_create_worker() followed by * wake_up_process(). Returns the kthread_worker or ERR_PTR(-ENOMEM). */ #define kthread_run_worker(flags, namefmt, ...) \ ({ \ struct kthread_worker *__kw \ = kthread_create_worker(flags, namefmt, ## __VA_ARGS__); \ if (!IS_ERR(__kw)) \ wake_up_process(__kw->task); \ __kw; \ }) struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]); /** * kthread_run_worker_on_cpu - create and wake a cpu bound kthread worker. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: Convenient wrapper for kthread_create_worker_on_cpu() * followed by wake_up_process(). Returns the kthread_worker or * ERR_PTR(-ENOMEM). 
*/ static inline struct kthread_worker * kthread_run_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { struct kthread_worker *kw; kw = kthread_create_worker_on_cpu(cpu, flags, namefmt); if (!IS_ERR(kw)) wake_up_process(kw->task); return kw; } bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay); bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay); void kthread_flush_work(struct kthread_work *work); void kthread_flush_worker(struct kthread_worker *worker); bool kthread_cancel_work_sync(struct kthread_work *work); bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); void kthread_use_mm(struct mm_struct *mm); void kthread_unuse_mm(struct mm_struct *mm); struct cgroup_subsys_state; #ifdef CONFIG_BLK_CGROUP void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); #else static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { } #endif #endif /* _LINUX_KTHREAD_H */
11 11 11 11 2 11 11 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 // SPDX-License-Identifier: GPL-2.0-only /* * Network Service Header * * Copyright (c) 2017 Red Hat, Inc. -- Jiri Benc <jbenc@redhat.com> */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/gso.h> #include <net/nsh.h> #include <net/tun_proto.h> int nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh) { struct nshhdr *nh; size_t length = nsh_hdr_len(pushed_nh); u8 next_proto; if (skb->mac_len) { next_proto = TUN_P_ETHERNET; } else { next_proto = tun_p_from_eth_p(skb->protocol); if (!next_proto) return -EAFNOSUPPORT; } /* Add the NSH header */ if (skb_cow_head(skb, length) < 0) return -ENOMEM; skb_push(skb, length); nh = (struct nshhdr *)(skb->data); memcpy(nh, pushed_nh, length); nh->np = next_proto; skb_postpush_rcsum(skb, nh, length); skb->protocol = htons(ETH_P_NSH); skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_mac_len(skb); return 0; } EXPORT_SYMBOL_GPL(nsh_push); int nsh_pop(struct sk_buff *skb) { struct nshhdr *nh; size_t length; __be16 inner_proto; if (!pskb_may_pull(skb, NSH_BASE_HDR_LEN)) return -ENOMEM; nh = (struct nshhdr *)(skb->data); length = nsh_hdr_len(nh); if (length < NSH_BASE_HDR_LEN) return -EINVAL; inner_proto = tun_p_to_eth_p(nh->np); if (!pskb_may_pull(skb, length)) return -ENOMEM; if (!inner_proto) return -EAFNOSUPPORT; skb_pull_rcsum(skb, length); skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_mac_len(skb); skb->protocol = 
inner_proto; return 0; } EXPORT_SYMBOL_GPL(nsh_pop); static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, netdev_features_t features) { unsigned int outer_hlen, mac_len, nsh_len; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 outer_proto, proto; skb_reset_network_header(skb); outer_proto = skb->protocol; outer_hlen = skb_mac_header_len(skb); mac_len = skb->mac_len; if (unlikely(!pskb_may_pull(skb, NSH_BASE_HDR_LEN))) goto out; nsh_len = nsh_hdr_len(nsh_hdr(skb)); if (nsh_len < NSH_BASE_HDR_LEN) goto out; if (unlikely(!pskb_may_pull(skb, nsh_len))) goto out; proto = tun_p_to_eth_p(nsh_hdr(skb)->np); if (!proto) goto out; __skb_pull(skb, nsh_len); skb_reset_mac_header(skb); skb->mac_len = proto == htons(ETH_P_TEB) ? ETH_HLEN : 0; skb->protocol = proto; features &= NETIF_F_SG; segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, htons(ETH_P_NSH), nsh_len, mac_offset, mac_len); goto out; } for (skb = segs; skb; skb = skb->next) { skb->protocol = outer_proto; __skb_push(skb, nsh_len + outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, outer_hlen); skb->mac_len = mac_len; } out: return segs; } static struct packet_offload nsh_packet_offload __read_mostly = { .type = htons(ETH_P_NSH), .priority = 15, .callbacks = { .gso_segment = nsh_gso_segment, }, }; static int __init nsh_init_module(void) { dev_add_offload(&nsh_packet_offload); return 0; } static void __exit nsh_cleanup_module(void) { dev_remove_offload(&nsh_packet_offload); } module_init(nsh_init_module); module_exit(nsh_cleanup_module); MODULE_AUTHOR("Jiri Benc <jbenc@redhat.com>"); MODULE_DESCRIPTION("NSH protocol"); MODULE_LICENSE("GPL v2");
8 49 196 399 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BITMAP_H #define __LINUX_BITMAP_H #ifndef __ASSEMBLY__ #include <linux/align.h> #include <linux/bitops.h> #include <linux/cleanup.h> #include <linux/errno.h> #include <linux/find.h> #include <linux/limits.h> #include <linux/string.h> #include <linux/types.h> #include <linux/bitmap-str.h> struct device; /* * bitmaps provide bit arrays that consume one or more unsigned * longs. 
The bitmap interface and available operations are listed * here, in bitmap.h * * Function implementations generic to all architectures are in * lib/bitmap.c. Functions implementations that are architecture * specific are in various arch/<arch>/include/asm/bitops.h headers * and other arch/<arch> specific files. * * See lib/bitmap.c for more details. */ /** * DOC: bitmap overview * * The available bitmap operations and their rough meaning in the * case that the bitmap is a single unsigned long are thus: * * The generated code is more efficient when nbits is known at * compile-time and at most BITS_PER_LONG. * * :: * * bitmap_zero(dst, nbits) *dst = 0UL * bitmap_fill(dst, nbits) *dst = ~0UL * bitmap_copy(dst, src, nbits) *dst = *src * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 * bitmap_weighted_or(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst * bitmap_weighted_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2. Returns Hamming Weight of dst * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) * bitmap_complement(dst, src, nbits) *dst = ~(*src) * bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal? * bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap? * bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2? * bitmap_empty(src, nbits) Are all bits zero in *src? * bitmap_full(src, nbits) Are all bits set in *src? 
* bitmap_weight(src, nbits) Hamming Weight: number of set bits * bitmap_weight_and(src1, src2, nbits) Hamming Weight of and'ed bitmap * bitmap_weight_andnot(src1, src2, nbits) Hamming Weight of andnot'ed bitmap * bitmap_weight_from(src, start, end) Hamming Weight starting from @start * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area * bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off) as above * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_cut(dst, src, first, n, nbits) Cut n bits from first, copy rest * bitmap_replace(dst, old, new, mask, nbits) *dst = (*old & ~(*mask)) | (*new & *mask) * bitmap_scatter(dst, src, mask, nbits) *dst = map(dense, sparse)(src) * bitmap_gather(dst, src, mask, nbits) *dst = map(sparse, dense)(src) * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap * bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region * bitmap_release_region(bitmap, pos, order) Free specified bit region * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region * bitmap_from_arr32(dst, buf, nbits) Copy nbits from u32[] buf to dst * bitmap_from_arr64(dst, buf, nbits) Copy nbits from u64[] buf to dst * bitmap_to_arr32(buf, src, nbits) Copy nbits from src to u32[] buf * bitmap_to_arr64(buf, src, nbits) Copy nbits 
from src to u64[] buf * bitmap_get_value8(map, start) Get 8bit value from map at start * bitmap_set_value8(map, value, start) Set 8bit value to map at start * bitmap_read(map, start, nbits) Read an nbits-sized value from * map at start * bitmap_write(map, value, start, nbits) Write an nbits-sized value to * map at start * * Note, bitmap_zero() and bitmap_fill() operate over the region of * unsigned longs, that is, bits past the end of the bitmap up to the unsigned long * boundary will be zeroed or filled as well. Consider using * bitmap_clear() or bitmap_set() to make explicit zeroing or filling * respectively. */ /** * DOC: bitmap bitops * * Also the following operations in asm/bitops.h apply to bitmaps.:: * * set_bit(bit, addr) *addr |= bit * clear_bit(bit, addr) *addr &= ~bit * change_bit(bit, addr) *addr ^= bit * test_bit(bit, addr) Is bit set in *addr? * test_and_set_bit(bit, addr) Set bit and return old value * test_and_clear_bit(bit, addr) Clear bit and return old value * test_and_change_bit(bit, addr) Change bit and return old value * find_first_zero_bit(addr, nbits) Position first zero bit in *addr * find_first_bit(addr, nbits) Position first set bit in *addr * find_next_zero_bit(addr, nbits, bit) * Position next zero bit in *addr >= bit * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit * find_next_and_bit(addr1, addr2, nbits, bit) * Same as find_next_bit, but in * (*addr1 & *addr2) * */ /** * DOC: declare bitmap * The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used * to declare an array named 'name' of just enough unsigned longs to * contain all bit positions from 0 to 'bits' - 1. */ /* * Allocation and deallocation of bitmap. * Provided in lib/bitmap.c to avoid circular dependency. 
*/
unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags);
unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags);
unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node);
unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node);
void bitmap_free(const unsigned long *bitmap);

/*
 * Cleanup class named "bitmap": the cleanup expression calls bitmap_free()
 * only when the pointer is non-NULL.
 * NOTE(review): DEFINE_FREE() is defined outside this header (presumably
 * linux/cleanup.h) - confirm the intended __free(bitmap) usage there.
 */
DEFINE_FREE(bitmap, unsigned long *, if (_T) bitmap_free(_T))

/* Managed variants of the above. */
unsigned long *devm_bitmap_alloc(struct device *dev,
				 unsigned int nbits, gfp_t flags);
unsigned long *devm_bitmap_zalloc(struct device *dev,
				  unsigned int nbits, gfp_t flags);

/*
 * lib/bitmap.c provides these functions:
 *
 * The double-underscore entry points are the out-of-line implementations
 * that the inline wrappers further down in this header fall back to when
 * the bitmap does not fit the small_const_nbits() fast path.
 */
bool __bitmap_equal(const unsigned long *bitmap1,
		    const unsigned long *bitmap2, unsigned int nbits);
bool __pure __bitmap_or_equal(const unsigned long *src1,
			      const unsigned long *src2,
			      const unsigned long *src3,
			      unsigned int nbits);
void __bitmap_complement(unsigned long *dst, const unsigned long *src,
			 unsigned int nbits);
void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
			  unsigned int shift, unsigned int nbits);
void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
			 unsigned int shift, unsigned int nbits);
void bitmap_cut(unsigned long *dst, const unsigned long *src,
		unsigned int first, unsigned int cut, unsigned int nbits);
bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
		 const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
		 const unsigned long *bitmap2, unsigned int nbits);
unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1,
				  const unsigned long *bitmap2, unsigned int nbits);
unsigned int __bitmap_weighted_xor(unsigned long *dst, const unsigned long *bitmap1,
				   const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
		  const unsigned long *bitmap2, unsigned int nbits);
bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
		    const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_replace(unsigned long *dst,
		      const unsigned long *old, const unsigned long *new,
		      const unsigned long *mask, unsigned int nbits);
bool __bitmap_intersects(const unsigned long *bitmap1,
			 const unsigned long *bitmap2, unsigned int nbits);
bool __bitmap_subset(const unsigned long *bitmap1,
		     const unsigned long *bitmap2, unsigned int nbits);
unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
unsigned int __bitmap_weight_and(const unsigned long *bitmap1,
				 const unsigned long *bitmap2, unsigned int nbits);
unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1,
				    const unsigned long *bitmap2, unsigned int nbits);
/* Note: unlike the inline bitmap_set()/bitmap_clear(), @len is a signed int. */
void __bitmap_set(unsigned long *map, unsigned int start, int len);
void __bitmap_clear(unsigned long *map, unsigned int start, int len);

unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
					     unsigned long size,
					     unsigned long start,
					     unsigned int nr,
					     unsigned long align_mask,
					     unsigned long align_offset);

/**
 * bitmap_find_next_zero_area - find a contiguous aligned zero area
 * @map: The address to base the search on
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 * @nr: The number of zeroed bits we're looking for
 * @align_mask: Alignment mask for zero area
 *
 * The @align_mask should be one less than a power of 2; the effect is that
 * the bit offset of every zero area this function finds is a multiple of that
 * power of 2. A @align_mask of 0 means no alignment is required.
*/ static __always_inline unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask) { return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0); } void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, unsigned int nbits); int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits); void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits); void bitmap_fold(unsigned long *dst, const unsigned long *orig, unsigned int sz, unsigned int nbits); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) #define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = 0; else memset(dst, 0, len); } static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = ~0UL; else memset(dst, 0xff, len); } static __always_inline void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits) { unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = *src; else memcpy(dst, src, len); } /* * Copy bitmap and clear tail bits in last word. 
*/ static __always_inline void bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits) { bitmap_copy(dst, src, nbits); if (nbits % BITS_PER_LONG) dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); } static inline void bitmap_copy_and_extend(unsigned long *to, const unsigned long *from, unsigned int count, unsigned int size) { unsigned int copy = BITS_TO_LONGS(count); memcpy(to, from, copy * sizeof(long)); if (count % BITS_PER_LONG) to[copy - 1] &= BITMAP_LAST_WORD_MASK(count); memset(to + copy, 0, bitmap_size(size) - copy * sizeof(long)); } /* * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64 * machines the order of hi and lo parts of numbers match the bitmap structure. * In both cases conversion is not needed when copying data from/to arrays of * u32. But in LE64 case, typecast in bitmap_copy_clear_tail() may lead * to out-of-bound access. To avoid that, both LE and BE variants of 64-bit * architectures are not using bitmap_copy_clear_tail(). */ #if BITS_PER_LONG == 64 void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits); void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits); #else #define bitmap_from_arr32(bitmap, buf, nbits) \ bitmap_copy_clear_tail((unsigned long *) (bitmap), \ (const unsigned long *) (buf), (nbits)) #define bitmap_to_arr32(buf, bitmap, nbits) \ bitmap_copy_clear_tail((unsigned long *) (buf), \ (const unsigned long *) (bitmap), (nbits)) #endif /* * On 64-bit systems bitmaps are represented as u64 arrays internally. So, * the conversion is not needed when copying data from/to arrays of u64. 
*/ #if BITS_PER_LONG == 32 void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits); void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); #else #define bitmap_from_arr64(bitmap, buf, nbits) \ bitmap_copy_clear_tail((unsigned long *)(bitmap), (const unsigned long *)(buf), (nbits)) #define bitmap_to_arr64(buf, bitmap, nbits) \ bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) #endif static __always_inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_and(dst, src1, src2, nbits); } static __always_inline void bitmap_or(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 | *src2; else __bitmap_or(dst, src1, src2, nbits); } static __always_inline unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) { *dst = *src1 | *src2; return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits)); } else { return __bitmap_weighted_or(dst, src1, src2, nbits); } } static __always_inline unsigned int bitmap_weighted_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) { *dst = *src1 ^ *src2; return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits)); } else { return __bitmap_weighted_xor(dst, src1, src2, nbits); } } static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 ^ *src2; else __bitmap_xor(dst, src1, src2, nbits); } static __always_inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, 
unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_andnot(dst, src1, src2, nbits); } static __always_inline void bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = ~(*src); else __bitmap_complement(dst, src, nbits); } #ifdef __LITTLE_ENDIAN #define BITMAP_MEM_ALIGNMENT 8 #else #define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long)) #endif #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) static __always_inline bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return !((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); if (__builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) return !memcmp(src1, src2, nbits / 8); return __bitmap_equal(src1, src2, nbits); } /** * bitmap_or_equal - Check whether the or of two bitmaps is equal to a third * @src1: Pointer to bitmap 1 * @src2: Pointer to bitmap 2 will be or'ed with bitmap 1 * @src3: Pointer to bitmap 3. 
Compare to the result of *@src1 | *@src2 * @nbits: number of bits in each of these bitmaps * * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise */ static __always_inline bool bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, const unsigned long *src3, unsigned int nbits) { if (!small_const_nbits(nbits)) return __bitmap_or_equal(src1, src2, src3, nbits); return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); } static __always_inline bool bitmap_intersects(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; else return __bitmap_intersects(src1, src2, nbits); } static __always_inline bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); else return __bitmap_subset(src1, src2, nbits); } static __always_inline bool bitmap_empty(const unsigned long *src, unsigned nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); return find_first_bit(src, nbits) == nbits; } static __always_inline bool bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! 
(~(*src) & BITMAP_LAST_WORD_MASK(nbits)); return find_first_zero_bit(src, nbits) == nbits; } static __always_inline unsigned int bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight(src, nbits); } static __always_inline unsigned long bitmap_weight_and(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight_and(src1, src2, nbits); } static __always_inline unsigned long bitmap_weight_andnot(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight_andnot(src1, src2, nbits); } /** * bitmap_weight_from - Hamming weight for a memory region * @bitmap: The base address * @start: The bitnumber to starts weighting * @end: the bitmap size in bits * * Returns the number of set bits in the region. If @start >= @end, * return >= end. 
*/ static __always_inline unsigned long bitmap_weight_from(const unsigned long *bitmap, unsigned int start, unsigned int end) { unsigned long w; if (unlikely(start >= end)) return end; if (small_const_nbits(end)) return hweight_long(*bitmap & GENMASK(end - 1, start)); bitmap += start / BITS_PER_LONG; /* Opencode round_down() to not include math.h */ end -= start & ~(BITS_PER_LONG - 1); start %= BITS_PER_LONG; w = bitmap_weight(bitmap, end); if (start) w -= hweight_long(*bitmap & BITMAP_LAST_WORD_MASK(start)); return w; } static __always_inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) __set_bit(start, map); else if (small_const_nbits(start + nbits)) *map |= GENMASK(start + nbits - 1, start); else if (__builtin_constant_p(start & BITMAP_MEM_MASK) && IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) && __builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) memset((char *)map + start / 8, 0xff, nbits / 8); else __bitmap_set(map, start, nbits); } static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) __clear_bit(start, map); else if (small_const_nbits(start + nbits)) *map &= ~GENMASK(start + nbits - 1, start); else if (__builtin_constant_p(start & BITMAP_MEM_MASK) && IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) && __builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) memset((char *)map + start / 8, 0, nbits / 8); else __bitmap_clear(map, start, nbits); } static __always_inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift; else __bitmap_shift_right(dst, src, shift, nbits); } static __always_inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, 
unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits); else __bitmap_shift_left(dst, src, shift, nbits); } static __always_inline void bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*old & ~(*mask)) | (*new & *mask); else __bitmap_replace(dst, old, new, mask, nbits); } /** * bitmap_scatter - Scatter a bitmap according to the given mask * @dst: scattered bitmap * @src: gathered bitmap * @mask: mask representing bits to assign to in the scattered bitmap * @nbits: number of bits in each of these bitmaps * * Scatters bitmap with sequential bits according to the given @mask. * * Example: * If @src bitmap = 0x005a, with @mask = 0x1313, @dst will be 0x0302. * * Or in binary form * @src @mask @dst * 0000000001011010 0001001100010011 0000001100000010 * * (Bits 0, 1, 2, 3, 4, 5 are copied to the bits 0, 1, 4, 8, 9, 12) * * A more 'visual' description of the operation:: * * src: 0000000001011010 * |||||| * +------+||||| * | +----+|||| * | |+----+||| * | || +-+|| * | || | || * mask: ...v..vv...v..vv * ...0..11...0..10 * dst: 0000001100000010 * * A relationship exists between bitmap_scatter() and bitmap_gather(). See * bitmap_gather() for the bitmap gather detailed operations. TL;DR: * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation. 
*/
static __always_inline
void bitmap_scatter(unsigned long *dst, const unsigned long *src,
		    const unsigned long *mask, unsigned int nbits)
{
	unsigned int to;
	unsigned int from = 0;

	bitmap_zero(dst, nbits);

	/* Consecutive @src bits land on successive set positions of @mask. */
	for_each_set_bit(to, mask, nbits) {
		__assign_bit(to, dst, test_bit(from, src));
		from++;
	}
}

/**
 * bitmap_gather - Gather a bitmap according to given mask
 * @dst: gathered bitmap
 * @src: scattered bitmap
 * @mask: mask representing bits to extract from in the scattered bitmap
 * @nbits: number of bits in each of these bitmaps
 *
 * Gathers bitmap with sparse bits according to the given @mask.
 *
 * Example:
 * If @src bitmap = 0x0302, with @mask = 0x1313, @dst will be 0x001a.
 *
 * Or in binary form
 * @src			@mask			@dst
 * 0000001100000010	0001001100010011	0000000000011010
 *
 * (Bits 0, 1, 4, 8, 9, 12 are copied to the bits 0, 1, 2, 3, 4, 5)
 *
 * A more 'visual' description of the operation::
 *
 *	mask:  ...v..vv...v..vv
 *	src:   0000001100000010
 *	          ^  ^^   ^   0
 *	          |  ||   |  10
 *	          |  ||   > 010
 *	          |  |+--> 1010
 *	          |  +--> 11010
 *	          +----> 011010
 *	dst:   0000000000011010
 *
 * A relationship exists between bitmap_gather() and bitmap_scatter(). See
 * bitmap_scatter() for the bitmap scatter detailed operations. TL;DR:
 * bitmap_scatter() can be seen as the 'reverse' bitmap_gather() operation.
 *
 * Suppose scattered computed using bitmap_scatter(scattered, src, mask, n).
 * The operation bitmap_gather(result, scattered, mask, n) leads to a result
 * equal or equivalent to src.
 *
 * The result can be 'equivalent' because bitmap_scatter() and bitmap_gather()
 * are not bijective.
 * The result and src values are equivalent in the sense that a call to
 * bitmap_scatter(res, src, mask, n) and a call to
 * bitmap_scatter(res, result, mask, n) will lead to the same res value.
*/ static __always_inline void bitmap_gather(unsigned long *dst, const unsigned long *src, const unsigned long *mask, unsigned int nbits) { unsigned int n = 0; unsigned int bit; bitmap_zero(dst, nbits); for_each_set_bit(bit, mask, nbits) __assign_bit(n++, dst, test_bit(bit, src)); } static __always_inline void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) { *rs = find_next_bit(bitmap, end, *rs); *re = find_next_zero_bit(bitmap, end, *rs + 1); } /** * bitmap_release_region - release allocated bitmap region * @bitmap: array of unsigned longs corresponding to the bitmap * @pos: beginning of bit region to release * @order: region size (log base 2 of number of bits) to release * * This is the complement to __bitmap_find_free_region() and releases * the found region (by clearing it in the bitmap). */ static __always_inline void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order) { bitmap_clear(bitmap, pos, BIT(order)); } /** * bitmap_allocate_region - allocate bitmap region * @bitmap: array of unsigned longs corresponding to the bitmap * @pos: beginning of bit region to allocate * @order: region size (log base 2 of number of bits) to allocate * * Allocate (set bits in) a specified region of a bitmap. * * Returns: 0 on success, or %-EBUSY if specified region wasn't * free (not all bits were zero). */ static __always_inline int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) { unsigned int len = BIT(order); if (find_next_bit(bitmap, pos + len, pos) < pos + len) return -EBUSY; bitmap_set(bitmap, pos, len); return 0; } /** * bitmap_find_free_region - find a contiguous aligned mem region * @bitmap: array of unsigned longs corresponding to the bitmap * @bits: number of bits in the bitmap * @order: region size (log base 2 of number of bits) to find * * Find a region of free (zero) bits in a @bitmap of @bits bits and * allocate them (set them to one). 
Only consider regions of length
 * a power (@order) of two, aligned to that power of two, which
 * makes the search algorithm much faster.
 *
 * Returns: the bit offset in bitmap of the allocated region,
 * or -errno on failure.
 */
static __always_inline
int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
{
	unsigned int pos = 0;
	unsigned int end;

	/* Walk the bitmap one aligned 2^order region at a time. */
	while ((end = pos + BIT(order)) <= bits) {
		if (bitmap_allocate_region(bitmap, pos, order) == 0)
			return pos;
		pos = end;
	}

	return -ENOMEM;
}

/**
 * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap.
 * @n: u64 value
 *
 * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit
 * integers in 32-bit environment, and 64-bit integers in 64-bit one.
 *
 * There are four combinations of endianness and length of the word in linux
 * ABIs: LE64, BE64, LE32 and BE32.
 *
 * On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in
 * bitmaps and therefore don't require any special handling.
 *
 * On 32-bit kernels 32-bit LE ABI orders lo word of 64-bit number in memory
 * prior to hi, and 32-bit BE orders hi word prior to lo. The bitmap on the
 * other hand is represented as an array of 32-bit words and the position of
 * bit N may therefore be calculated as: word #(N/32) and bit #(N%32) in that
 * word. For example, bit #42 is located at 10th position of 2nd word.
 * It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit
 * values in memory as it usually does. But for BE we need to swap hi and lo
 * words manually.
 *
 * With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and
 * lo parts of u64. For LE32 it does nothing, and for BE environment it swaps
 * hi and lo words, as is expected by bitmap.
*/ #if __BITS_PER_LONG == 64 #define BITMAP_FROM_U64(n) (n) #else #define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \ ((unsigned long) ((u64)(n) >> 32)) #endif /** * bitmap_from_u64 - Check and swap words within u64. * @mask: source bitmap * @dst: destination bitmap * * In 32-bit Big Endian kernel, when using ``(u32 *)(&val)[*]`` * to read u64 mask, we will get the wrong word. * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, * but we expect the lower 32-bits of u64. */ static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask) { bitmap_from_arr64(dst, &mask, 64); } /** * bitmap_read - read a value of n-bits from the memory region * @map: address to the bitmap memory region * @start: bit offset of the n-bit value * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG * * Returns: value of @nbits bits located at the @start bit offset within the * @map memory region. For @nbits = 0 and @nbits > BITS_PER_LONG the return * value is undefined. */ static __always_inline unsigned long bitmap_read(const unsigned long *map, unsigned long start, unsigned long nbits) { size_t index = BIT_WORD(start); unsigned long offset = start % BITS_PER_LONG; unsigned long space = BITS_PER_LONG - offset; unsigned long value_low, value_high; if (unlikely(!nbits || nbits > BITS_PER_LONG)) return 0; if (space >= nbits) return (map[index] >> offset) & BITMAP_LAST_WORD_MASK(nbits); value_low = map[index] & BITMAP_FIRST_WORD_MASK(start); value_high = map[index + 1] & BITMAP_LAST_WORD_MASK(start + nbits); return (value_low >> offset) | (value_high << space); } /** * bitmap_write - write n-bit value within a memory region * @map: address to the bitmap memory region * @value: value to write, clamped to nbits * @start: bit offset of the n-bit value * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG. * * bitmap_write() behaves as-if implemented as @nbits calls of __assign_bit(), * i.e. 
bits beyond @nbits are ignored: * * for (bit = 0; bit < nbits; bit++) * __assign_bit(start + bit, bitmap, val & BIT(bit)); * * For @nbits == 0 and @nbits > BITS_PER_LONG no writes are performed. */ static __always_inline void bitmap_write(unsigned long *map, unsigned long value, unsigned long start, unsigned long nbits) { size_t index; unsigned long offset; unsigned long space; unsigned long mask; bool fit; if (unlikely(!nbits || nbits > BITS_PER_LONG)) return; mask = BITMAP_LAST_WORD_MASK(nbits); value &= mask; offset = start % BITS_PER_LONG; space = BITS_PER_LONG - offset; fit = space >= nbits; index = BIT_WORD(start); map[index] &= (fit ? (~(mask << offset)) : ~BITMAP_FIRST_WORD_MASK(start)); map[index] |= value << offset; if (fit) return; map[index + 1] &= BITMAP_FIRST_WORD_MASK(start + nbits); map[index + 1] |= (value >> space); } #define bitmap_get_value8(map, start) \ bitmap_read(map, start, BITS_PER_BYTE) #define bitmap_set_value8(map, value, start) \ bitmap_write(map, value, start, BITS_PER_BYTE) #endif /* __ASSEMBLY__ */ #endif /* __LINUX_BITMAP_H */
2 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747
#ifndef _NET_FLOW_OFFLOAD_H
#define _NET_FLOW_OFFLOAD_H

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/netlink.h>
#include <net/flow_dissector.h>

/*
 * Generic match description: a dissector plus untyped pointers to the mask
 * and key data.
 * NOTE(review): presumably the dissector describes the layout of @key/@mask -
 * confirm against the users of this struct.
 */
struct flow_match {
	struct flow_dissector	*dissector;
	void			*mask;
	void			*key;
};

/*
 * Typed views of a match: every flow_match_* struct below pairs a @key
 * pointer with a @mask pointer of the same flow_dissector_key_* type.
 */
struct flow_match_meta {
	struct flow_dissector_key_meta *key, *mask;
};

struct flow_match_basic {
	struct flow_dissector_key_basic *key, *mask;
};

struct flow_match_control {
	struct flow_dissector_key_control *key, *mask;
};

struct flow_match_eth_addrs {
	struct flow_dissector_key_eth_addrs *key, *mask;
};

struct flow_match_vlan {
	struct flow_dissector_key_vlan *key, *mask;
};

struct flow_match_arp {
	struct flow_dissector_key_arp *key, *mask;
};

struct flow_match_ipv4_addrs {
	struct flow_dissector_key_ipv4_addrs *key, *mask;
};

struct flow_match_ipv6_addrs {
	struct flow_dissector_key_ipv6_addrs *key, *mask;
};

struct flow_match_ip {
	struct flow_dissector_key_ip *key, *mask;
};

struct flow_match_ports {
	struct flow_dissector_key_ports *key, *mask;
};

struct flow_match_ports_range {
	struct flow_dissector_key_ports_range *key, *mask;
};

struct flow_match_icmp {
	struct flow_dissector_key_icmp *key, *mask;
};

struct flow_match_tcp {
	struct flow_dissector_key_tcp *key, *mask;
};

struct flow_match_ipsec {
	struct flow_dissector_key_ipsec *key, *mask;
};

struct flow_match_mpls {
	struct flow_dissector_key_mpls *key, *mask;
};

/* Uses the generic keyid dissector key rather than an enc-specific type. */
struct flow_match_enc_keyid {
	struct flow_dissector_key_keyid *key, *mask;
};

struct flow_match_enc_opts {
	struct flow_dissector_key_enc_opts *key, *mask;
};

struct flow_match_ct {
	struct flow_dissector_key_ct *key, *mask;
};

struct flow_match_pppoe {
	struct flow_dissector_key_pppoe *key, *mask;
};

struct flow_match_l2tpv3 {
	struct flow_dissector_key_l2tpv3 *key, *mask;
};

/* Only used through pointers in the prototypes below. */
struct flow_rule;

/*
 * flow_rule_match_*() accessors: each takes a rule and an @out parameter of
 * the corresponding flow_match_* type.
 * NOTE(review): presumably @out is filled with the rule's key/mask pointers;
 * the implementations are not part of this header - confirm there.
 */
void flow_rule_match_meta(const struct flow_rule *rule,
			  struct flow_match_meta *out);
void flow_rule_match_basic(const struct flow_rule *rule,
			   struct flow_match_basic *out);
void flow_rule_match_control(const struct flow_rule *rule,
			     struct flow_match_control *out);
void flow_rule_match_eth_addrs(const struct flow_rule *rule,
			       struct flow_match_eth_addrs *out);
void flow_rule_match_vlan(const struct flow_rule *rule,
			  struct flow_match_vlan *out);
/* The cvlan accessor shares struct flow_match_vlan with the vlan one. */
void flow_rule_match_cvlan(const struct flow_rule *rule,
			   struct flow_match_vlan *out);
void flow_rule_match_arp(const struct flow_rule *rule,
			 struct flow_match_arp *out);
void flow_rule_match_ipv4_addrs(const struct flow_rule *rule,
				struct flow_match_ipv4_addrs *out);
void flow_rule_match_ipv6_addrs(const struct flow_rule *rule,
				struct flow_match_ipv6_addrs *out);
void flow_rule_match_ip(const struct flow_rule *rule,
			struct flow_match_ip *out);
void flow_rule_match_ports(const struct flow_rule *rule,
			   struct flow_match_ports *out);
void flow_rule_match_ports_range(const struct flow_rule *rule,
				 struct flow_match_ports_range *out);
void flow_rule_match_tcp(const struct flow_rule *rule,
			 struct flow_match_tcp *out);
void flow_rule_match_ipsec(const struct flow_rule *rule,
			   struct flow_match_ipsec *out);
void flow_rule_match_icmp(const struct flow_rule *rule,
			  struct flow_match_icmp *out);
void flow_rule_match_mpls(const struct flow_rule *rule,
			  struct flow_match_mpls *out);

/*
 * Accessors for the "enc_" keys. As the signatures show, enc_control,
 * enc_ipv4_addrs, enc_ipv6_addrs, enc_ip and enc_ports take the plain
 * flow_match_control/ipv4_addrs/ipv6_addrs/ip/ports containers, while
 * enc_keyid and enc_opts have containers of their own.
 */
void flow_rule_match_enc_control(const struct flow_rule *rule,
				 struct flow_match_control *out);
void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule,
				    struct flow_match_ipv4_addrs *out);
void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule,
				    struct flow_match_ipv6_addrs *out);
void flow_rule_match_enc_ip(const struct flow_rule *rule,
			    struct flow_match_ip *out);
void flow_rule_match_enc_ports(const struct flow_rule *rule,
			       struct flow_match_ports *out);
void flow_rule_match_enc_keyid(const struct flow_rule *rule,
			       struct flow_match_enc_keyid *out);
void flow_rule_match_enc_opts(const struct flow_rule *rule,
			      struct flow_match_enc_opts *out);
void flow_rule_match_ct(const struct flow_rule *rule,
			struct flow_match_ct *out);
void flow_rule_match_pppoe(const struct flow_rule *rule,
			   struct flow_match_pppoe *out);
void flow_rule_match_l2tpv3(const struct flow_rule *rule,
			    struct flow_match_l2tpv3 *out);

/*
 * Identifiers of the actions that can be described by a flow_action_entry.
 * Numbering starts at 0 and is sequential, so NUM_FLOW_ACTIONS, being the
 * last enumerator, equals the number of action ids listed before it.
 */
enum flow_action_id {
	FLOW_ACTION_ACCEPT		= 0,
	FLOW_ACTION_DROP,
	FLOW_ACTION_TRAP,
	FLOW_ACTION_GOTO,
	FLOW_ACTION_REDIRECT,
	FLOW_ACTION_MIRRED,
	FLOW_ACTION_REDIRECT_INGRESS,
	FLOW_ACTION_MIRRED_INGRESS,
	FLOW_ACTION_VLAN_PUSH,
	FLOW_ACTION_VLAN_POP,
	FLOW_ACTION_VLAN_MANGLE,
	FLOW_ACTION_TUNNEL_ENCAP,
	FLOW_ACTION_TUNNEL_DECAP,
	FLOW_ACTION_MANGLE,
	FLOW_ACTION_ADD,
	FLOW_ACTION_CSUM,
	FLOW_ACTION_MARK,
	FLOW_ACTION_PTYPE,
	FLOW_ACTION_PRIORITY,
	FLOW_ACTION_RX_QUEUE_MAPPING,
	FLOW_ACTION_WAKE,
	FLOW_ACTION_QUEUE,
	FLOW_ACTION_SAMPLE,
	FLOW_ACTION_POLICE,
	FLOW_ACTION_CT,
	FLOW_ACTION_CT_METADATA,
	FLOW_ACTION_MPLS_PUSH,
	FLOW_ACTION_MPLS_POP,
	FLOW_ACTION_MPLS_MANGLE,
	FLOW_ACTION_GATE,
	FLOW_ACTION_PPPOE_PUSH,
	FLOW_ACTION_JUMP,
	FLOW_ACTION_PIPE,
	FLOW_ACTION_VLAN_PUSH_ETH,
	FLOW_ACTION_VLAN_POP_ETH,
	FLOW_ACTION_CONTINUE,
	NUM_FLOW_ACTIONS,
};

/* This is mirroring enum pedit_header_type definition for easy mapping between
 * tc pedit action.
Legacy TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK is mapped to
 * FLOW_ACT_MANGLE_UNSPEC, which is supported by no driver.
 */
enum flow_action_mangle_base {
	FLOW_ACT_MANGLE_UNSPEC		= 0,
	FLOW_ACT_MANGLE_HDR_TYPE_ETH,
	FLOW_ACT_MANGLE_HDR_TYPE_IP4,
	FLOW_ACT_MANGLE_HDR_TYPE_IP6,
	FLOW_ACT_MANGLE_HDR_TYPE_TCP,
	FLOW_ACT_MANGLE_HDR_TYPE_UDP,
};

/*
 * Bit positions used to build enum flow_action_hw_stats below.
 * FLOW_ACTION_HW_STATS_NUM_BITS is the number of bit positions defined.
 */
enum flow_action_hw_stats_bit {
	FLOW_ACTION_HW_STATS_IMMEDIATE_BIT,
	FLOW_ACTION_HW_STATS_DELAYED_BIT,
	FLOW_ACTION_HW_STATS_DISABLED_BIT,

	FLOW_ACTION_HW_STATS_NUM_BITS
};

/*
 * HW stats type masks. ANY is the union of IMMEDIATE and DELAYED;
 * DONT_CARE is BIT(NUM_BITS) - 1, i.e. every defined bit set.
 */
enum flow_action_hw_stats {
	FLOW_ACTION_HW_STATS_IMMEDIATE =
		BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT),
	FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT),
	FLOW_ACTION_HW_STATS_ANY = FLOW_ACTION_HW_STATS_IMMEDIATE |
				   FLOW_ACTION_HW_STATS_DELAYED,
	FLOW_ACTION_HW_STATS_DISABLED =
		BIT(FLOW_ACTION_HW_STATS_DISABLED_BIT),
	FLOW_ACTION_HW_STATS_DONT_CARE = BIT(FLOW_ACTION_HW_STATS_NUM_BITS) - 1,
};

/* Destructor callback type; stored per entry next to destructor_priv. */
typedef void (*action_destr)(void *priv);

/*
 * Variable-length cookie: a length field followed by a flexible array.
 * NOTE(review): cookie_len presumably is the byte length of cookie[] -
 * confirm against flow_action_cookie_create().
 */
struct flow_action_cookie {
	u32 cookie_len;
	u8 cookie[];
};

struct flow_action_cookie *flow_action_cookie_create(void *data,
						     unsigned int len,
						     gfp_t gfp);
void flow_action_cookie_destroy(struct flow_action_cookie *cookie);

/*
 * Parameters of a police action. The exceed/notexceed members each carry an
 * action id plus an extra value.
 */
struct flow_action_police {
	u32			burst;
	u64			rate_bytes_ps;
	u64			peakrate_bytes_ps;
	u32			avrate;
	u16			overhead;
	u64			burst_pkt;
	u64			rate_pkt_ps;
	u32			mtu;
	struct {
		enum flow_action_id	act_id;
		u32			extval;
	} exceed, notexceed;
};

/*
 * One action of a flow rule. @id selects which member of the anonymous union
 * is meaningful; the per-member comments name the corresponding action id.
 */
struct flow_action_entry {
	enum flow_action_id		id;
	u32				hw_index;
	unsigned long			cookie;
	u64				miss_cookie;
	enum flow_action_hw_stats	hw_stats;
	action_destr			destructor;
	void				*destructor_priv;
	union {
		u32			chain_index;	/* FLOW_ACTION_GOTO */
		struct net_device	*dev;		/* FLOW_ACTION_REDIRECT */
		struct {				/* FLOW_ACTION_VLAN */
			u16		vid;
			__be16		proto;
			u8		prio;
		} vlan;
		struct {				/* FLOW_ACTION_VLAN_PUSH_ETH */
			unsigned char dst[ETH_ALEN];
			unsigned char src[ETH_ALEN];
		} vlan_push_eth;
		struct {				/* FLOW_ACTION_MANGLE */
							/* FLOW_ACTION_ADD */
			enum flow_action_mangle_base htype;
			u32		offset;
			u32		mask;
			u32		val;
		}
mangle;
		struct ip_tunnel_info	*tunnel;	/* FLOW_ACTION_TUNNEL_ENCAP */
		u32			csum_flags;	/* FLOW_ACTION_CSUM */
		u32			mark;		/* FLOW_ACTION_MARK */
		u16			ptype;		/* FLOW_ACTION_PTYPE */
		u16			rx_queue;	/* FLOW_ACTION_RX_QUEUE_MAPPING */
		u32			priority;	/* FLOW_ACTION_PRIORITY */
		struct {				/* FLOW_ACTION_QUEUE */
			u32		ctx;
			u32		index;
			u8		vf;
		} queue;
		struct {				/* FLOW_ACTION_SAMPLE */
			struct psample_group	*psample_group;
			u32			rate;
			u32			trunc_size;
			bool			truncate;
		} sample;
		struct flow_action_police police;	/* FLOW_ACTION_POLICE */
		struct {				/* FLOW_ACTION_CT */
			int action;
			u16 zone;
			struct nf_flowtable *flow_table;
		} ct;
		/* NOTE(review): presumably used with FLOW_ACTION_CT_METADATA. */
		struct {
			unsigned long cookie;
			u32 mark;
			u32 labels[4];
			bool orig_dir;
		} ct_metadata;
		struct {				/* FLOW_ACTION_MPLS_PUSH */
			u32		label;
			__be16		proto;
			u8		tc;
			u8		bos;
			u8		ttl;
		} mpls_push;
		struct {				/* FLOW_ACTION_MPLS_POP */
			__be16		proto;
		} mpls_pop;
		struct {				/* FLOW_ACTION_MPLS_MANGLE */
			u32		label;
			u8		tc;
			u8		bos;
			u8		ttl;
		} mpls_mangle;
		/* NOTE(review): presumably used with FLOW_ACTION_GATE. */
		struct {
			s32		prio;
			u64		basetime;
			u64		cycletime;
			u64		cycletimeext;
			u32		num_entries;
			struct action_gate_entry *entries;
		} gate;
		struct {				/* FLOW_ACTION_PPPOE_PUSH */
			u16		sid;
		} pppoe;
	};
	struct flow_action_cookie *user_cookie; /* user defined action cookie */
};

/*
 * Array of actions. entries[] is a flexible array member whose element count
 * is given by num_entries (annotated with __counted_by).
 */
struct flow_action {
	unsigned int			num_entries;
	struct flow_action_entry	entries[] __counted_by(num_entries);
};

/* Return true when @action holds at least one entry (num_entries != 0). */
static inline bool flow_action_has_entries(const struct flow_action *action)
{
	return action->num_entries;
}

/**
 * flow_offload_has_one_action() - check if exactly one action is present
 * @action: tc filter flow offload action
 *
 * Return: true if exactly one action is present.
*/ static inline bool flow_offload_has_one_action(const struct flow_action *action) { return action->num_entries == 1; } static inline bool flow_action_is_last_entry(const struct flow_action *action, const struct flow_action_entry *entry) { return entry == &action->entries[action->num_entries - 1]; } #define flow_action_for_each(__i, __act, __actions) \ for (__i = 0, __act = &(__actions)->entries[0]; \ __i < (__actions)->num_entries; \ __act = &(__actions)->entries[++__i]) static inline bool flow_action_mixed_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack) { const struct flow_action_entry *action_entry; u8 last_hw_stats; int i; if (flow_offload_has_one_action(action)) return true; flow_action_for_each(i, action_entry, action) { if (i && action_entry->hw_stats != last_hw_stats) { NL_SET_ERR_MSG_MOD(extack, "Mixing HW stats types for actions is not supported"); return false; } last_hw_stats = action_entry->hw_stats; } return true; } static inline const struct flow_action_entry * flow_action_first_entry_get(const struct flow_action *action) { WARN_ON(!flow_action_has_entries(action)); return &action->entries[0]; } static inline bool __flow_action_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack, bool check_allow_bit, enum flow_action_hw_stats_bit allow_bit) { const struct flow_action_entry *action_entry; if (!flow_action_has_entries(action)) return true; if (!flow_action_mixed_hw_stats_check(action, extack)) return false; action_entry = flow_action_first_entry_get(action); /* Zero is not a legal value for hw_stats, catch anyone passing it */ WARN_ON_ONCE(!action_entry->hw_stats); if (!check_allow_bit && ~action_entry->hw_stats & FLOW_ACTION_HW_STATS_ANY) { NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\""); return false; } else if (check_allow_bit && !(action_entry->hw_stats & BIT(allow_bit))) { NL_SET_ERR_MSG_MOD(extack, "Driver does not support selected HW stats type"); 
return false; } return true; } static inline bool flow_action_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack, enum flow_action_hw_stats_bit allow_bit) { return __flow_action_hw_stats_check(action, extack, true, allow_bit); } static inline bool flow_action_basic_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack) { return __flow_action_hw_stats_check(action, extack, false, 0); } struct flow_rule { struct flow_match match; struct flow_action action; }; struct flow_rule *flow_rule_alloc(unsigned int num_actions); static inline bool flow_rule_match_key(const struct flow_rule *rule, enum flow_dissector_key_id key) { return dissector_uses_key(rule->match.dissector, key); } /** * flow_rule_is_supp_control_flags() - check for supported control flags * @supp_flags: control flags supported by driver * @ctrl_flags: control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if only supported control flags are set, false otherwise. */ static inline bool flow_rule_is_supp_control_flags(const u32 supp_flags, const u32 ctrl_flags, struct netlink_ext_ack *extack) { if (likely((ctrl_flags & ~supp_flags) == 0)) return true; NL_SET_ERR_MSG_FMT_MOD(extack, "Unsupported match on control.flags %#x", ctrl_flags); return false; } /** * flow_rule_is_supp_enc_control_flags() - check for supported control flags * @supp_enc_flags: encapsulation control flags supported by driver * @enc_ctrl_flags: encapsulation control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if only supported control flags are set, false otherwise. 
*/ static inline bool flow_rule_is_supp_enc_control_flags(const u32 supp_enc_flags, const u32 enc_ctrl_flags, struct netlink_ext_ack *extack) { if (likely((enc_ctrl_flags & ~supp_enc_flags) == 0)) return true; NL_SET_ERR_MSG_FMT_MOD(extack, "Unsupported match on enc_control.flags %#x", enc_ctrl_flags); return false; } /** * flow_rule_has_control_flags() - check for presence of any control flags * @ctrl_flags: control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if control flags are set, false otherwise. */ static inline bool flow_rule_has_control_flags(const u32 ctrl_flags, struct netlink_ext_ack *extack) { return !flow_rule_is_supp_control_flags(0, ctrl_flags, extack); } /** * flow_rule_has_enc_control_flags() - check for presence of any control flags * @enc_ctrl_flags: encapsulation control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if control flags are set, false otherwise. */ static inline bool flow_rule_has_enc_control_flags(const u32 enc_ctrl_flags, struct netlink_ext_ack *extack) { return !flow_rule_is_supp_enc_control_flags(0, enc_ctrl_flags, extack); } /** * flow_rule_match_has_control_flags() - match and check for any control flags * @rule: The flow_rule under evaluation. * @extack: The netlink extended ACK for reporting errors. * * Return: true if control flags are set, false otherwise. 
*/ static inline bool flow_rule_match_has_control_flags(const struct flow_rule *rule, struct netlink_ext_ack *extack) { struct flow_match_control match; if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) return false; flow_rule_match_control(rule, &match); return flow_rule_has_control_flags(match.mask->flags, extack); } struct flow_stats { u64 pkts; u64 bytes; u64 drops; u64 lastused; enum flow_action_hw_stats used_hw_stats; bool used_hw_stats_valid; }; static inline void flow_stats_update(struct flow_stats *flow_stats, u64 bytes, u64 pkts, u64 drops, u64 lastused, enum flow_action_hw_stats used_hw_stats) { flow_stats->pkts += pkts; flow_stats->bytes += bytes; flow_stats->drops += drops; flow_stats->lastused = max_t(u64, flow_stats->lastused, lastused); /* The driver should pass value with a maximum of one bit set. * Passing FLOW_ACTION_HW_STATS_ANY is invalid. */ WARN_ON(used_hw_stats == FLOW_ACTION_HW_STATS_ANY); flow_stats->used_hw_stats |= used_hw_stats; flow_stats->used_hw_stats_valid = true; } enum flow_block_command { FLOW_BLOCK_BIND, FLOW_BLOCK_UNBIND, }; enum flow_block_binder_type { FLOW_BLOCK_BINDER_TYPE_UNSPEC, FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS, FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, FLOW_BLOCK_BINDER_TYPE_RED_MARK, }; struct flow_block { struct list_head cb_list; }; struct netlink_ext_ack; struct flow_block_offload { enum flow_block_command command; enum flow_block_binder_type binder_type; bool block_shared; bool unlocked_driver_cb; struct net *net; struct flow_block *block; struct list_head cb_list; struct list_head *driver_block_list; struct netlink_ext_ack *extack; struct Qdisc *sch; struct list_head *cb_list_head; }; enum tc_setup_type; typedef int flow_setup_cb_t(enum tc_setup_type type, void *type_data, void *cb_priv); struct flow_block_cb; struct flow_block_indr { struct list_head list; struct net_device *dev; struct Qdisc *sch; enum flow_block_binder_type binder_type; void *data; void *cb_priv; 
void (*cleanup)(struct flow_block_cb *block_cb);
};

/*
 * A registered block callback. It is linked through two list nodes
 * (driver_list and list) and embeds the indirect-block state in @indr.
 */
struct flow_block_cb {
	struct list_head	driver_list;
	struct list_head	list;
	flow_setup_cb_t		*cb;
	void			*cb_ident;
	void			*cb_priv;
	void			(*release)(void *cb_priv);
	struct flow_block_indr	indr;
	unsigned int		refcnt;
};

/* Allocation, lookup and refcount helpers; implemented outside this header. */
struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb,
					  void *cb_ident, void *cb_priv,
					  void (*release)(void *cb_priv));
struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb,
					       void *cb_ident, void *cb_priv,
					       void (*release)(void *cb_priv),
					       struct flow_block_offload *bo,
					       struct net_device *dev,
					       struct Qdisc *sch, void *data,
					       void *indr_cb_priv,
					       void (*cleanup)(struct flow_block_cb *block_cb));
void flow_block_cb_free(struct flow_block_cb *block_cb);

struct flow_block_cb *flow_block_cb_lookup(struct flow_block *block,
					   flow_setup_cb_t *cb, void *cb_ident);

void *flow_block_cb_priv(struct flow_block_cb *block_cb);
void flow_block_cb_incref(struct flow_block_cb *block_cb);
unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb);

/* Append @block_cb (via its list node) to the tail of @offload->cb_list. */
static inline void flow_block_cb_add(struct flow_block_cb *block_cb,
				     struct flow_block_offload *offload)
{
	list_add_tail(&block_cb->list, &offload->cb_list);
}

/* Move @block_cb from whatever list it is on to @offload->cb_list. */
static inline void flow_block_cb_remove(struct flow_block_cb *block_cb,
					struct flow_block_offload *offload)
{
	list_move(&block_cb->list, &offload->cb_list);
}

/* Like flow_block_cb_remove(), but first unlinks the indirect list node. */
static inline void flow_indr_block_cb_remove(struct flow_block_cb *block_cb,
					     struct flow_block_offload *offload)
{
	list_del(&block_cb->indr.list);
	list_move(&block_cb->list, &offload->cb_list);
}

bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident,
			   struct list_head *driver_block_list);

int flow_block_cb_setup_simple(struct flow_block_offload *f,
			       struct list_head *driver_list,
			       flow_setup_cb_t *cb,
			       void *cb_ident, void *cb_priv, bool ingress_only);

/* Commands carried by struct flow_cls_offload. */
enum flow_cls_command {
	FLOW_CLS_REPLACE,
	FLOW_CLS_DESTROY,
	FLOW_CLS_STATS,
	FLOW_CLS_TMPLT_CREATE,
	FLOW_CLS_TMPLT_DESTROY,
};

/* Fields common to classifier offload requests. */
struct flow_cls_common_offload {
	u32 chain_index;
	__be16
protocol;
	u32 prio;
	bool skip_sw;
	struct netlink_ext_ack *extack;
};

/* A classifier offload request: common part, command, rule and stats. */
struct flow_cls_offload {
	struct flow_cls_common_offload common;
	enum flow_cls_command command;
	bool use_act_stats;
	unsigned long cookie;
	struct flow_rule *rule;
	struct flow_stats stats;
	u32 classid;
};

/* Commands carried by struct flow_offload_action. */
enum offload_act_command  {
	FLOW_ACT_REPLACE,
	FLOW_ACT_DESTROY,
	FLOW_ACT_STATS,
};

/*
 * An action offload request. @action ends in a flexible array and is
 * therefore the last member.
 */
struct flow_offload_action {
	struct netlink_ext_ack *extack; /* NULL in FLOW_ACT_STATS process */
	enum offload_act_command  command;
	enum flow_action_id id;
	u32 index;
	unsigned long cookie;
	struct flow_stats stats;
	struct flow_action action;
};

struct flow_offload_action *offload_action_alloc(unsigned int num_actions);

/* Return the rule pointer stored in @flow_cmd. */
static inline struct flow_rule *
flow_cls_offload_flow_rule(const struct flow_cls_offload *flow_cmd)
{
	return flow_cmd->rule;
}

/* Initialise @flow_block's callback list as an empty list. */
static inline void flow_block_init(struct flow_block *flow_block)
{
	INIT_LIST_HEAD(&flow_block->cb_list);
}

/* Signature of callbacks registered with flow_indr_dev_register(). */
typedef int flow_indr_block_bind_cb_t(struct net_device *dev, struct Qdisc *sch, void *cb_priv,
				      enum tc_setup_type type, void *type_data,
				      void *data,
				      void (*cleanup)(struct flow_block_cb *block_cb));

int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv);
void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv,
			      void (*release)(void *cb_priv));
int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch,
				enum tc_setup_type type, void *data,
				struct flow_block_offload *bo,
				void (*cleanup)(struct flow_block_cb *block_cb));
bool flow_indr_dev_exists(void);

#endif /* _NET_FLOW_OFFLOAD_H */
439 89 7 89 89 54 20 22 21 51 42 11 110 3 3 16 16 19 8 10 10 10 10 10 10 10 10 10 39 38 1 34 25 25 79 2 10 13 6 23 8 34 4 175 175 173 173 173 33 2 137 137 117 3 125 110 125 1 35 15 10 4 3 7 61 23 14 4 20 20 31 9 83 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 
474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 
974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 
1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406
/* SPDX-License-Identifier: GPL-2.0 */
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#ifndef __MPTCP_PROTOCOL_H
#define __MPTCP_PROTOCOL_H

#include <linux/random.h>
#include <net/tcp.h>
#include <net/inet_connection_sock.h>
#include <uapi/linux/mptcp.h>
#include <net/genetlink.h>
#include <net/rstreason.h>

#define MPTCP_SUPPORTED_VERSION	1

/* MPTCP option bits */
#define OPTION_MPTCP_MPC_SYN	BIT(0)
#define OPTION_MPTCP_MPC_SYNACK	BIT(1)
#define OPTION_MPTCP_MPC_ACK	BIT(2)
#define OPTION_MPTCP_MPJ_SYN	BIT(3)
#define OPTION_MPTCP_MPJ_SYNACK	BIT(4)
#define OPTION_MPTCP_MPJ_ACK	BIT(5)
#define OPTION_MPTCP_ADD_ADDR	BIT(6)
#define OPTION_MPTCP_RM_ADDR	BIT(7)
#define OPTION_MPTCP_FASTCLOSE	BIT(8)
#define OPTION_MPTCP_PRIO	BIT(9)
#define OPTION_MPTCP_RST	BIT(10)
#define OPTION_MPTCP_DSS	BIT(11)
#define OPTION_MPTCP_FAIL	BIT(12)

#define OPTION_MPTCP_CSUMREQD	BIT(13)

/* Masks grouping the three MPC bits and the three MPJ bits defined above. */
#define OPTIONS_MPTCP_MPC	(OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | \
				 OPTION_MPTCP_MPC_ACK)
#define OPTIONS_MPTCP_MPJ	(OPTION_MPTCP_MPJ_SYN | OPTION_MPTCP_MPJ_SYNACK | \
				 OPTION_MPTCP_MPJ_ACK)

/* MPTCP option subtypes */
#define MPTCPOPT_MP_CAPABLE	0
#define MPTCPOPT_MP_JOIN	1
#define MPTCPOPT_DSS		2
#define MPTCPOPT_ADD_ADDR	3
#define MPTCPOPT_RM_ADDR	4
#define MPTCPOPT_MP_PRIO	5
#define MPTCPOPT_MP_FAIL	6
#define MPTCPOPT_MP_FASTCLOSE	7
#define MPTCPOPT_RST		8

/* MPTCP suboption lengths */
#define TCPOLEN_MPTCP_MPC_SYN		4
#define TCPOLEN_MPTCP_MPC_SYNACK	12
#define TCPOLEN_MPTCP_MPC_ACK		20
#define TCPOLEN_MPTCP_MPC_ACK_DATA	22
#define TCPOLEN_MPTCP_MPJ_SYN		12
#define TCPOLEN_MPTCP_MPJ_SYNACK	16
#define TCPOLEN_MPTCP_MPJ_ACK		24
#define TCPOLEN_MPTCP_DSS_BASE		4
#define TCPOLEN_MPTCP_DSS_ACK32		4
#define TCPOLEN_MPTCP_DSS_ACK64		8
#define TCPOLEN_MPTCP_DSS_MAP32		10
#define TCPOLEN_MPTCP_DSS_MAP64		14
#define TCPOLEN_MPTCP_DSS_CHECKSUM	2
#define
TCPOLEN_MPTCP_ADD_ADDR		16
#define TCPOLEN_MPTCP_ADD_ADDR_PORT	18
#define TCPOLEN_MPTCP_ADD_ADDR_BASE	8
#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT	10
#define TCPOLEN_MPTCP_ADD_ADDR6		28
#define TCPOLEN_MPTCP_ADD_ADDR6_PORT	30
#define TCPOLEN_MPTCP_ADD_ADDR6_BASE	20
#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT	22
#define TCPOLEN_MPTCP_PORT_LEN		2
#define TCPOLEN_MPTCP_PORT_ALIGN	2
#define TCPOLEN_MPTCP_RM_ADDR_BASE	3
#define TCPOLEN_MPTCP_PRIO		3
#define TCPOLEN_MPTCP_PRIO_ALIGN	4
#define TCPOLEN_MPTCP_FASTCLOSE		12
#define TCPOLEN_MPTCP_RST		4
#define TCPOLEN_MPTCP_FAIL		12

/* MPC ACK-with-data length plus the DSS checksum length (22 + 2). */
#define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM	(TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA)

/* MPTCP MP_JOIN flags */
#define MPTCPOPT_BACKUP		BIT(0)
#define MPTCPOPT_THMAC_LEN	8

/* MPTCP MP_CAPABLE flags */
#define MPTCP_VERSION_MASK	(0x0F)
#define MPTCP_CAP_CHECKSUM_REQD	BIT(7)
#define MPTCP_CAP_EXTENSIBILITY	BIT(6)
#define MPTCP_CAP_DENY_JOIN_ID0	BIT(5)
#define MPTCP_CAP_HMAC_SHA256	BIT(0)
#define MPTCP_CAP_FLAG_MASK	(0x1F)

/* MPTCP DSS flags */
#define MPTCP_DSS_DATA_FIN	BIT(4)
#define MPTCP_DSS_DSN64		BIT(3)
#define MPTCP_DSS_HAS_MAP	BIT(2)
#define MPTCP_DSS_ACK64		BIT(1)
#define MPTCP_DSS_HAS_ACK	BIT(0)
#define MPTCP_DSS_FLAG_MASK	(0x1F)

/* MPTCP ADD_ADDR flags */
#define MPTCP_ADDR_ECHO		BIT(0)

/* MPTCP MP_PRIO flags */
#define MPTCP_PRIO_BKUP		BIT(0)

/* MPTCP TCPRST flags */
#define MPTCP_RST_TRANSIENT	BIT(0)

/* MPTCP socket atomic flags */
#define MPTCP_WORK_RTX		1
#define MPTCP_FALLBACK_DONE	2
#define MPTCP_WORK_CLOSE_SUBFLOW 3

/* MPTCP socket release cb flags */
#define MPTCP_PUSH_PENDING	1
#define MPTCP_CLEAN_UNA		2
#define MPTCP_ERROR_REPORT	3
#define MPTCP_RETRANSMIT	4
#define MPTCP_FLUSH_JOIN_LIST	5
#define MPTCP_SYNC_STATE	6
#define MPTCP_SYNC_SNDBUF	7

/* Per-skb MPTCP state; accessed through MPTCP_SKB_CB(), which overlays this
 * struct on the start of the skb's cb[] array.
 */
struct mptcp_skb_cb {
	u64 map_seq;
	u64 end_seq;
	u32 offset;
	u8  has_rxtstamp;
	u8  cant_coalesce;
};

#define MPTCP_SKB_CB(__skb)	((struct mptcp_skb_cb *)&((__skb)->cb[0]))

/* 64-bit sequence comparison: true when the difference seq1 - seq2,
 * reinterpreted as a signed 64-bit value, is negative.
 */
static inline bool before64(__u64 seq1, __u64 seq2)
{
	return (__s64)(seq1 -
seq2) < 0;
}

/* after64(a, b) is before64(b, a): the arguments are simply swapped. */
#define after64(seq2, seq1)	before64(seq1, seq2)

/* Values parsed from received MPTCP options. The "status" group bundles the
 * suboptions mask with the single-bit/4-bit status fields so they can be
 * handled as one unit.
 */
struct mptcp_options_received {
	u64	sndr_key;
	u64	rcvr_key;
	u64	data_ack;
	u64	data_seq;
	u32	subflow_seq;
	u16	data_len;
	__sum16	csum;
	struct_group(status,
		u16 suboptions;
		u16 use_map:1,
		    dsn64:1,
		    data_fin:1,
		    use_ack:1,
		    ack64:1,
		    mpc_map:1,
		    reset_reason:4,
		    reset_transient:1,
		    echo:1,
		    backup:1,
		    deny_join_id0:1,
		    __unused:2;
	);
	u8	join_id;
	u32	token;
	u32	nonce;
	u64	thmac;
	u8	hmac[MPTCPOPT_HMAC_LEN];
	struct mptcp_addr_info addr;
	struct mptcp_rm_list rm_list;
	u64	ahmac;
	u64	fail_seq;
};

/* Build the first 32-bit word of an MPTCP option in network byte order:
 * TCPOPT_MPTCP in bits 31..24, @len in bits 23..16, @subopt starting at
 * bit 12, the low nibble of @nib in bits 11..8 and @field in bits 7..0.
 */
static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
{
	return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) |
		     ((nib & 0xF) << 8) | field);
}

enum mptcp_pm_status {
	MPTCP_PM_ADD_ADDR_RECEIVED,
	MPTCP_PM_ADD_ADDR_SEND_ACK,
	MPTCP_PM_RM_ADDR_RECEIVED,
	MPTCP_PM_ESTABLISHED,
	MPTCP_PM_SUBFLOW_ESTABLISHED,
	MPTCP_PM_ALREADY_ESTABLISHED,	/* persistent status, set after ESTABLISHED event */
	MPTCP_PM_MPC_ENDPOINT_ACCOUNTED /* persistent status, set after MPC local address is
					 * accounted into id_avail_bitmap
					 */
};

enum mptcp_pm_type {
	MPTCP_PM_TYPE_KERNEL = 0,
	MPTCP_PM_TYPE_USERSPACE,

	__MPTCP_PM_TYPE_NR,
	__MPTCP_PM_TYPE_MAX = __MPTCP_PM_TYPE_NR - 1,
};

/* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */
#define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1)

enum mptcp_addr_signal_status {
	MPTCP_ADD_ADDR_SIGNAL,
	MPTCP_ADD_ADDR_ECHO,
	MPTCP_RM_ADDR_SIGNAL,
};

/* max value of mptcp_addr_info.id */
#define MPTCP_PM_MAX_ADDR_ID		U8_MAX

/* Path-manager state embedded in struct mptcp_sock. The "reset" group
 * bundles the scalar state fields so they can be addressed together.
 */
struct mptcp_pm_data {
	struct mptcp_addr_info local;
	struct mptcp_addr_info remote;
	struct list_head anno_list;
	struct list_head userspace_pm_local_addr_list;

	spinlock_t	lock;		/* protects the whole PM data */

	struct_group(reset,

	u8		addr_signal;
	bool		server_side;
	bool		work_pending;
	bool		accept_addr;
	bool		accept_subflow;
	bool		remote_deny_join_id0;
	u8		add_addr_signaled;
	u8		add_addr_accepted;
	u8		local_addr_used;
	u8		pm_type;
	u8		extra_subflows;
	u8		status;

	);
DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);	/* one bit per possible address id */
	struct mptcp_rm_list rm_list_tx;
	struct mptcp_rm_list rm_list_rx;
};

/* Local address description: address info, flags and interface index. */
struct mptcp_pm_local {
	struct mptcp_addr_info	addr;
	u32			flags;
	int			ifindex;
};

/* List-linked address entry; same payload as mptcp_pm_local plus a list node
 * and a socket pointer.
 */
struct mptcp_pm_addr_entry {
	struct list_head	list;
	struct mptcp_addr_info	addr;
	u32			flags;
	int			ifindex;
	struct socket		*lsk;
};

/* One fragment of data queued on a list (see mptcp_sock.rtx_queue and
 * mptcp_sock.first_pending).
 */
struct mptcp_data_frag {
	struct list_head list;
	u64 data_seq;
	u16 data_len;
	u16 offset;
	u8 overhead;
	u8 eor; /* currently using 1 bit */
	u16 already_sent;
	struct page *page;
};

/* Arbitrary compromise between as low as possible to react timely to subflow
 * close event and as big as possible to avoid being fouled by biased large
 * samples due to peer sending data on a different subflow WRT to the incoming
 * ack.
 */
#define MPTCP_RTT_SAMPLES	5

/* MPTCP connection sock */
struct mptcp_sock {
	/* inet_connection_sock must be the first member */
	struct inet_connection_sock sk;
	u64		local_key;		/* protected by the first subflow socket lock
						 * lockless access read
						 */
	u64		remote_key;		/* same as above */
	u64		write_seq;
	u64		bytes_sent;
	u64		snd_nxt;
	u64		bytes_received;
	u64		ack_seq;
	atomic64_t	rcv_wnd_sent;
	u64		rcv_data_fin_seq;
	u64		bytes_retrans;
	u64		bytes_consumed;
	int		snd_burst;
	int		old_wspace;
	u64		recovery_snd_nxt;	/* in recovery mode accept up to this seq;
						 * recovery related fields are under data_lock
						 * protection
						 */
	u64		bytes_acked;
	u64		snd_una;
	u64		wnd_end;
	u32		last_data_sent;
	u32		last_data_recv;
	u32		last_ack_recv;
	unsigned long	timer_ival;
	u32		token;
	unsigned long	flags;
	unsigned long	cb_flags;
	bool		recovery;		/* closing subflow write queue reinjected */
	bool		can_ack;
	bool		fully_established;
	bool		rcv_data_fin;
	bool		snd_data_fin_enable;
	bool		rcv_fastclose;
	bool		use_64bit_ack; /* Set when we received a 64-bit DSN */
	bool		csum_enabled;
	bool		allow_infinite_fallback;
	u8		pending_state; /* A subflow asked to set this sk_state,
					* protected by the msk data lock
					*/
	u8		mpc_endpoint_id;
	u8		recvmsg_inq:1,
			cork:1,
			nodelay:1,
			fastopening:1,
			in_accept_queue:1,
free_first:1,
			rcvspace_init:1,
			fastclosing:1;
	u32		notsent_lowat;
	int		keepalive_cnt;
	int		keepalive_idle;
	int		keepalive_intvl;
	int		maxseg;
	struct work_struct work;
	struct sk_buff  *ooo_last_skb;
	struct rb_root  out_of_order_queue;
	struct list_head conn_list;	/* walked by mptcp_for_each_subflow() */
	struct list_head rtx_queue;
	struct mptcp_data_frag *first_pending;
	struct list_head join_list;
	struct sock	*first; /* The mptcp ops can safely dereference, using suitable
				 * ONCE annotation, the subflow outside the socket
				 * lock as such sock is freed after close().
				 */
	struct mptcp_pm_data	pm;
	struct mptcp_sched_ops	*sched;

	/* Most recent rtt_us observed by in use incoming subflows. */
	struct {
		u32	samples[MPTCP_RTT_SAMPLES];
		u32	next_sample;
	} rcv_rtt_est;

	struct {
		int	space;	/* bytes copied in last measurement window */
		int	copied; /* bytes copied in this measurement window */
		u64	time;	/* start time of measurement window */
	} rcvq_space;
	u8		scaling_ratio;
	bool		allow_subflows;

	u32		subflow_id;
	u32		setsockopt_seq;
	char		ca_name[TCP_CA_NAME_MAX];

	spinlock_t	fallback_lock;	/* protects fallback,
					 * allow_infinite_fallback and
					 * allow_join
					 * NOTE(review): no allow_join member is
					 * visible in this struct; allow_subflows
					 * may be what is meant - confirm.
					 */

	struct list_head backlog_list;	/* protected by the data lock */
	u32		backlog_len;
	u32		backlog_unaccounted;
};

/* The msk "data lock" is the socket's sk_lock.slock, taken with BHs disabled. */
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)

/* Iterate over the subflow contexts linked on msk->conn_list via "node". */
#define mptcp_for_each_subflow(__msk, __subflow)			\
	list_for_each_entry(__subflow, &((__msk)->conn_list), node)
/* Same, using the _safe list iterator with @__tmp as lookahead cursor. */
#define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp)			\
	list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node)
/* Next subflow after @__subflow on conn_list, using the circular helper. */
#define mptcp_next_subflow(__msk, __subflow)				\
	list_next_entry_circular(__subflow, &((__msk)->conn_list), node)

extern struct genl_family mptcp_genl_family;

/* Forward to sock_owned_by_me() after casting the msk to a struct sock. */
static inline void msk_owned_by_me(const struct mptcp_sock *msk)
{
	sock_owned_by_me((const struct sock *)msk);
}

#ifdef CONFIG_DEBUG_NET
/* MPTCP-specific: we might (indirectly) call this helper with the wrong sk */
#undef tcp_sk
#define tcp_sk(ptr) ({								\
typeof(ptr) _ptr = (ptr); \ WARN_ON(_ptr->sk_protocol != IPPROTO_TCP); \ container_of_const(_ptr, struct tcp_sock, inet_conn.icsk_inet.sk); \ }) #define mptcp_sk(ptr) ({ \ typeof(ptr) _ptr = (ptr); \ WARN_ON(_ptr->sk_protocol != IPPROTO_MPTCP); \ container_of_const(_ptr, struct mptcp_sock, sk.icsk_inet.sk); \ }) #else /* !CONFIG_DEBUG_NET */ #define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) #endif static inline int mptcp_win_from_space(const struct sock *sk, int space) { return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); } static inline int mptcp_space_from_win(const struct sock *sk, int win) { return __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, win); } static inline int __mptcp_space(const struct sock *sk) { return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - READ_ONCE(mptcp_sk(sk)->backlog_len) - sk_rmem_alloc_get(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); return msk->first_pending; } static inline void mptcp_init_rtt_est(struct mptcp_sock *msk) { int i; for (i = 0; i < MPTCP_RTT_SAMPLES; ++i) msk->rcv_rtt_est.samples[i] = U32_MAX; msk->rcv_rtt_est.next_sample = 0; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; } static inline u32 mptcp_rtt_us_est(const struct mptcp_sock *msk) { u32 rtt_us = READ_ONCE(msk->rcv_rtt_est.samples[0]); int i; /* Lockless access of collected samples. */ for (i = 1; i < MPTCP_RTT_SAMPLES; ++i) rtt_us = min(rtt_us, READ_ONCE(msk->rcv_rtt_est.samples[i])); return rtt_us; } static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *cur; cur = msk->first_pending; return list_is_last(&cur->list, &msk->rtx_queue) ? 
NULL : list_next_entry(cur, list); } static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); if (!msk->first_pending) return NULL; if (WARN_ON_ONCE(list_empty(&msk->rtx_queue))) return NULL; return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); } static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (msk->snd_una == msk->snd_nxt) return NULL; return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } struct csum_pseudo_header { __be64 data_seq; __be32 subflow_seq; __be16 data_len; __sum16 csum; }; struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, mp_join : 1, backup : 1, request_bkup : 1, csum_reqd : 1, allow_join_id0 : 1; u8 local_id; u8 remote_id; u64 local_key; u64 idsn; u32 token; u32 ssn_offset; u64 thmac; u32 local_nonce; u32 remote_nonce; struct mptcp_sock *msk; struct hlist_nulls_node token_node; }; static inline struct mptcp_subflow_request_sock * mptcp_subflow_rsk(const struct request_sock *rsk) { return (struct mptcp_subflow_request_sock *)rsk; } struct mptcp_delegated_action { struct napi_struct napi; local_lock_t bh_lock; struct list_head head; }; DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); #define MPTCP_DELEGATE_SCHEDULED 0 #define MPTCP_DELEGATE_SEND 1 #define MPTCP_DELEGATE_ACK 2 #define MPTCP_DELEGATE_SNDBUF 3 #define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED)) /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ struct_group(reset, unsigned long avg_pacing_rate; /* protected by msk socket lock */ u64 local_key; u64 remote_key; u64 idsn; u64 map_seq; u64 rcv_wnd_sent; u32 snd_isn; u32 token; u32 rel_write_seq; u32 map_subflow_seq; u32 ssn_offset; u32 map_data_len; __wsum map_data_csum; u32 map_csum_len; u32 prev_rtt_seq; u32 request_mptcp : 1, /* 
send MP_CAPABLE */ request_join : 1, /* send MP_JOIN */ request_bkup : 1, mp_capable : 1, /* remote is MPTCP capable */ mp_join : 1, /* remote is JOINing */ pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, map_csum_reqd : 1, map_data_fin : 1, mpc_map : 1, backup : 1, send_mp_prio : 1, send_mp_fail : 1, send_fastclose : 1, send_infinite_map : 1, remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ closing : 1, /* must not pass rx data to msk anymore */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ close_event_done : 1, /* has done the post-closed part */ mpc_drop : 1, /* the MPC option has been dropped in a rtx */ __unused : 8; bool data_avail; bool scheduled; bool pm_listener; /* a listener managed by the kernel PM? */ bool fully_established; /* path validated */ u32 lent_mem_frag; u32 remote_nonce; u64 thmac; u32 local_nonce; u32 remote_token; union { u8 hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */ u64 iasn; /* initial ack sequence number, MPC subflows only */ }; s16 local_id; /* if negative not initialized yet */ u8 remote_id; u8 reset_seen:1; u8 reset_transient:1; u8 reset_reason:4; u8 stale_count; u32 subflow_id; long delegated_status; unsigned long fail_tout; ); struct list_head delegated_node; /* link into delegated_action, protected by local BH */ u32 setsockopt_seq; u32 stale_rcv_tstamp; int cached_sndbuf; /* sndbuf size when last synced with the msk sndbuf, * protected by the msk socket lock */ struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; void (*tcp_state_change)(struct sock *sk); void (*tcp_error_report)(struct sock *sk); struct rcu_head rcu; }; static inline struct mptcp_subflow_context * mptcp_subflow_ctx(const struct sock *sk) { const 
struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code */ return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; } static inline struct sock * mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) { return subflow->tcp_sock; } static inline void mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow) { memset(&subflow->reset, 0, sizeof(subflow->reset)); subflow->request_mptcp = 1; WRITE_ONCE(subflow->local_id, -1); } /* Convert reset reasons in MPTCP to enum sk_rst_reason type */ static inline enum sk_rst_reason sk_rst_convert_mptcp_reason(u32 reason) { switch (reason) { case MPTCP_RST_EUNSPEC: return SK_RST_REASON_MPTCP_RST_EUNSPEC; case MPTCP_RST_EMPTCP: return SK_RST_REASON_MPTCP_RST_EMPTCP; case MPTCP_RST_ERESOURCE: return SK_RST_REASON_MPTCP_RST_ERESOURCE; case MPTCP_RST_EPROHIBIT: return SK_RST_REASON_MPTCP_RST_EPROHIBIT; case MPTCP_RST_EWQ2BIG: return SK_RST_REASON_MPTCP_RST_EWQ2BIG; case MPTCP_RST_EBADPERF: return SK_RST_REASON_MPTCP_RST_EBADPERF; case MPTCP_RST_EMIDDLEBOX: return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX; default: /* It should not happen, or else errors may occur * in MPTCP layer */ return SK_RST_REASON_ERROR; } } static inline void mptcp_send_active_reset_reason(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); enum sk_rst_reason reason; reason = sk_rst_convert_mptcp_reason(subflow->reset_reason); tcp_send_active_reset(sk, GFP_ATOMIC, reason); } /* Made the fwd mem carried by the given skb available to the msk, * To be paired with a previous mptcp_subflow_lend_fwdmem() before freeing * the skb or setting the skb ownership. */ static inline void mptcp_borrow_fwdmem(struct sock *sk, struct sk_buff *skb) { struct sock *ssk = skb->sk; /* The subflow just lend the skb fwd memory; if the subflow meanwhile * closed, mptcp_close_ssk() already released the ssk rcv memory. 
*/ DEBUG_NET_WARN_ON_ONCE(skb->destructor); sk_forward_alloc_add(sk, skb->truesize); if (!ssk) return; atomic_sub(skb->truesize, &ssk->sk_rmem_alloc); skb->sk = NULL; } static inline void __mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, int size) { int frag = (subflow->lent_mem_frag + size) & (PAGE_SIZE - 1); subflow->lent_mem_frag = frag; } static inline void mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, struct sk_buff *skb) { __mptcp_subflow_lend_fwdmem(subflow, skb->truesize); skb->destructor = NULL; } static inline u64 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) { return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq - subflow->ssn_offset - subflow->map_subflow_seq; } static inline u64 mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) { return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); } void mptcp_subflow_process_delegated(struct sock *ssk, long actions); static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action) { long old, set_bits = BIT(MPTCP_DELEGATE_SCHEDULED) | BIT(action); struct mptcp_delegated_action *delegated; bool schedule; /* the caller held the subflow bh socket lock */ lockdep_assert_in_softirq(); /* The implied barrier pairs with tcp_release_cb_override() * mptcp_napi_poll(), and ensures the below list check sees list * updates done prior to delegated status bits changes */ old = set_mask_bits(&subflow->delegated_status, 0, set_bits); if (!(old & BIT(MPTCP_DELEGATE_SCHEDULED))) { if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node))) return; local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); delegated = this_cpu_ptr(&mptcp_delegated_actions); schedule = list_empty(&delegated->head); list_add_tail(&subflow->delegated_node, &delegated->head); local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); sock_hold(mptcp_subflow_tcp_sock(subflow)); if (schedule) napi_schedule(&delegated->napi); } } 
/* Detach and return the first subflow context queued on @delegated->head,
 * or NULL when that list is empty. The list is inspected and modified with
 * mptcp_delegated_actions.bh_lock taken via local_lock_nested_bh().
 */
static inline struct mptcp_subflow_context *
mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated)
{
	struct mptcp_subflow_context *ret;

	local_lock_nested_bh(&mptcp_delegated_actions.bh_lock);
	if (list_empty(&delegated->head)) {
		local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock);
		return NULL;
	}

	ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node);
	/* re-initialize the node so that a later list_empty() check on it succeeds */
	list_del_init(&ret->delegated_node);
	local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock);
	return ret;
}

void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp);
void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk);

int mptcp_is_enabled(const struct net *net);
unsigned int mptcp_get_add_addr_timeout(const struct net *net);
int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
unsigned int mptcp_close_timeout(const struct sock *sk);
int mptcp_get_pm_type(const struct net *net);
const char *mptcp_get_path_manager(const struct net *net);
const char *mptcp_get_scheduler(const struct net *net);

void mptcp_active_disable(struct sock *sk);
bool mptcp_active_should_disable(struct sock *ssk);
void mptcp_active_enable(struct sock *sk);

void mptcp_get_available_schedulers(char *buf, size_t maxlen);
void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
				       struct mptcp_subflow_context *subflow,
				       const struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
void mptcp_check_and_set_pending(struct sock *sk);
void __mptcp_push_pending(struct sock *sk, unsigned int flags);
bool mptcp_subflow_data_available(struct sock *sk);
void __init mptcp_subflow_init(void);
void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how);
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
		     struct mptcp_subflow_context *subflow);
void __mptcp_subflow_send_ack(struct sock *ssk);
void mptcp_subflow_reset(struct sock *ssk);
void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk);
void mptcp_sock_graft(struct sock *sk, struct socket *parent);
struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk);
bool __mptcp_close(struct sock *sk, long timeout);
void mptcp_cancel_work(struct sock *sk);
void __mptcp_unaccepted_force_close(struct sock *sk);
void mptcp_set_state(struct sock *sk, int state);

bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
			   const struct mptcp_addr_info *b, bool use_port);
void mptcp_local_address(const struct sock_common *skc,
			 struct mptcp_addr_info *addr);
void mptcp_remote_address(const struct sock_common *skc,
			  struct mptcp_addr_info *addr);

/* called with sk socket lock held */
int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local,
			    const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
				struct socket **new_sock);
void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
			 struct sockaddr_storage *addr,
			 unsigned short family);
struct mptcp_sched_ops *mptcp_sched_find(const char *name);
int mptcp_validate_scheduler(struct mptcp_sched_ops *sched);
int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
void mptcp_sched_init(void);
int mptcp_init_sched(struct mptcp_sock *msk,
		     struct mptcp_sched_ops *sched);
void mptcp_release_sched(struct mptcp_sock *msk);
void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
				 bool scheduled);
struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk);
struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
int mptcp_sched_get_send(struct mptcp_sock *msk);
int mptcp_sched_get_retrans(struct mptcp_sock *msk);

/* Difference between the msk bytes_received and bytes_consumed counters,
 * both sampled with READ_ONCE().
 */
static inline u64 mptcp_data_avail(const struct mptcp_sock *msk)
{
	return READ_ONCE(msk->bytes_received) - READ_ONCE(msk->bytes_consumed);
}

/* Return false when no data is available; otherwise true when the available
 * amount reaches sk->sk_rcvlowat or tcp_under_memory_pressure() reports
 * pressure for @sk.
 */
static inline bool mptcp_epollin_ready(const struct sock *sk)
{
	u64 data_avail = mptcp_data_avail(mptcp_sk(sk));

	if (!data_avail)
		return false;

	/* mptcp doesn't have to deal with small skbs in the receive queue,
	 * as it can always coalesce them
	 */
	return (data_avail >= sk->sk_rcvlowat) ||
		tcp_under_memory_pressure(sk);
}

int mptcp_set_rcvlowat(struct sock *sk, int val);

/* True when the @ssk state is TCP_ESTABLISHED or TCP_CLOSE_WAIT. */
static inline bool __tcp_can_send(const struct sock *ssk)
{
	/* only send if our side has not closed yet */
	return ((1 << inet_sk_state_load(ssk)) &
		(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
}

/* True when @subflow can be used for sending: a subflow that requested a
 * JOIN must also be fully established, and its TCP socket must pass
 * __tcp_can_send().
 */
static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
	/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
	if (subflow->request_join && !READ_ONCE(subflow->fully_established))
		return false;

	return __tcp_can_send(mptcp_subflow_tcp_sock(subflow));
}

void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow);

bool mptcp_subflow_active(struct mptcp_subflow_context *subflow);

void mptcp_subflow_drop_ctx(struct sock *ssk);

/* Point the @sk callbacks at plain socket handlers: sock_def_readable and
 * sk_stream_write_space, plus the state-change, error-report and af_ops
 * pointers stored in @ctx.
 */
static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
					      struct mptcp_subflow_context *ctx)
{
	sk->sk_data_ready = sock_def_readable;
	sk->sk_state_change = ctx->tcp_state_change;
	sk->sk_write_space = sk_stream_write_space;
	sk->sk_error_report = ctx->tcp_error_report;

	inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops;
}

void __init mptcp_proto_init(void);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
int __init mptcp_proto_v6_init(void);
void __init mptcp_subflow_v6_init(void);
#endif

struct sock *mptcp_sk_clone_init(const struct sock *sk,
				 const struct mptcp_options_received *mp_opt,
				 struct sock *ssk,
				 struct request_sock *req);
void mptcp_get_options(const struct sk_buff *skb,
		       struct mptcp_options_received *mp_opt);

void mptcp_finish_connect(struct sock *sk);
void __mptcp_sync_state(struct sock *sk, int state);
void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout);

/* Stop mptcp_tout_timer and clear icsk_mtup.probe_timestamp; does nothing
 * when that timestamp is already zero.
 */
static inline void mptcp_stop_tout_timer(struct sock *sk)
{
	if (!inet_csk(sk)->icsk_mtup.probe_timestamp)
		return;

	sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer);
	inet_csk(sk)->icsk_mtup.probe_timestamp = 0;
}

/* Store @tout in icsk_mtup.probe_timestamp, substituting 1 for a zero @tout. */
static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout)
{
	/* avoid 0 timestamp, as that means no close timeout */
	inet_csk(sk)->icsk_mtup.probe_timestamp = tout ? : 1;
}

/* Record tcp_jiffies32 as the close timeout stamp, then call
 * mptcp_reset_tout_timer() with a zero fail_tout.
 */
static inline void mptcp_start_tout_timer(struct sock *sk)
{
	mptcp_set_close_tout(sk, tcp_jiffies32);
	mptcp_reset_tout_timer(mptcp_sk(sk), 0);
}

/* True when @sk is in TCP_ESTABLISHED state and the msk fully_established
 * flag is set.
 */
static inline bool mptcp_is_fully_established(struct sock *sk)
{
	return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
	       READ_ONCE(mptcp_sk(sk)->fully_established);
}

/* tcp_clock_ns() divided by NSEC_PER_USEC. */
static inline u64 mptcp_stamp(void)
{
	return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
}

void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
bool mptcp_schedule_work(struct sock *sk);
int mptcp_setsockopt(struct sock *sk, int level, int optname,
		     sockptr_t optval, unsigned int optlen);
int mptcp_getsockopt(struct sock *sk, int level, int optname,
		     char __user *optval, int __user *option);

u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq);

/* Return @cur_seq unchanged when @use_64bit is set; otherwise return the
 * result of __mptcp_expand_seq(@old_seq, @cur_seq).
 */
static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit)
{
	if (use_64bit)
		return cur_seq;

	return __mptcp_expand_seq(old_seq, cur_seq);
}

void __mptcp_check_push(struct sock *sk, struct sock *ssk);
void __mptcp_data_acked(struct sock *sk);
void __mptcp_error_report(struct sock *sk);
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);

/* True when snd_data_fin_enable is set and write_seq equals snd_nxt. */
static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
{
	return READ_ONCE(msk->snd_data_fin_enable) &&
	       READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
}

/* Return the msk notsent_lowat value, or the per-netns
 * sysctl_tcp_notsent_lowat when the msk value is zero.
 */
static inline u32 mptcp_notsent_lowat(const struct sock *sk)
{
	struct net *net = sock_net(sk);
	u32 val;

	val = READ_ONCE(mptcp_sk(sk)->notsent_lowat);
	return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
}

/* True while (write_seq - snd_nxt), shifted left by @wake, is below the
 * value returned by mptcp_notsent_lowat().
 */
static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct mptcp_sock *msk = mptcp_sk(sk);
	u32 notsent_bytes;

	notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt);
	return (notsent_bytes << wake) < mptcp_notsent_lowat(sk);
}

/* True when both mptcp_stream_memory_free() and __sk_stream_is_writeable()
 * hold for @sk with the given @wake.
 */
static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake)
{
	return mptcp_stream_memory_free(sk, wake) &&
	       __sk_stream_is_writeable(sk, wake);
}

/* Invoke sk->sk_write_space when mptcp_stream_memory_free(sk, 1) holds. */
static inline void mptcp_write_space(struct sock *sk)
{
	/* pairs with memory barrier in mptcp_poll */
	smp_mb();
	if (mptcp_stream_memory_free(sk, 1))
		INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
}

/* Recompute sk->sk_sndbuf as sysctl_tcp_wmem[0] plus the sk_sndbuf of every
 * subflow on the msk conn_list, recording each subflow value in
 * cached_sndbuf, then call mptcp_write_space(). Nothing is done when
 * SOCK_SNDBUF_LOCK is set in sk->sk_userlocks.
 */
static inline void __mptcp_sync_sndbuf(struct sock *sk)
{
	struct mptcp_subflow_context *subflow;
	int ssk_sndbuf, new_sndbuf;

	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
		return;

	new_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]);
	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
		ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf);

		subflow->cached_sndbuf = ssk_sndbuf;
		new_sndbuf += ssk_sndbuf;
	}

	/* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */
	WRITE_ONCE(sk->sk_sndbuf, new_sndbuf);
	mptcp_write_space(sk);
}

/* The caller held both the msk socket and the subflow socket locks,
 * possibly under BH
 */
static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);

	/* resync only when the subflow sndbuf differs from the cached value */
	if (READ_ONCE(ssk->sk_sndbuf) != subflow->cached_sndbuf)
		__mptcp_sync_sndbuf(sk);
}

/* the caller held only the subflow socket lock, either in process or
 * BH context. Additionally this can be called under the msk data lock,
 * so we can't acquire such lock here: let the delegate action acquire
 * the needed locks in suitable order.
 */
static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);

	if (likely(READ_ONCE(ssk->sk_sndbuf) == subflow->cached_sndbuf))
		return;

	local_bh_disable();
	mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF);
	local_bh_enable();
}

#define MPTCP_TOKEN_MAX_RETRIES	4

void __init mptcp_token_init(void);

/* Clear token_node.pprev of the MPTCP request socket wrapping @req.
 * NOTE(review): presumably this marks the node as not yet hashed - confirm.
 */
static inline void mptcp_token_init_request(struct request_sock *req)
{
	mptcp_subflow_rsk(req)->token_node.pprev = NULL;
}

int mptcp_token_new_request(struct request_sock *req);
void mptcp_token_destroy_request(struct request_sock *req);
int mptcp_token_new_connect(struct sock *ssk);
void mptcp_token_accept(struct mptcp_subflow_request_sock *r,
			struct mptcp_sock *msk);
bool mptcp_token_exists(u32 token);
struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token);
struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
					 long *s_num);
void mptcp_token_destroy(struct mptcp_sock *msk);

void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);

void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum);

void __init mptcp_pm_init(void);
void mptcp_pm_data_init(struct mptcp_sock *msk);
void mptcp_pm_data_reset(struct mptcp_sock *msk);
void mptcp_pm_destroy(struct mptcp_sock *msk);
int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
			struct mptcp_addr_info *addr);
int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info,
			 bool require_family,
			 struct mptcp_pm_addr_entry *entry);
bool mptcp_pm_addr_families_match(const struct sock *sk,
				  const struct mptcp_addr_info *loc,
				  const struct mptcp_addr_info *rem);
void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side);
void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk);
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
void mptcp_pm_connection_closed(struct mptcp_sock *msk);
void mptcp_pm_subflow_established(struct mptcp_sock *msk);
bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk);
void mptcp_pm_subflow_check_next(struct mptcp_sock *msk,
				 const struct mptcp_subflow_context *subflow);
void mptcp_pm_add_addr_received(const struct sock *ssk,
				const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
			      const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk);
void mptcp_pm_send_ack(struct mptcp_sock *msk,
		       struct mptcp_subflow_context *subflow,
		       bool prio, bool backup);
void mptcp_pm_addr_send_ack(struct mptcp_sock *msk);
void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id);
void mptcp_pm_rm_subflow(struct mptcp_sock *msk,
			 const struct mptcp_rm_list *rm_list);
void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
			       const struct mptcp_rm_list *rm_list);
void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup);
void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq);
int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk,
			      struct mptcp_addr_info *addr,
			      struct mptcp_addr_info *rem,
			      u8 bkup);
bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
			      const struct mptcp_addr_info *addr);
bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk);
struct mptcp_pm_add_entry *
mptcp_pm_del_add_timer(struct mptcp_sock *msk,
		       const struct mptcp_addr_info *addr, bool check_id);
bool mptcp_lookup_subflow_by_saddr(const struct list_head *list,
				   const struct mptcp_addr_info *saddr);
bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk,
				     const struct mptcp_addr_info *addr);
int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local,
			  struct genl_info *info);
int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local,
				 struct genl_info *info);

int mptcp_pm_announce_addr(struct mptcp_sock *msk,
			   const struct mptcp_addr_info *addr,
			   bool echo);
int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk,
				struct mptcp_pm_addr_entry *entry);

/* the default path manager, used in mptcp_pm_unregister */
extern struct mptcp_pm_ops mptcp_pm_kernel;

struct mptcp_pm_ops *mptcp_pm_find(const char *name);
int mptcp_pm_register(struct mptcp_pm_ops *pm_ops);
void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops);
int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops);
void mptcp_pm_get_available(char *buf, size_t maxlen);

void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk);

void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
		 const struct sock *ssk, gfp_t gfp);
void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info);
void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id);
void mptcp_event_pm_listener(const struct sock *ssk,
			     enum mptcp_event_type event);
bool mptcp_userspace_pm_active(const struct mptcp_sock *msk);

void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
					      struct request_sock *req);
int mptcp_pm_genl_fill_addr(struct sk_buff *msg,
			    struct netlink_callback *cb,
			    struct mptcp_pm_addr_entry *entry);

/* True when either the ADD_ADDR signal bit or the ADD_ADDR echo bit is set
 * in pm.addr_signal.
 */
static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk)
{
	return READ_ONCE(msk->pm.addr_signal) &
		(BIT(MPTCP_ADD_ADDR_SIGNAL) | BIT(MPTCP_ADD_ADDR_ECHO));
}

/* True when the MPTCP_ADD_ADDR_SIGNAL bit is set in pm.addr_signal. */
static inline bool mptcp_pm_should_add_signal_addr(struct mptcp_sock *msk)
{
	return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL);
}

/* True when the MPTCP_ADD_ADDR_ECHO bit is set in pm.addr_signal. */
static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk)
{
	return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO);
}

/* True when the MPTCP_RM_ADDR_SIGNAL bit is set in pm.addr_signal. */
static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk)
{
	return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL);
}

/* True when pm.pm_type is MPTCP_PM_TYPE_USERSPACE. */
static inline bool mptcp_pm_is_userspace(const struct mptcp_sock *msk)
{
	return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_USERSPACE;
}

/* True when pm.pm_type is MPTCP_PM_TYPE_KERNEL. */
static inline bool mptcp_pm_is_kernel(const struct mptcp_sock *msk)
{
	return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_KERNEL;
}

/* Length of an ADD_ADDR option: the base length for @family (the IPv6 base
 * for AF_INET6, the default base otherwise), plus the truncated HMAC unless
 * @echo is set, plus the port field and its alignment when @port is set.
 */
static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port)
{
	u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;

	if (family == AF_INET6)
		len = TCPOLEN_MPTCP_ADD_ADDR6_BASE;
	if (!echo)
		len += MPTCPOPT_THMAC_LEN;
	/* account for 2 trailing 'nop' options */
	if (port)
		len += TCPOLEN_MPTCP_PORT_LEN + TCPOLEN_MPTCP_PORT_ALIGN;

	return len;
}

/* Length of an RM_ADDR option carrying @rm_list, or -EINVAL when the list
 * is empty or holds more than MPTCP_RM_IDS_MAX ids.
 */
static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list)
{
	if (rm_list->nr == 0 || rm_list->nr > MPTCP_RM_IDS_MAX)
		return -EINVAL;

	/* (nr - 1) is rounded up to a multiple of 4 before adding 1 back */
	return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1;
}

bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
			      unsigned int opt_size, unsigned int remaining,
			      struct mptcp_addr_info *addr, bool *echo,
			      bool *drop_other_suboptions);
bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
			     struct mptcp_rm_list *rm_list);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk,
			     struct mptcp_pm_addr_entry *skc);
int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk,
				    struct mptcp_pm_addr_entry *skc);
bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc);
bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
			  struct netlink_callback *cb);
int mptcp_userspace_pm_dump_addr(struct sk_buff *msg,
				 struct netlink_callback *cb);
int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr,
			 struct genl_info *info);
int mptcp_userspace_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr,
				struct genl_info *info);

/* Return the subflow local_id, mapping any negative value to 0. */
static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow)
{
	int local_id = READ_ONCE(subflow->local_id);

	if (local_id < 0)
		return 0;
	return local_id;
}

void __init mptcp_pm_kernel_register(void);
void __init mptcp_pm_userspace_register(void);
void __init mptcp_pm_nl_init(void);
void mptcp_pm_worker(struct mptcp_sock *msk);
void __mptcp_pm_kernel_worker(struct mptcp_sock *msk);
u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk);
u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk);
u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk);
u8 mptcp_pm_get_endp_fullmesh_max(const struct mptcp_sock *msk);
u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk);
u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk);

/* called under PM lock */
static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk)
{
	/* decrement the counter, then set accept_subflow when it is below the limit */
	if (--msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk))
		WRITE_ONCE(msk->pm.accept_subflow, true);
}

/* Wrapper taking pm.lock (BH-disabling spinlock) around
 * __mptcp_pm_close_subflow().
 */
static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk)
{
	spin_lock_bh(&msk->pm.lock);
	__mptcp_pm_close_subflow(msk);
	spin_unlock_bh(&msk->pm.lock);
}

/* True when pm.remote_deny_join_id0 is set, no local address has been used,
 * the add_addr_accepted limit is 0 and extra_subflows is below its limit.
 */
static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk)
{
	return READ_ONCE(msk->pm.remote_deny_join_id0) &&
	       msk->pm.local_addr_used == 0 &&
	       mptcp_pm_get_limit_add_addr_accepted(msk) == 0 &&
	       msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk);
}

void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk);

/* Return the SKB_EXT_MPTCP extension of @skb as found by skb_ext_find(). */
static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb)
{
	return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP);
}

void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);

/* True when the MPTCP_FALLBACK_DONE bit is set in msk->flags. */
static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
{
	return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}

/* Same check as __mptcp_check_fallback(), starting from a subflow socket:
 * the msk is reached through the subflow context conn pointer.
 */
static inline bool mptcp_check_fallback(const struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	return __mptcp_check_fallback(msk);
}

/* True when msk->first is set and that socket is in ESTABLISHED, SYN_SENT,
 * SYN_RECV or LISTEN state.
 */
static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk)
{
	struct sock *ssk = READ_ONCE(msk->first);

	return ssk && ((1 << inet_sk_state_load(ssk)) &
		       (TCPF_ESTABLISHED | TCPF_SYN_SENT |
			TCPF_SYN_RECV | TCPF_LISTEN));
}

bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib);

/* Attempt the fallback for the msk owning @ssk via __mptcp_try_fallback();
 * returns false when that call fails. On success, when snd_data_fin_enable
 * is set and @ssk has not yet shut down its send side, mark SEND_SHUTDOWN
 * on @ssk and call tcp_shutdown() with sk_allocation temporarily set to
 * GFP_ATOMIC.
 */
static inline bool mptcp_try_fallback(struct sock *ssk, int fb_mib)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;
	struct mptcp_sock *msk;

	msk = mptcp_sk(sk);
	if (!__mptcp_try_fallback(msk, fb_mib))
		return false;
	if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) {
		gfp_t saved_allocation = ssk->sk_allocation;

		/* we are in an atomic (BH) scope, override ssk default for data
		 * fin allocation
		 */
		ssk->sk_allocation = GFP_ATOMIC;
		ssk->sk_shutdown |= SEND_SHUTDOWN;
		tcp_shutdown(ssk, SEND_SHUTDOWN);
		ssk->sk_allocation = saved_allocation;
	}
	return true;
}

/* Clear request_mptcp on @subflow and try the fallback, warning once if
 * __mptcp_try_fallback() fails.
 */
static inline void mptcp_early_fallback(struct mptcp_sock *msk,
					struct mptcp_subflow_context *subflow,
					int fb_mib)
{
	subflow->request_mptcp = 0;
	WARN_ON_ONCE(!__mptcp_try_fallback(msk, fb_mib));
}

/* True when @skb is non-NULL and carries an MPTCP extension with
 * infinite_map set.
 */
static inline bool mptcp_check_infinite_map(struct sk_buff *skb)
{
	struct mptcp_ext *mpext;

	mpext = skb ? mptcp_get_ext(skb) : NULL;
	if (mpext && mpext->infinite_map)
		return true;

	return false;
}

/* True when the subflow has request_mptcp or request_join set. */
static inline bool is_active_ssk(struct mptcp_subflow_context *subflow)
{
	return (subflow->request_mptcp || subflow->request_join);
}

/* True when @sk is in TCP_SYN_RECV state while its subflow context is
 * flagged as active by is_active_ssk().
 */
static inline bool subflow_simultaneous_connect(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	/* Note that the sk state implies !subflow->conn_finished. */
	return sk->sk_state == TCP_SYN_RECV && is_active_ssk(subflow);
}

#ifdef CONFIG_SYN_COOKIES
void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req,
				       struct sk_buff *skb);
bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req,
					struct sk_buff *skb);
void __init mptcp_join_cookie_init(void);
#else
/* Stubs used when CONFIG_SYN_COOKIES is not defined: saving is a no-op,
 * the state lookup always returns false and init does nothing.
 */
static inline void
subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req,
				  struct sk_buff *skb) {}
static inline bool
mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req,
				   struct sk_buff *skb)
{
	return false;
}

static inline void mptcp_join_cookie_init(void) {}
#endif

#endif /* __MPTCP_PROTOCOL_H */
35 35 17 17 14 3 17 4 4 4 1 2 17 17 2 1 2 3 1 3 3 3 14 3 4 4 46 46 4 12 4 48 43 6 7 11 2 26 3 40 1 40 22 22 4 21 16 8 41 41 41 41 41 12 39 41 41 41 41 41 10 31 41 41 41 40 41 41 41 41 12 39 45 40 1 40 2 3 5 1 2 1 5 12 12 12 62 2 52 8 36 6 9 46 48 49 37 12 12 3 8 14 16 10 2 12 19 6 10 10 6 6 2 10 7 3 8 50 2 10 8 2 8 5 63 1 62 10 7 4 10 1 1 40 40 9 37 6 1 13 25 2 5 18 18 8 2 17 3 3 2 11 9 6 17 17 6 12 6 17 2 2 15 17 17 6 12 9 14 4 10 7 3 2 2 17 11 13 3 10 7 2 17 7 17 17 17 7 7 7 39 39 38 7 7 7 44 44 9 2 1 43 4 27 7 2 7 34 39 1 3 39 1 7 36 36 10 30 36 7 4 14 16 5 12 16 12 7 4 3 1 2 2 27 3 30 25 5 5 1 3 1 4 11 11 11 11 1 11 33 2 12 19 5 6 8 14 14 3 3 3 13 13 13 8 5 13 4 13 13 7 7 7 7 7 3 4 3 1 20 1 21 21 20 13 28 1 41 21 20 2 41 2 40 2 21 20 5 15 20 41 41 41 40 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 
335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 
835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 
1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 
1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 
2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 
2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 
2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 /* * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * Copyright (c) 2016-2017, Lance Chao <lancerchao@fb.com>. All rights reserved. * Copyright (c) 2016, Fridolin Pokorny <fridolin.pokorny@gmail.com>. All rights reserved. * Copyright (c) 2016, Nikos Mavrogiannopoulos <nmav@gnutls.org>. All rights reserved. * Copyright (c) 2018, Covalent IO, Inc. http://covalent.io * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */

#include <linux/bug.h>
#include <linux/sched/signal.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/splice.h>
#include <crypto/aead.h>

#include <net/strparser.h>
#include <net/tls.h>
#include <trace/events/sock.h>

#include "tls.h"

/* Per-record decryption arguments.
 *
 * The members wrapped in struct_group() are grouped under the name
 * "inargs"; @skb sits outside that group.
 *
 * Uses visible in this part of the file:
 *  - @zc, @tail: when @zc is set, tls_padding_length() takes @tail as the
 *    starting content-type byte instead of reading it from the skb.
 *    NOTE(review): "zc" presumably means zero-copy - confirm.
 *  - @async: asks tls_do_decryption() to submit without waiting; cleared
 *    again whenever the request did not stay in flight.
 *  - @async_done: set by tls_do_decryption() after it had to wait for all
 *    outstanding requests (the -EBUSY path).
 */
struct tls_decrypt_arg {
	struct_group(inargs,
	bool zc;
	bool async;
	bool async_done;
	u8 tail;
	);

	struct sk_buff *skb;
};

/* Decryption state located directly behind an aead_request.
 *
 * tls_decrypt_done() finds it at the request address plus
 * sizeof(request) + crypto_aead_reqsize(), rounded up to the alignment of
 * this struct, and reads @sk and @free_sgout from it.
 */
struct tls_decrypt_ctx {
	struct sock *sk;
	u8 iv[TLS_MAX_IV_SIZE];
	u8 aad[TLS_MAX_AAD_SIZE];
	u8 tail;
	bool free_sgout;
	struct scatterlist sg[];
};

/* Record a fatal error on the socket and report it.
 *
 * @err is expected to be a negative errno (a value >= 0 triggers a
 * one-time warning); its negation is what gets stored in sk->sk_err.
 */
noinline void tls_err_abort(struct sock *sk, int err)
{
	WARN_ON_ONCE(err >= 0);
	/* sk->sk_err should contain a positive error code. */
	WRITE_ONCE(sk->sk_err, -err);
	/* Paired with smp_rmb() in tcp_poll() */
	smp_wmb();
	sk_error_report(sk);
}

/* Count the scatterlist entries needed to map @len bytes of @skb starting
 * at byte @offset: one for the linear head if the range touches it, one
 * for every page fragment the range touches, plus whatever each skb on
 * the frag list needs (computed recursively).
 *
 * Returns the entry count, or -EMSGSIZE once @recursion_level reaches 24.
 * Hits BUG_ON() if the skb is exhausted before @len bytes are covered.
 */
static int __skb_nsg(struct sk_buff *skb, int offset, int len,
		     unsigned int recursion_level)
{
	int start = skb_headlen(skb);
	int i, chunk = start - offset;
	struct sk_buff *frag_iter;
	int elt = 0;

	if (unlikely(recursion_level >= 24))
		return -EMSGSIZE;

	/* Linear part: chunk > 0 means @offset falls inside the head. */
	if (chunk > 0) {
		if (chunk > len)
			chunk = len;
		elt++;
		len -= chunk;
		if (len == 0)
			return elt;
		offset += chunk;
	}

	/* Page fragments; [start, end) is the byte range of fragment i. */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
		chunk = end - offset;
		if (chunk > 0) {
			if (chunk > len)
				chunk = len;
			elt++;
			len -= chunk;
			if (len == 0)
				return elt;
			offset += chunk;
		}
		start = end;
	}

	/* Frag list: each member is itself an skb, so recurse into it with
	 * an offset relative to that member.
	 */
	if (unlikely(skb_has_frag_list(skb))) {
		skb_walk_frags(skb, frag_iter) {
			int end, ret;

			WARN_ON(start > offset + len);

			end = start + frag_iter->len;
			chunk = end - offset;
			if (chunk > 0) {
				if (chunk > len)
					chunk = len;
				ret = __skb_nsg(frag_iter, offset - start, chunk,
						recursion_level + 1);
				if (unlikely(ret < 0))
					return ret;
				elt += ret;
				len -= chunk;
				if (len == 0)
					return elt;
				offset += chunk;
			}
			start = end;
		}
	}
	BUG_ON(len);
	return elt;
}

/* Return the number of scatterlist elements required to completely map the
 * skb, or -EMSGSIZE if the recursion depth is exceeded.
 */
static int skb_nsg(struct sk_buff *skb, int offset, int len)
{
	return __skb_nsg(skb, offset, len, 0);
}

/* Determine the TLS 1.3 zero padding of the record in @skb and store the
 * record's content type in tls_msg(skb)->control.
 *
 * The scan starts at record offset full_len - TLS_TAG_SIZE - 1 and walks
 * backwards until a non-zero byte is found; that byte is the content type.
 * When darg->zc is set the scan is seeded with darg->tail, so a non-zero
 * tail means no bytes are read from the skb at all.
 *
 * Returns the number of zero bytes skipped (always 0 for versions other
 * than TLS 1.3), -EBADMSG if the scan would go below prot->prepend_size,
 * or the error returned by skb_copy_bits().
 */
static int tls_padding_length(struct tls_prot_info *prot, struct sk_buff *skb,
			      struct tls_decrypt_arg *darg)
{
	struct strp_msg *rxm = strp_msg(skb);
	struct tls_msg *tlm = tls_msg(skb);
	int sub = 0;

	/* Determine zero-padding length */
	if (prot->version == TLS_1_3_VERSION) {
		int offset = rxm->full_len - TLS_TAG_SIZE - 1;
		char content_type = darg->zc ? darg->tail : 0;
		int err;

		while (content_type == 0) {
			if (offset < prot->prepend_size)
				return -EBADMSG;
			err = skb_copy_bits(skb, rxm->offset + offset,
					    &content_type, 1);
			if (err)
				return err;
			if (content_type)
				break;
			sub++;
			offset--;
		}
		tlm->control = content_type;
	}
	return sub;
}

/* Completion callback for asynchronous decryption; tls_do_decryption()
 * installs it with the aead_request itself as @data.
 *
 * On error the socket is aborted and the error is stored in
 * ctx->async_wait.err. The destination pages are released when
 * dctx->free_sgout is set, the request is freed, and the waiter is
 * completed when decrypt_pending drops to zero.
 */
static void tls_decrypt_done(void *data, int err)
{
	struct aead_request *aead_req = data;
	struct crypto_aead *aead = crypto_aead_reqtfm(aead_req);
	struct scatterlist *sgout = aead_req->dst;
	struct tls_sw_context_rx *ctx;
	struct tls_decrypt_ctx *dctx;
	struct tls_context *tls_ctx;
	struct scatterlist *sg;
	unsigned int pages;
	struct sock *sk;
	int aead_size;

	/* If requests get too backlogged crypto API returns -EBUSY and calls
	 * ->complete(-EINPROGRESS) immediately followed by ->complete(0)
	 * to make waiting for backlog to flush with crypto_wait_req() easier.
	 * First wait converts -EBUSY -> -EINPROGRESS, and the second one
	 * -EINPROGRESS -> 0.
	 * We have a single struct crypto_async_request per direction, this
	 * scheme doesn't help us, so just ignore the first ->complete().
	 */
	if (err == -EINPROGRESS)
		return;

	/* Locate the tls_decrypt_ctx that follows the request and its
	 * transform-specific context.
	 */
	aead_size = sizeof(*aead_req) + crypto_aead_reqsize(aead);
	aead_size = ALIGN(aead_size, __alignof__(*dctx));
	dctx = (void *)((u8 *)aead_req + aead_size);

	sk = dctx->sk;
	tls_ctx = tls_get_ctx(sk);
	ctx = tls_sw_ctx_rx(tls_ctx);

	/* Propagate if there was an err */
	if (err) {
		if (err == -EBADMSG)
			TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR);
		ctx->async_wait.err = err;
		tls_err_abort(sk, err);
	}

	/* Free the destination pages if skb was not decrypted inplace */
	if (dctx->free_sgout) {
		/* Skip the first S/G entry as it points to AAD */
		for_each_sg(sg_next(sgout), sg, UINT_MAX, pages) {
			if (!sg)
				break;
			put_page(sg_page(sg));
		}
	}

	kfree(aead_req);

	if (atomic_dec_and_test(&ctx->decrypt_pending))
		complete(&ctx->async_wait.completion);
}

/* Wait until no asynchronous decryption is outstanding.
 *
 * One count is dropped from decrypt_pending; if the counter is still
 * non-zero the caller sleeps in crypto_wait_req() until
 * tls_decrypt_done() completes ctx->async_wait. The count is then taken
 * back, the async_hold queue is purged and ctx->async_wait.err returned.
 */
static int tls_decrypt_async_wait(struct tls_sw_context_rx *ctx)
{
	if (!atomic_dec_and_test(&ctx->decrypt_pending))
		crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
	atomic_inc(&ctx->decrypt_pending);

	__skb_queue_purge(&ctx->async_hold);
	return ctx->async_wait.err;
}

/* Configure @aead_req and run one AEAD decryption of @data_len bytes plus
 * the tag from @sgin into @sgout.
 *
 * Synchronous mode (!darg->async): submit, wait if the crypto layer
 * answers -EINPROGRESS or -EBUSY, and return the final result.
 *
 * Asynchronous mode (darg->async): tls_decrypt_done() is installed as the
 * callback and decrypt_pending is incremented before submission. Then:
 *  - -EINPROGRESS: returns 0, darg->async stays set;
 *  - -EBUSY: waits for every pending request, sets darg->async_done,
 *    clears darg->async and returns ctx->async_wait.err;
 *  - anything else (finished or failed synchronously): the pending count
 *    is dropped again, darg->async is cleared and the result returned.
 */
static int tls_do_decryption(struct sock *sk,
			     struct scatterlist *sgin,
			     struct scatterlist *sgout,
			     char *iv_recv,
			     size_t data_len,
			     struct aead_request *aead_req,
			     struct tls_decrypt_arg *darg)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_prot_info *prot = &tls_ctx->prot_info;
	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
	int ret;

	aead_request_set_tfm(aead_req, ctx->aead_recv);
	aead_request_set_ad(aead_req, prot->aad_size);
	aead_request_set_crypt(aead_req, sgin, sgout,
			       data_len + prot->tag_size,
			       (u8 *)iv_recv);

	if (darg->async) {
		aead_request_set_callback(aead_req,
					  CRYPTO_TFM_REQ_MAY_BACKLOG,
					  tls_decrypt_done, aead_req);
		DEBUG_NET_WARN_ON_ONCE(atomic_read(&ctx->decrypt_pending) < 1);
		atomic_inc(&ctx->decrypt_pending);
	} else {
		DECLARE_CRYPTO_WAIT(wait);

		aead_request_set_callback(aead_req,
					  CRYPTO_TFM_REQ_MAY_BACKLOG,
					  crypto_req_done, &wait);
		ret = crypto_aead_decrypt(aead_req);
		if (ret == -EINPROGRESS || ret == -EBUSY)
			ret = crypto_wait_req(ret, &wait);
		return ret;
	}

	ret = crypto_aead_decrypt(aead_req);
	if (ret == -EINPROGRESS)
		return 0;

	if (ret == -EBUSY) {
		ret = tls_decrypt_async_wait(ctx);
		darg->async_done = true;
		/* all completions have run, we're not doing async anymore */
		darg->async = false;
		return ret;
	}

	/* The callback will not run for this request: undo the increment. */
	atomic_dec(&ctx->decrypt_pending);
	darg->async = false;

	return ret;
}

/* Trim the open record's plaintext message to @target_size and its
 * encrypted message to @target_size plus the protocol overhead. A
 * @target_size of 0 (or less) is applied to both messages unchanged.
 */
static void tls_trim_both_msgs(struct sock *sk, int target_size)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_prot_info *prot = &tls_ctx->prot_info;
	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
	struct tls_rec *rec = ctx->open_rec;

	sk_msg_trim(sk, &rec->msg_plaintext, target_size);
	if (target_size > 0)
		target_size += prot->overhead_size;
	sk_msg_trim(sk, &rec->msg_encrypted, target_size);
}

/* Call sk_msg_alloc() on the open record's encrypted message with @len
 * and return its result.
 */
static int tls_alloc_encrypted_msg(struct sock *sk, int len)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
	struct tls_rec *rec = ctx->open_rec;
	struct sk_msg *msg_en = &rec->msg_encrypted;

	return sk_msg_alloc(sk, msg_en, len, 0);
}

/* Grow the open record's plaintext message to @required bytes by cloning
 * page references from its encrypted message. Returns the result of
 * sk_msg_clone().
 */
static int tls_clone_plaintext_msg(struct sock *sk, int required)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_prot_info *prot = &tls_ctx->prot_info;
	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
	struct tls_rec *rec = ctx->open_rec;
	struct sk_msg *msg_pl = &rec->msg_plaintext;
	struct sk_msg *msg_en = &rec->msg_encrypted;
	int skip, len;

	/* We add page references worth len bytes from encrypted sg
	 * at the end of plaintext sg. It is guaranteed that msg_en
	 * has enough required room (ensured by caller).
	 */
	len = required - msg_pl->sg.size;

	/* Skip initial bytes in msg_en's data to be able to use
	 * same offset of both plain and encrypted data.
	 */
	skip = prot->prepend_size + msg_pl->sg.size;

	return sk_msg_clone(sk, msg_pl, msg_en, skip, len);
}

/* Allocate and initialise a new, empty TX record for @sk.
 *
 * The allocation is zeroed and sized for struct tls_rec plus the request
 * context of the send transform. Both sk_msgs are initialised, and the
 * two-entry AEAD input/output scatterlists get the AAD buffer as their
 * first entry; the second entry has its end marker cleared (it is used
 * as a chain link by tls_push_record()).
 *
 * Returns the record, or NULL if the allocation fails.
 */
static struct tls_rec *tls_get_rec(struct sock *sk)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_prot_info *prot = &tls_ctx->prot_info;
	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
	struct sk_msg *msg_pl, *msg_en;
	struct tls_rec *rec;
	int mem_size;

	mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send);

	rec = kzalloc(mem_size, sk->sk_allocation);
	if (!rec)
		return NULL;

	msg_pl = &rec->msg_plaintext;
	msg_en = &rec->msg_encrypted;

	sk_msg_init(msg_pl);
	sk_msg_init(msg_en);

	sg_init_table(rec->sg_aead_in, 2);
	sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, prot->aad_size);
	sg_unmark_end(&rec->sg_aead_in[1]);

	sg_init_table(rec->sg_aead_out, 2);
	sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, prot->aad_size);
	sg_unmark_end(&rec->sg_aead_out[1]);

	rec->sk = sk;

	return rec;
}

/* Free both messages of @rec and then the record itself. */
static void tls_free_rec(struct sock *sk, struct tls_rec *rec)
{
	sk_msg_free(sk, &rec->msg_encrypted);
	sk_msg_free(sk, &rec->msg_plaintext);
	kfree(rec);
}

/* Free the TX context's open record, if there is one, and clear the
 * pointer to it.
 */
static void tls_free_open_rec(struct sock *sk)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
	struct tls_rec *rec = ctx->open_rec;

	if (rec) {
		tls_free_rec(sk, rec);
		ctx->open_rec = NULL;
	}
}

/* Transmit queued records from ctx->tx_list in order.
 *
 * A partially sent record at the head is finished first. After that,
 * records are pushed for as long as the head of the list is marked
 * tx_ready; the first record that is not ready stops the loop.
 *
 * @flags: send flags to use, or -1 to use each record's saved tx_flags.
 *
 * Returns 0 or the first non-zero result of a push. A negative result
 * other than -EAGAIN also aborts the socket via tls_err_abort().
 */
int tls_tx_records(struct sock *sk, int flags)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
	struct tls_rec *rec, *tmp;
	struct sk_msg *msg_en;
	int tx_flags, rc = 0;

	if (tls_is_partially_sent_record(tls_ctx)) {
		rec = list_first_entry(&ctx->tx_list,
				       struct tls_rec, list);

		if (flags == -1)
			tx_flags = rec->tx_flags;
		else
			tx_flags = flags;

		rc = tls_push_partial_record(sk, tls_ctx, tx_flags);
		if (rc)
			goto tx_err;

		/* Full record has been transmitted.
		 * Remove the head of tx_list
		 */
		list_del(&rec->list);
		sk_msg_free(sk, &rec->msg_plaintext);
		kfree(rec);
	}

	/* Tx all ready records */
	list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
		if (READ_ONCE(rec->tx_ready)) {
			if (flags == -1)
				tx_flags = rec->tx_flags;
			else
				tx_flags = flags;

			msg_en = &rec->msg_encrypted;
			rc = tls_push_sg(sk, tls_ctx,
					 &msg_en->sg.data[msg_en->sg.curr],
					 0, tx_flags);
			if (rc)
				goto tx_err;

			list_del(&rec->list);
			sk_msg_free(sk, &rec->msg_plaintext);
			kfree(rec);
		} else {
			break;
		}
	}

tx_err:
	if (rc < 0 && rc != -EAGAIN)
		tls_err_abort(sk, rc);

	return rc;
}

/* Completion callback for asynchronous encryption; tls_do_encryption()
 * installs it with the record as @data.
 *
 * It undoes the prepend adjustment made to the first encrypted
 * scatterlist entry, records any error in ctx->async_wait.err, marks a
 * successfully encrypted record tx_ready (scheduling the TX work when
 * that record is at the head of tx_list), and completes the waiter when
 * encrypt_pending drops to zero.
 */
static void tls_encrypt_done(void *data, int err)
{
	struct tls_sw_context_tx *ctx;
	struct tls_context *tls_ctx;
	struct tls_prot_info *prot;
	struct tls_rec *rec = data;
	struct scatterlist *sge;
	struct sk_msg *msg_en;
	struct sock *sk;

	if (err == -EINPROGRESS) /* see the comment in tls_decrypt_done() */
		return;

	msg_en = &rec->msg_encrypted;

	sk = rec->sk;
	tls_ctx = tls_get_ctx(sk);
	prot = &tls_ctx->prot_info;
	ctx = tls_sw_ctx_tx(tls_ctx);

	/* Reverse the offset/length shift applied in tls_do_encryption(). */
	sge = sk_msg_elem(msg_en, msg_en->sg.curr);
	sge->offset -= prot->prepend_size;
	sge->length += prot->prepend_size;

	/* Check if error is previously set on socket */
	if (err || sk->sk_err) {
		/* rec == NULL below means "do not mark ready / schedule". */
		rec = NULL;

		/* If err is already set on socket, return the same code */
		if (sk->sk_err) {
			ctx->async_wait.err = -sk->sk_err;
		} else {
			ctx->async_wait.err = err;
			tls_err_abort(sk, err);
		}
	}

	if (rec) {
		struct tls_rec *first_rec;

		/* Mark the record as ready for transmission */
		smp_store_mb(rec->tx_ready, true);

		/* If received record is at head of tx_list, schedule tx */
		first_rec = list_first_entry(&ctx->tx_list,
					     struct tls_rec, list);
		if (rec == first_rec) {
			/* Schedule the transmission */
			if (!test_and_set_bit(BIT_TX_SCHEDULED,
					      &ctx->tx_bitmask))
				schedule_delayed_work(&ctx->tx_work.work, 1);
		}
	}

	if (atomic_dec_and_test(&ctx->encrypt_pending))
		complete(&ctx->async_wait.completion);
}

/* Wait until no asynchronous encryption is outstanding.
 *
 * One count is dropped from encrypt_pending; if the counter is still
 * non-zero the caller sleeps in crypto_wait_req() until
 * tls_encrypt_done() completes ctx->async_wait. The count is then taken
 * back and ctx->async_wait.err returned.
 */
static int tls_encrypt_async_wait(struct tls_sw_context_tx *ctx)
{
	if (!atomic_dec_and_test(&ctx->encrypt_pending))
		crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
	atomic_inc(&ctx->encrypt_pending);

	return ctx->async_wait.err;
}

/* Encrypt the context's open record.
 *
 * Builds the per-record IV, shifts the encrypted scatterlist entry at
 * index @start past the record prepend area, queues the record on
 * ctx->tx_list and submits @aead_req with tls_encrypt_done() as callback.
 *
 * Returns:
 *  - 0: encryption finished synchronously; the record is marked tx_ready;
 *  - -EINPROGRESS: the request is (or was) handled asynchronously, which
 *    includes a -EBUSY submission that was waited for without error;
 *  - any other negative value: failure; the record has been removed from
 *    tx_list again and remains the open record.
 * In the first two cases the record is unhooked from ctx->open_rec and
 * the TX record sequence number is advanced.
 */
static int tls_do_encryption(struct sock *sk,
			     struct tls_context *tls_ctx,
			     struct tls_sw_context_tx *ctx,
			     struct aead_request *aead_req,
			     size_t data_len, u32 start)
{
	struct tls_prot_info *prot = &tls_ctx->prot_info;
	struct tls_rec *rec = ctx->open_rec;
	struct sk_msg *msg_en = &rec->msg_encrypted;
	struct scatterlist *sge = sk_msg_elem(msg_en, start);
	int rc, iv_offset = 0;

	/* For CCM based ciphers, first byte of IV is a constant */
	switch (prot->cipher_type) {
	case TLS_CIPHER_AES_CCM_128:
		rec->iv_data[0] = TLS_AES_CCM_IV_B0_BYTE;
		iv_offset = 1;
		break;
	case TLS_CIPHER_SM4_CCM:
		rec->iv_data[0] = TLS_SM4_CCM_IV_B0_BYTE;
		iv_offset = 1;
		break;
	}

	memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv,
	       prot->iv_size + prot->salt_size);

	tls_xor_iv_with_seq(prot, rec->iv_data + iv_offset,
			    tls_ctx->tx.rec_seq);

	/* Skip the prepend area for the duration of the crypto operation;
	 * this is reverted below or in tls_encrypt_done().
	 */
	sge->offset += prot->prepend_size;
	sge->length -= prot->prepend_size;

	msg_en->sg.curr = start;

	aead_request_set_tfm(aead_req, ctx->aead_send);
	aead_request_set_ad(aead_req, prot->aad_size);
	aead_request_set_crypt(aead_req, rec->sg_aead_in,
			       rec->sg_aead_out,
			       data_len, rec->iv_data);

	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				  tls_encrypt_done, rec);

	/* Add the record in tx_list */
	list_add_tail((struct list_head *)&rec->list, &ctx->tx_list);
	DEBUG_NET_WARN_ON_ONCE(atomic_read(&ctx->encrypt_pending) < 1);
	atomic_inc(&ctx->encrypt_pending);

	rc = crypto_aead_encrypt(aead_req);
	if (rc == -EBUSY) {
		rc = tls_encrypt_async_wait(ctx);
		rc = rc ?: -EINPROGRESS;
		/*
		 * The async callback tls_encrypt_done() has already
		 * decremented encrypt_pending and restored the sge on
		 * both success and error. Skip the synchronous cleanup
		 * below on error, just remove the record and return.
		 */
		if (rc != -EINPROGRESS) {
			list_del(&rec->list);
			return rc;
		}
	}
	/* Synchronous completion (success or failure): the callback did
	 * not run, so drop the pending count and restore the sge here.
	 */
	if (!rc || rc != -EINPROGRESS) {
		atomic_dec(&ctx->encrypt_pending);
		sge->offset -= prot->prepend_size;
		sge->length += prot->prepend_size;
	}

	if (!rc) {
		WRITE_ONCE(rec->tx_ready, true);
	} else if (rc != -EINPROGRESS) {
		list_del(&rec->list);
		return rc;
	}

	/* Unhook the record from context if encryption is not failure */
	ctx->open_rec = NULL;
	tls_advance_record_sn(sk, prot, &tls_ctx->tx);
	return rc;
}

/* Split the plaintext of an open record in two at msg_opl->apply_bytes.
 *
 * The first apply_bytes bytes stay in @msg_opl; the remaining scatterlist
 * entries are copied into the plaintext message of a newly allocated
 * record, which also gets its own encrypted buffer of the original
 * plaintext size plus @tx_overhead_size. If the split point falls inside
 * an entry, the entry is shortened and its tail becomes the first entry
 * of the new message, holding an extra page reference.
 *
 * @to:       receives the new record on success.
 * @orig_end: receives msg_opl's sg.end as it was before the split, for
 *            use by tls_merge_open_record().
 * @from, @msg_oen and @split_point are not referenced in this function.
 *
 * Returns 0 on success, -ENOMEM if no record could be allocated, or the
 * negative result of sk_msg_alloc().
 */
static int tls_split_open_record(struct sock *sk, struct tls_rec *from,
				 struct tls_rec **to, struct sk_msg *msg_opl,
				 struct sk_msg *msg_oen, u32 split_point,
				 u32 tx_overhead_size, u32 *orig_end)
{
	u32 i, j, bytes = 0, apply = msg_opl->apply_bytes;
	struct scatterlist *sge, *osge, *nsge;
	u32 orig_size = msg_opl->sg.size;
	struct scatterlist tmp = { };
	struct sk_msg *msg_npl;
	struct tls_rec *new;
	int ret;

	new = tls_get_rec(sk);
	if (!new)
		return -ENOMEM;
	ret = sk_msg_alloc(sk, &new->msg_encrypted, msg_opl->sg.size +
			   tx_overhead_size, 0);
	if (ret < 0) {
		tls_free_rec(sk, new);
		return ret;
	}

	/* Consume 'apply' bytes from the front of the old message. */
	*orig_end = msg_opl->sg.end;
	i = msg_opl->sg.start;
	sge = sk_msg_elem(msg_opl, i);
	while (apply && sge->length) {
		if (sge->length > apply) {
			u32 len = sge->length - apply;

			/* Entry straddles the split point: keep its head
			 * here and stash its tail in tmp.
			 */
			get_page(sg_page(sge));
			sg_set_page(&tmp, sg_page(sge), len,
				    sge->offset + apply);
			sge->length = apply;
			bytes += apply;
			apply = 0;
		} else {
			apply -= sge->length;
			bytes += sge->length;
		}

		sk_msg_iter_var_next(i);
		if (i == msg_opl->sg.end)
			break;
		sge = sk_msg_elem(msg_opl, i);
	}

	msg_opl->sg.end = i;
	msg_opl->sg.curr = i;
	msg_opl->sg.copybreak = 0;
	msg_opl->apply_bytes = 0;
	msg_opl->sg.size = bytes;

	msg_npl = &new->msg_plaintext;
	msg_npl->apply_bytes = apply;
	msg_npl->sg.size = orig_size - bytes;

	j = msg_npl->sg.start;
	nsge = sk_msg_elem(msg_npl, j);
	if (tmp.length) {
		memcpy(nsge, &tmp, sizeof(*nsge));
		sk_msg_iter_var_next(j);
		nsge = sk_msg_elem(msg_npl, j);
	}

	/* Copy the remaining old entries, up to the original end, into the
	 * new message.
	 */
	osge = sk_msg_elem(msg_opl, i);
	while (osge->length) {
		memcpy(nsge, osge, sizeof(*nsge));
		sg_unmark_end(nsge);
		sk_msg_iter_var_next(i);
		sk_msg_iter_var_next(j);
		if (i == *orig_end)
			break;
		osge = sk_msg_elem(msg_opl, i);
		nsge = sk_msg_elem(msg_npl, j);
	}

	msg_npl->sg.end = j;
	msg_npl->sg.curr = j;
	msg_npl->sg.copybreak = 0;

	*to = new;
	return 0;
}

/* Undo tls_split_open_record(): fold record @from back into record @to.
 *
 * If the last plaintext entry of @to and the first plaintext entry of
 * @from are adjacent pieces of the same page, they are rejoined and the
 * page reference held for the second piece is dropped. @to's plaintext
 * end is reset to @orig_end and its size and apply_bytes are restored.
 * @to's encrypted message is freed and replaced by @from's, and @from
 * itself is freed.
 */
static void tls_merge_open_record(struct sock *sk, struct tls_rec *to,
				  struct tls_rec *from, u32 orig_end)
{
	struct sk_msg *msg_npl = &from->msg_plaintext;
	struct sk_msg *msg_opl = &to->msg_plaintext;
	struct scatterlist *osge, *nsge;
	u32 i, j;

	i = msg_opl->sg.end;
	sk_msg_iter_var_prev(i);
	j = msg_npl->sg.start;

	osge = sk_msg_elem(msg_opl, i);
	nsge = sk_msg_elem(msg_npl, j);

	if (sg_page(osge) == sg_page(nsge) &&
	    osge->offset + osge->length == nsge->offset) {
		osge->length += nsge->length;
		put_page(sg_page(nsge));
	}

	msg_opl->sg.end = orig_end;
	msg_opl->sg.curr = orig_end;
	msg_opl->sg.copybreak = 0;
	msg_opl->apply_bytes = msg_opl->sg.size + msg_npl->sg.size;
	msg_opl->sg.size += msg_npl->sg.size;

	sk_msg_free(sk, &to->msg_encrypted);
	sk_msg_xfer_full(&to->msg_encrypted, &from->msg_encrypted);

	kfree(from);
}

/* Close the open record, encrypt it and try to transmit it.
 *
 * The record is split first when apply_bytes covers only part of the
 * plaintext, or when the encrypted buffer is too small for the plaintext
 * plus protocol overhead. The scatterlists are then chained together
 * (AAD -> payload [-> TLS 1.3 content type]), the AAD and record header
 * are written, and tls_do_encryption() is called.
 *
 * Returns 0 if there is no open record, a negative value from the split
 * or from tls_do_encryption() (including -EINPROGRESS for asynchronous
 * encryption), otherwise the result of tls_tx_records().
 */
static int tls_push_record(struct sock *sk, int flags,
			   unsigned char record_type)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_prot_info *prot = &tls_ctx->prot_info;
	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
	struct tls_rec *rec = ctx->open_rec, *tmp = NULL;
	u32 i, split_point, orig_end;
	struct sk_msg *msg_pl, *msg_en;
	struct aead_request *req;
	bool split;
	int rc;

	if (!rec)
		return 0;

	msg_pl = &rec->msg_plaintext;
	msg_en = &rec->msg_encrypted;

	split_point = msg_pl->apply_bytes;
	split = split_point && split_point < msg_pl->sg.size;
	/* Also force a split when the encrypted buffer cannot hold the data
	 * to be sent plus the per-record overhead.
	 */
	if (unlikely((!split &&
		      msg_pl->sg.size +
		      prot->overhead_size > msg_en->sg.size) ||
		     (split &&
		      split_point +
		      prot->overhead_size > msg_en->sg.size))) {
		split = true;
		split_point = msg_en->sg.size;
	}
	if (split) {
		rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en,
					   split_point, prot->overhead_size,
					   &orig_end);
		if (rc < 0)
			return rc;
		/* This can happen if above tls_split_open_record allocates
		 * a single large encryption buffer instead of two smaller
		 * ones. In this case adjust pointers and continue without
		 * split.
		 */
		if (!msg_pl->sg.size) {
			tls_merge_open_record(sk, rec, tmp, orig_end);
			msg_pl = &rec->msg_plaintext;
			msg_en = &rec->msg_encrypted;
			split = false;
		}
		sk_msg_trim(sk, msg_en, msg_pl->sg.size +
			    prot->overhead_size);
	}

	rec->tx_flags = flags;
	req = &rec->aead_req;

	i = msg_pl->sg.end;
	sk_msg_iter_var_prev(i);

	/* msg_pl->sg.data is a ring; data[MAX+1] is reserved for the wrap
	 * link (frags won't use it). 'i' is now the last filled entry:
	 *
	 *        i   end              start
	 *        v    v                 v            [ rsv ]
	 * [ d ][ d ][   ][   ]...[   ][ d ][ d ][ d ][chain]
	 *   ^   END                                   v
	 *   `-----------------------------------------'
	 *
	 * Note that SGL does not allow chain-after-chain, so for TLS 1.3,
	 * we must make sure we don't create the wrap entry and then chain
	 * link to content_type immediately at index 0.
	 */
	if (i < msg_pl->sg.start)
		sg_chain(msg_pl->sg.data, ARRAY_SIZE(msg_pl->sg.data),
			 msg_pl->sg.data);

	rec->content_type = record_type;
	if (prot->version == TLS_1_3_VERSION) {
		/* Add content type to end of message.  No padding added */
		sg_set_buf(&rec->sg_content_type, &rec->content_type, 1);
		sg_mark_end(&rec->sg_content_type);
		sg_chain(msg_pl->sg.data, i + 2, &rec->sg_content_type);
	} else {
		sg_mark_end(sk_msg_elem(msg_pl, i));
	}

	/* AEAD input: AAD entry chained to the plaintext. */
	i = msg_pl->sg.start;
	sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]);

	i = msg_en->sg.end;
	sk_msg_iter_var_prev(i);
	sg_mark_end(sk_msg_elem(msg_en, i));

	/* AEAD output: AAD entry chained to the encrypted buffer. */
	i = msg_en->sg.start;
	sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]);

	tls_make_aad(rec->aad_space, msg_pl->sg.size + prot->tail_size,
		     tls_ctx->tx.rec_seq, record_type, prot);

	tls_fill_prepend(tls_ctx,
			 page_address(sg_page(&msg_en->sg.data[i])) +
			 msg_en->sg.data[i].offset,
			 msg_pl->sg.size + prot->tail_size,
			 record_type);

	tls_ctx->pending_open_record_frags = false;

	rc = tls_do_encryption(sk, tls_ctx, ctx, req,
			       msg_pl->sg.size + prot->tail_size, i);
	if (rc < 0) {
		if (rc != -EINPROGRESS) {
			/* Any hard failure is reported on the socket as
			 * -EBADMSG; a split is rolled back.
			 */
			tls_err_abort(sk, -EBADMSG);
			if (split) {
				tls_ctx->pending_open_record_frags = true;
				tls_merge_open_record(sk, rec, tmp, orig_end);
			}
		}
		ctx->async_capable = 1;
		return rc;
	} else if (split) {
		/* The second half of the split becomes the new open record. */
		msg_pl = &tmp->msg_plaintext;
		msg_en = &tmp->msg_encrypted;
		sk_msg_trim(sk, msg_en, msg_pl->sg.size + prot->overhead_size);
		tls_ctx->pending_open_record_frags = true;
		ctx->open_rec = tmp;
	}

	return tls_tx_records(sk, flags);
}

static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
			       bool full_record, u8 record_type,
			       ssize_t *copied, int flags)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
	struct sk_msg msg_redir = { };
	struct sk_psock *psock;
	struct sock *sk_redir;
	struct tls_rec *rec;
	bool enospc, policy, redir_ingress;
	int err = 0, send;
	u32 delta = 0;

	policy = !(flags & MSG_SENDPAGE_NOPOLICY);
	psock = sk_psock_get(sk);
	if (!psock || !policy) {
		err = tls_push_record(sk, flags, record_type);
		if (err && err != -EINPROGRESS && sk->sk_err == EBADMSG) {
			*copied -= sk_msg_free(sk, msg);
			tls_free_open_rec(sk);
			err = -sk->sk_err;
		}
		if (psock)
			sk_psock_put(sk,
psock); return err; } more_data: enospc = sk_msg_full(msg); if (psock->eval == __SK_NONE) { delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); delta -= msg->sg.size; if ((s32)delta > 0) { /* It indicates that we executed bpf_msg_pop_data(), * causing the plaintext data size to decrease. * Therefore the encrypted data size also needs to * correspondingly decrease. We only need to subtract * delta to calculate the new ciphertext length since * ktls does not support block encryption. */ struct sk_msg *enc = &ctx->open_rec->msg_encrypted; sk_msg_trim(sk, enc, enc->sg.size - delta); } } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc && !full_record) { err = -ENOSPC; goto out_err; } msg->cork_bytes = 0; send = msg->sg.size; if (msg->apply_bytes && msg->apply_bytes < send) send = msg->apply_bytes; switch (psock->eval) { case __SK_PASS: err = tls_push_record(sk, flags, record_type); if (err && err != -EINPROGRESS && sk->sk_err == EBADMSG) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); err = -sk->sk_err; goto out_err; } break; case __SK_REDIRECT: redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; memcpy(&msg_redir, msg, sizeof(*msg)); if (msg->apply_bytes < send) msg->apply_bytes = 0; else msg->apply_bytes -= send; sk_msg_return_zero(sk, msg, send); msg->sg.size -= send; release_sock(sk); err = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, &msg_redir, send, flags); lock_sock(sk); if (err < 0) { /* Regardless of whether the data represented by * msg_redir is sent successfully, we have already * uncharged it via sk_msg_return_zero(). The * msg->sg.size represents the remaining unprocessed * data, which needs to be uncharged here. 
*/ sk_mem_uncharge(sk, msg->sg.size); *copied -= sk_msg_free_nocharge(sk, &msg_redir); msg->sg.size = 0; } if (msg->sg.size == 0) tls_free_open_rec(sk); break; case __SK_DROP: default: sk_msg_free_partial(sk, msg, send); if (msg->apply_bytes < send) msg->apply_bytes = 0; else msg->apply_bytes -= send; if (msg->sg.size == 0) tls_free_open_rec(sk); *copied -= (send + delta); err = -EACCES; } if (likely(!err)) { bool reset_eval = !ctx->open_rec; rec = ctx->open_rec; if (rec) { msg = &rec->msg_plaintext; if (!msg->apply_bytes) reset_eval = true; } if (reset_eval) { psock->eval = __SK_NONE; if (psock->sk_redir) { sock_put(psock->sk_redir); psock->sk_redir = NULL; } } if (rec) goto more_data; } out_err: sk_psock_put(sk, psock); return err; } static int tls_sw_push_pending_record(struct sock *sk, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; struct sk_msg *msg_pl; size_t copied; if (!rec) return 0; msg_pl = &rec->msg_plaintext; copied = msg_pl->sg.size; if (!copied) return 0; return bpf_exec_tx_verdict(msg_pl, sk, true, TLS_RECORD_TYPE_DATA, &copied, flags); } static int tls_sw_sendmsg_splice(struct sock *sk, struct msghdr *msg, struct sk_msg *msg_pl, size_t try_to_copy, ssize_t *copied) { struct page *page = NULL, **pages = &page; do { ssize_t part; size_t off; part = iov_iter_extract_pages(&msg->msg_iter, &pages, try_to_copy, 1, 0, &off); if (part <= 0) return part ?: -EIO; if (WARN_ON_ONCE(!sendpage_ok(page))) { iov_iter_revert(&msg->msg_iter, part); return -EIO; } sk_msg_page_add(msg_pl, page, part, off); msg_pl->sg.copybreak = 0; msg_pl->sg.curr = msg_pl->sg.end; sk_mem_charge(sk, part); *copied += part; try_to_copy -= part; } while (try_to_copy && !sk_msg_full(msg_pl)); return 0; } static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); struct tls_context *tls_ctx = 
tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); bool async_capable = ctx->async_capable; unsigned char record_type = TLS_RECORD_TYPE_DATA; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool eor = !(msg->msg_flags & MSG_MORE); size_t try_to_copy; ssize_t copied = 0; struct sk_msg *msg_pl, *msg_en; struct tls_rec *rec; int required_size; int num_async = 0; bool full_record; int record_room; int num_zc = 0; int orig_size; int ret = 0; if (!eor && (msg->msg_flags & MSG_EOR)) return -EINVAL; if (unlikely(msg->msg_controllen)) { ret = tls_process_cmsg(sk, msg, &record_type); if (ret) { if (ret == -EINPROGRESS) num_async++; else if (ret != -EAGAIN) goto end; } } while (msg_data_left(msg)) { if (sk->sk_err) { ret = -sk->sk_err; goto send_end; } if (ctx->open_rec) rec = ctx->open_rec; else rec = ctx->open_rec = tls_get_rec(sk); if (!rec) { ret = -ENOMEM; goto send_end; } msg_pl = &rec->msg_plaintext; msg_en = &rec->msg_encrypted; orig_size = msg_pl->sg.size; full_record = false; try_to_copy = msg_data_left(msg); record_room = tls_ctx->tx_max_payload_len - msg_pl->sg.size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true; } required_size = msg_pl->sg.size + try_to_copy + prot->overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; alloc_encrypted: ret = tls_alloc_encrypted_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto wait_for_memory; /* Adjust try_to_copy according to the amount that was * actually allocated. 
The difference is due * to max sg elements limit */ try_to_copy -= required_size - msg_en->sg.size; full_record = true; } if (try_to_copy && (msg->msg_flags & MSG_SPLICE_PAGES)) { ret = tls_sw_sendmsg_splice(sk, msg, msg_pl, try_to_copy, &copied); if (ret < 0) goto send_end; tls_ctx->pending_open_record_frags = true; if (sk_msg_full(msg_pl)) { full_record = true; sk_msg_trim(sk, msg_en, msg_pl->sg.size + prot->overhead_size); } if (full_record || eor) goto copied; continue; } if (!is_kvec && (full_record || eor) && !async_capable) { u32 first = msg_pl->sg.end; ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter, msg_pl, try_to_copy); if (ret) goto fallback_to_reg_send; num_zc++; copied += try_to_copy; sk_msg_sg_copy_set(msg_pl, first); ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, record_type, &copied, msg->msg_flags); if (ret) { if (ret == -EINPROGRESS) num_async++; else if (ret == -ENOMEM) goto wait_for_memory; else if (ctx->open_rec && ret == -ENOSPC) { if (msg_pl->cork_bytes) { ret = 0; goto send_end; } goto rollback_iter; } else if (ret != -EAGAIN) goto send_end; } /* Transmit if any encryptions have completed */ if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { cancel_delayed_work(&ctx->tx_work.work); tls_tx_records(sk, msg->msg_flags); } continue; rollback_iter: copied -= try_to_copy; sk_msg_sg_copy_clear(msg_pl, first); iov_iter_revert(&msg->msg_iter, msg_pl->sg.size - orig_size); fallback_to_reg_send: sk_msg_trim(sk, msg_pl, orig_size); } required_size = msg_pl->sg.size + try_to_copy; ret = tls_clone_plaintext_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto send_end; /* Adjust try_to_copy according to the amount that was * actually allocated. 
The difference is due * to max sg elements limit */ try_to_copy -= required_size - msg_pl->sg.size; full_record = true; sk_msg_trim(sk, msg_en, msg_pl->sg.size + prot->overhead_size); } if (try_to_copy) { ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_pl, try_to_copy); if (ret < 0) goto trim_sgl; } /* Open records defined only if successfully copied, otherwise * we would trim the sg but not reset the open record frags. */ tls_ctx->pending_open_record_frags = true; copied += try_to_copy; copied: if (full_record || eor) { ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, record_type, &copied, msg->msg_flags); if (ret) { if (ret == -EINPROGRESS) num_async++; else if (ret == -ENOMEM) goto wait_for_memory; else if (ret != -EAGAIN) { if (ret == -ENOSPC) ret = 0; goto send_end; } } /* Transmit if any encryptions have completed */ if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { cancel_delayed_work(&ctx->tx_work.work); tls_tx_records(sk, msg->msg_flags); } } continue; wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { trim_sgl: if (ctx->open_rec) tls_trim_both_msgs(sk, orig_size); goto send_end; } if (ctx->open_rec && msg_en->sg.size < required_size) goto alloc_encrypted; } send_end: if (!num_async) { goto end; } else if (num_zc || eor) { int err; /* Wait for pending encryptions to get completed */ err = tls_encrypt_async_wait(ctx); if (err) { ret = err; copied = 0; } } /* Transmit if any encryptions have completed */ if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { cancel_delayed_work(&ctx->tx_work.work); tls_tx_records(sk, msg->msg_flags); } end: ret = sk_stream_error(sk, msg->msg_flags, ret); return copied > 0 ? 
copied : ret; } int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tls_context *tls_ctx = tls_get_ctx(sk); int ret; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_CMSG_COMPAT | MSG_SPLICE_PAGES | MSG_EOR | MSG_SENDPAGE_NOPOLICY)) return -EOPNOTSUPP; ret = mutex_lock_interruptible(&tls_ctx->tx_lock); if (ret) return ret; lock_sock(sk); ret = tls_sw_sendmsg_locked(sk, msg, size); release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); return ret; } /* * Handle unexpected EOF during splice without SPLICE_F_MORE set. */ void tls_sw_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec; struct sk_msg *msg_pl; ssize_t copied = 0; bool retrying = false; int ret = 0; if (!ctx->open_rec) return; mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); retry: /* same checks as in tls_sw_push_pending_record() */ rec = ctx->open_rec; if (!rec) goto unlock; msg_pl = &rec->msg_plaintext; if (msg_pl->sg.size == 0) goto unlock; /* Check the BPF advisor and perform transmission. */ ret = bpf_exec_tx_verdict(msg_pl, sk, false, TLS_RECORD_TYPE_DATA, &copied, 0); switch (ret) { case 0: case -EAGAIN: if (retrying) goto unlock; retrying = true; goto retry; case -EINPROGRESS: break; default: goto unlock; } /* Wait for pending encryptions to get completed */ if (tls_encrypt_async_wait(ctx)) goto unlock; /* Transmit if any encryptions have completed */ if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { cancel_delayed_work(&ctx->tx_work.work); tls_tx_records(sk, 0); } unlock: release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); } /* When has_copied is true the caller has already moved bytes to * userspace. Report sk_err but leave it set so the next read * surfaces it instead of a spurious EOF, otherwise sk_err is * consumed via sock_error(). 
*/ static int tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, bool released, bool has_copied) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; long timeo; /* a rekey is pending, let userspace deal with it */ if (unlikely(ctx->key_update_pending)) return -EKEYEXPIRED; timeo = sock_rcvtimeo(sk, nonblock); while (!tls_strp_msg_ready(ctx)) { if (!sk_psock_queue_empty(psock)) return 0; if (sk->sk_err) { if (has_copied) return -READ_ONCE(sk->sk_err); return sock_error(sk); } if (ret < 0) return ret; if (!skb_queue_empty(&sk->sk_receive_queue)) { tls_strp_check_rcv(&ctx->strp); if (tls_strp_msg_ready(ctx)) break; } if (sk->sk_shutdown & RCV_SHUTDOWN) return 0; if (sock_flag(sk, SOCK_DONE)) return 0; if (!timeo) return -EAGAIN; released = true; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = sk_wait_event(sk, &timeo, tls_strp_msg_ready(ctx) || !sk_psock_queue_empty(psock), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); /* Handle signals */ if (signal_pending(current)) return sock_intr_errno(timeo); } if (unlikely(!tls_strp_msg_load(&ctx->strp, released))) return tls_rx_rec_wait(sk, psock, nonblock, false, has_copied); return 1; } static int tls_setup_from_iter(struct iov_iter *from, int length, int *pages_used, struct scatterlist *to, int to_max_pages) { int rc = 0, i = 0, num_elem = *pages_used, maxpages; struct page *pages[MAX_SKB_FRAGS]; unsigned int size = 0; ssize_t copied, use; size_t offset; while (length > 0) { i = 0; maxpages = to_max_pages - num_elem; if (maxpages == 0) { rc = -EFAULT; goto out; } copied = iov_iter_get_pages2(from, pages, length, maxpages, &offset); if (copied <= 0) { rc = -EFAULT; goto out; } length -= copied; size += copied; while (copied) { use = min_t(int, copied, PAGE_SIZE - offset); sg_set_page(&to[num_elem], pages[i], use, 
offset); sg_unmark_end(&to[num_elem]); /* We do not uncharge memory from this API */ offset = 0; copied -= use; i++; num_elem++; } } /* Mark the end in the last sg entry if newly added */ if (num_elem > *pages_used) sg_mark_end(&to[num_elem - 1]); out: if (rc) iov_iter_revert(from, size); *pages_used = num_elem; return rc; } static struct sk_buff * tls_alloc_clrtxt_skb(struct sock *sk, struct sk_buff *skb, unsigned int full_len) { struct strp_msg *clr_rxm; struct sk_buff *clr_skb; int err; clr_skb = alloc_skb_with_frags(0, full_len, TLS_PAGE_ORDER, &err, sk->sk_allocation); if (!clr_skb) return NULL; skb_copy_header(clr_skb, skb); clr_skb->len = full_len; clr_skb->data_len = full_len; clr_rxm = strp_msg(clr_skb); clr_rxm->offset = 0; return clr_skb; } /* Decrypt handlers * * tls_decrypt_sw() and tls_decrypt_device() are decrypt handlers. * They must transform the darg in/out argument are as follows: * | Input | Output * ------------------------------------------------------------------- * zc | Zero-copy decrypt allowed | Zero-copy performed * async | Async decrypt allowed | Async crypto used / in progress * skb | * | Output skb * * If ZC decryption was performed darg.skb will point to the input skb. */ /* This function decrypts the input skb into either out_iov or in out_sg * or in skb buffers itself. The input parameter 'darg->zc' indicates if * zero-copy mode needs to be tried or not. With zero-copy mode, either * out_iov or out_sg must be non-NULL. In case both out_iov and out_sg are * NULL, then the decryption happens inside skb buffers itself, i.e. * zero-copy gets disabled and 'darg->zc' is updated. 
*/ static int tls_decrypt_sg(struct sock *sk, struct iov_iter *out_iov, struct scatterlist *out_sg, struct tls_decrypt_arg *darg) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; int n_sgin, n_sgout, aead_size, err, pages = 0; struct sk_buff *skb = tls_strp_msg(ctx); const struct strp_msg *rxm = strp_msg(skb); const struct tls_msg *tlm = tls_msg(skb); struct aead_request *aead_req; struct scatterlist *sgin = NULL; struct scatterlist *sgout = NULL; const int data_len = rxm->full_len - prot->overhead_size; int tail_pages = !!prot->tail_size; struct tls_decrypt_ctx *dctx; struct sk_buff *clear_skb; int iv_offset = 0; u8 *mem; n_sgin = skb_nsg(skb, rxm->offset + prot->prepend_size, rxm->full_len - prot->prepend_size); if (n_sgin < 1) return n_sgin ?: -EBADMSG; if (darg->zc && (out_iov || out_sg)) { clear_skb = NULL; if (out_iov) n_sgout = 1 + tail_pages + iov_iter_npages_cap(out_iov, INT_MAX, data_len); else n_sgout = sg_nents(out_sg); } else { darg->zc = false; clear_skb = tls_alloc_clrtxt_skb(sk, skb, rxm->full_len); if (!clear_skb) return -ENOMEM; n_sgout = 1 + skb_shinfo(clear_skb)->nr_frags; } /* Increment to accommodate AAD */ n_sgin = n_sgin + 1; /* Allocate a single block of memory which contains * aead_req || tls_decrypt_ctx. * Both structs are variable length. 
*/ aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv); aead_size = ALIGN(aead_size, __alignof__(*dctx)); mem = kmalloc(aead_size + struct_size(dctx, sg, size_add(n_sgin, n_sgout)), sk->sk_allocation); if (!mem) { err = -ENOMEM; goto exit_free_skb; } /* Segment the allocated memory */ aead_req = (struct aead_request *)mem; dctx = (struct tls_decrypt_ctx *)(mem + aead_size); dctx->sk = sk; sgin = &dctx->sg[0]; sgout = &dctx->sg[n_sgin]; /* For CCM based ciphers, first byte of nonce+iv is a constant */ switch (prot->cipher_type) { case TLS_CIPHER_AES_CCM_128: dctx->iv[0] = TLS_AES_CCM_IV_B0_BYTE; iv_offset = 1; break; case TLS_CIPHER_SM4_CCM: dctx->iv[0] = TLS_SM4_CCM_IV_B0_BYTE; iv_offset = 1; break; } /* Prepare IV */ if (prot->version == TLS_1_3_VERSION || prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) { memcpy(&dctx->iv[iv_offset], tls_ctx->rx.iv, prot->iv_size + prot->salt_size); } else { err = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, &dctx->iv[iv_offset] + prot->salt_size, prot->iv_size); if (err < 0) goto exit_free; memcpy(&dctx->iv[iv_offset], tls_ctx->rx.iv, prot->salt_size); } tls_xor_iv_with_seq(prot, &dctx->iv[iv_offset], tls_ctx->rx.rec_seq); /* Prepare AAD */ tls_make_aad(dctx->aad, rxm->full_len - prot->overhead_size + prot->tail_size, tls_ctx->rx.rec_seq, tlm->control, prot); /* Prepare sgin */ sg_init_table(sgin, n_sgin); sg_set_buf(&sgin[0], dctx->aad, prot->aad_size); err = skb_to_sgvec(skb, &sgin[1], rxm->offset + prot->prepend_size, rxm->full_len - prot->prepend_size); if (err < 0) goto exit_free; if (clear_skb) { sg_init_table(sgout, n_sgout); sg_set_buf(&sgout[0], dctx->aad, prot->aad_size); err = skb_to_sgvec(clear_skb, &sgout[1], prot->prepend_size, data_len + prot->tail_size); if (err < 0) goto exit_free; } else if (out_iov) { sg_init_table(sgout, n_sgout); sg_set_buf(&sgout[0], dctx->aad, prot->aad_size); err = tls_setup_from_iter(out_iov, data_len, &pages, &sgout[1], (n_sgout - 1 - tail_pages)); if (err < 0) 
goto exit_free_pages; if (prot->tail_size) { sg_unmark_end(&sgout[pages]); sg_set_buf(&sgout[pages + 1], &dctx->tail, prot->tail_size); sg_mark_end(&sgout[pages + 1]); } } else if (out_sg) { memcpy(sgout, out_sg, n_sgout * sizeof(*sgout)); } dctx->free_sgout = !!pages; /* Prepare and submit AEAD request */ err = tls_do_decryption(sk, sgin, sgout, dctx->iv, data_len + prot->tail_size, aead_req, darg); if (err) { if (darg->async_done) goto exit_free_skb; goto exit_free_pages; } darg->skb = clear_skb ?: tls_strp_msg(ctx); clear_skb = NULL; if (unlikely(darg->async)) { err = tls_strp_msg_hold(&ctx->strp, &ctx->async_hold); if (err) { err = tls_decrypt_async_wait(ctx); darg->async = false; } return err; } if (unlikely(darg->async_done)) return 0; if (prot->tail_size) darg->tail = dctx->tail; exit_free_pages: /* Release the pages in case iov was mapped to pages */ for (; pages > 0; pages--) put_page(sg_page(&sgout[pages])); exit_free: kfree(mem); exit_free_skb: consume_skb(clear_skb); return err; } static int tls_decrypt_sw(struct sock *sk, struct tls_context *tls_ctx, struct msghdr *msg, struct tls_decrypt_arg *darg) { struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; struct strp_msg *rxm; int pad, err; err = tls_decrypt_sg(sk, &msg->msg_iter, NULL, darg); if (err < 0) { if (err == -EBADMSG) TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR); return err; } /* keep going even for ->async, the code below is TLS 1.3 */ /* If opportunistic TLS 1.3 ZC failed retry without ZC */ if (unlikely(darg->zc && prot->version == TLS_1_3_VERSION && darg->tail != TLS_RECORD_TYPE_DATA)) { darg->zc = false; if (!darg->tail) TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXNOPADVIOL); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTRETRY); return tls_decrypt_sw(sk, tls_ctx, msg, darg); } pad = tls_padding_length(prot, darg->skb, darg); if (pad < 0) { if (darg->skb != tls_strp_msg(ctx)) consume_skb(darg->skb); return pad; } rxm = 
strp_msg(darg->skb); rxm->full_len -= pad; return 0; } static int tls_decrypt_device(struct sock *sk, struct msghdr *msg, struct tls_context *tls_ctx, struct tls_decrypt_arg *darg) { struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; struct strp_msg *rxm; int pad, err; if (tls_ctx->rx_conf != TLS_HW) return 0; err = tls_device_decrypted(sk, tls_ctx); if (err <= 0) return err; pad = tls_padding_length(prot, tls_strp_msg(ctx), darg); if (pad < 0) return pad; darg->async = false; darg->skb = tls_strp_msg(ctx); /* ->zc downgrade check, in case TLS 1.3 gets here */ darg->zc &= !(prot->version == TLS_1_3_VERSION && tls_msg(darg->skb)->control != TLS_RECORD_TYPE_DATA); rxm = strp_msg(darg->skb); rxm->full_len -= pad; if (!darg->zc) { /* Non-ZC case needs a real skb */ darg->skb = tls_strp_msg_detach(ctx); if (!darg->skb) return -ENOMEM; } else { unsigned int off, len; /* In ZC case nobody cares about the output skb. * Just copy the data here. Note the skb is not fully trimmed. 
*/ off = rxm->offset + prot->prepend_size; len = rxm->full_len - prot->overhead_size; err = skb_copy_datagram_msg(darg->skb, off, msg, len); if (err) return err; } return 1; } static int tls_check_pending_rekey(struct sock *sk, struct tls_context *ctx, struct sk_buff *skb) { const struct strp_msg *rxm = strp_msg(skb); const struct tls_msg *tlm = tls_msg(skb); char hs_type; int err; if (likely(tlm->control != TLS_RECORD_TYPE_HANDSHAKE)) return 0; if (rxm->full_len < 1) return 0; err = skb_copy_bits(skb, rxm->offset, &hs_type, 1); if (err < 0) { DEBUG_NET_WARN_ON_ONCE(1); return err; } if (hs_type == TLS_HANDSHAKE_KEYUPDATE) { struct tls_sw_context_rx *rx_ctx = ctx->priv_ctx_rx; WRITE_ONCE(rx_ctx->key_update_pending, true); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXREKEYRECEIVED); } return 0; } static int tls_rx_one_record(struct sock *sk, struct msghdr *msg, struct tls_decrypt_arg *darg) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct strp_msg *rxm; int err; err = tls_decrypt_device(sk, msg, tls_ctx, darg); if (!err) err = tls_decrypt_sw(sk, tls_ctx, msg, darg); if (err < 0) return err; rxm = strp_msg(darg->skb); rxm->offset += prot->prepend_size; rxm->full_len -= prot->overhead_size; tls_advance_record_sn(sk, prot, &tls_ctx->rx); return tls_check_pending_rekey(sk, tls_ctx, darg->skb); } int decrypt_skb(struct sock *sk, struct scatterlist *sgout) { struct tls_decrypt_arg darg = { .zc = true, }; return tls_decrypt_sg(sk, NULL, sgout, &darg); } /* All records returned from a recvmsg() call must have the same type. * 0 is not a valid content type. Use it as "no type reported, yet". 
*/ static int tls_record_content_type(struct msghdr *msg, struct tls_msg *tlm, u8 *control) { int err; if (!*control) { *control = tlm->control; if (!*control) return -EBADMSG; err = put_cmsg(msg, SOL_TLS, TLS_GET_RECORD_TYPE, sizeof(*control), control); if (*control != TLS_RECORD_TYPE_DATA) { if (err || msg->msg_flags & MSG_CTRUNC) return -EIO; } } else if (*control != tlm->control) { return 0; } return 1; } static void tls_rx_rec_done(struct tls_sw_context_rx *ctx) { tls_strp_msg_done(&ctx->strp); } /* This function traverses the rx_list in tls receive context to copies the * decrypted records into the buffer provided by caller zero copy is not * true. Further, the records are removed from the rx_list if it is not a peek * case and the record has been consumed completely. */ static int process_rx_list(struct tls_sw_context_rx *ctx, struct msghdr *msg, u8 *control, size_t skip, size_t len, bool is_peek, bool *more) { struct sk_buff *skb = skb_peek(&ctx->rx_list); struct tls_msg *tlm; ssize_t copied = 0; int err; while (skip && skb) { struct strp_msg *rxm = strp_msg(skb); tlm = tls_msg(skb); err = tls_record_content_type(msg, tlm, control); if (err <= 0) goto more; if (skip < rxm->full_len) break; skip = skip - rxm->full_len; skb = skb_peek_next(skb, &ctx->rx_list); } while (len && skb) { struct sk_buff *next_skb; struct strp_msg *rxm = strp_msg(skb); int chunk = min_t(unsigned int, rxm->full_len - skip, len); tlm = tls_msg(skb); err = tls_record_content_type(msg, tlm, control); if (err <= 0) goto more; err = skb_copy_datagram_msg(skb, rxm->offset + skip, msg, chunk); if (err < 0) goto more; len = len - chunk; copied = copied + chunk; /* Consume the data from record if it is non-peek case*/ if (!is_peek) { rxm->offset = rxm->offset + chunk; rxm->full_len = rxm->full_len - chunk; /* Return if there is unconsumed data in the record */ if (rxm->full_len - skip) break; } /* The remaining skip-bytes must lie in 1st record in rx_list. 
* So from the 2nd record, 'skip' should be 0. */ skip = 0; if (msg) msg->msg_flags |= MSG_EOR; next_skb = skb_peek_next(skb, &ctx->rx_list); if (!is_peek) { __skb_unlink(skb, &ctx->rx_list); consume_skb(skb); } skb = next_skb; } err = 0; out: return copied ? : err; more: if (more) *more = true; goto out; } static bool tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot, size_t len_left, size_t decrypted, ssize_t done, size_t *flushed_at) { size_t max_rec; if (len_left <= decrypted) return false; max_rec = prot->overhead_size - prot->tail_size + TLS_MAX_PAYLOAD_SIZE; if (done - *flushed_at < SZ_128K && tcp_inq(sk) > max_rec) return false; *flushed_at = done; return sk_flush_backlog(sk); } static int tls_rx_reader_acquire(struct sock *sk, struct tls_sw_context_rx *ctx, bool nonblock) { long timeo; int ret; timeo = sock_rcvtimeo(sk, nonblock); while (unlikely(ctx->reader_present)) { DEFINE_WAIT_FUNC(wait, woken_wake_function); ctx->reader_contended = 1; add_wait_queue(&ctx->wq, &wait); ret = sk_wait_event(sk, &timeo, !READ_ONCE(ctx->reader_present), &wait); remove_wait_queue(&ctx->wq, &wait); if (timeo <= 0) return -EAGAIN; if (signal_pending(current)) return sock_intr_errno(timeo); if (ret < 0) return ret; } WRITE_ONCE(ctx->reader_present, 1); return 0; } static int tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx, bool nonblock) { int err; lock_sock(sk); err = tls_rx_reader_acquire(sk, ctx, nonblock); if (err) release_sock(sk); return err; } static void tls_rx_reader_release(struct sock *sk, struct tls_sw_context_rx *ctx) { if (unlikely(ctx->reader_contended)) { if (wq_has_sleeper(&ctx->wq)) wake_up(&ctx->wq); else ctx->reader_contended = 0; WARN_ON_ONCE(!ctx->reader_present); } WRITE_ONCE(ctx->reader_present, 0); } static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx) { tls_rx_reader_release(sk, ctx); release_sock(sk); } int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct 
tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; ssize_t decrypted = 0, async_copy_bytes = 0; struct sk_psock *psock; unsigned char control = 0; size_t flushed_at = 0; struct strp_msg *rxm; struct tls_msg *tlm; ssize_t copied = 0; ssize_t peeked = 0; bool async = false; int target, err; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool is_peek = flags & MSG_PEEK; bool rx_more = false; bool released = true; bool bpf_strp_enabled; bool zc_capable; if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); err = tls_rx_reader_lock(sk, ctx, flags & MSG_DONTWAIT); if (err < 0) return err; psock = sk_psock_get(sk); bpf_strp_enabled = sk_psock_strp_enabled(psock); /* If crypto failed the connection is broken */ err = ctx->async_wait.err; if (err) goto end; /* Process pending decrypted records. It must be non-zero-copy */ err = process_rx_list(ctx, msg, &control, 0, len, is_peek, &rx_more); if (err < 0) goto end; /* process_rx_list() will set @control if it processed any records */ copied = err; if (len <= copied || rx_more || (control && control != TLS_RECORD_TYPE_DATA)) goto end; target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); len = len - copied; zc_capable = !bpf_strp_enabled && !is_kvec && !is_peek && ctx->zc_capable; decrypted = 0; while (len && (decrypted + copied < target || tls_strp_msg_ready(ctx))) { struct tls_decrypt_arg darg; int to_decrypt, chunk; err = tls_rx_rec_wait(sk, psock, flags & MSG_DONTWAIT, released, !!(decrypted + copied)); if (err <= 0) { if (psock) { chunk = sk_msg_recvmsg(sk, psock, msg, len, flags); if (chunk > 0) { decrypted += chunk; len -= chunk; continue; } } goto recv_end; } memset(&darg.inargs, 0, sizeof(darg.inargs)); rxm = strp_msg(tls_strp_msg(ctx)); tlm = tls_msg(tls_strp_msg(ctx)); to_decrypt = rxm->full_len - prot->overhead_size; if (zc_capable && to_decrypt <= len && tlm->control == 
TLS_RECORD_TYPE_DATA) darg.zc = true; /* Do not use async mode if record is non-data */ if (tlm->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled) darg.async = ctx->async_capable; else darg.async = false; err = tls_rx_one_record(sk, msg, &darg); if (err < 0) { tls_err_abort(sk, -EBADMSG); goto recv_end; } async |= darg.async; /* If the type of records being processed is not known yet, * set it to record type just dequeued. If it is already known, * but does not match the record type just dequeued, go to end. * We always get record type here since for tls1.2, record type * is known just after record is dequeued from stream parser. * For tls1.3, we disable async. */ err = tls_record_content_type(msg, tls_msg(darg.skb), &control); if (err <= 0) { DEBUG_NET_WARN_ON_ONCE(darg.zc); tls_rx_rec_done(ctx); put_on_rx_list_err: __skb_queue_tail(&ctx->rx_list, darg.skb); goto recv_end; } /* periodically flush backlog, and feed strparser */ released = tls_read_flush_backlog(sk, prot, len, to_decrypt, decrypted + copied, &flushed_at); /* TLS 1.3 may have updated the length by more than overhead */ rxm = strp_msg(darg.skb); chunk = rxm->full_len; tls_rx_rec_done(ctx); if (!darg.zc) { bool partially_consumed = chunk > len; struct sk_buff *skb = darg.skb; DEBUG_NET_WARN_ON_ONCE(darg.skb == ctx->strp.anchor); if (async) { /* TLS 1.2-only, to_decrypt must be text len */ chunk = min_t(int, to_decrypt, len); async_copy_bytes += chunk; put_on_rx_list: decrypted += chunk; len -= chunk; __skb_queue_tail(&ctx->rx_list, skb); if (unlikely(control != TLS_RECORD_TYPE_DATA)) break; continue; } if (bpf_strp_enabled) { released = true; err = sk_psock_tls_strp_read(psock, skb); if (err != __SK_PASS) { rxm->offset = rxm->offset + rxm->full_len; rxm->full_len = 0; if (err == __SK_DROP) consume_skb(skb); continue; } } if (partially_consumed) chunk = len; err = skb_copy_datagram_msg(skb, rxm->offset, msg, chunk); if (err < 0) goto put_on_rx_list_err; if (is_peek) { peeked += chunk; goto 
put_on_rx_list; } if (partially_consumed) { rxm->offset += chunk; rxm->full_len -= chunk; goto put_on_rx_list; } consume_skb(skb); } decrypted += chunk; len -= chunk; /* Return full control message to userspace before trying * to parse another message type */ msg->msg_flags |= MSG_EOR; if (control != TLS_RECORD_TYPE_DATA) break; } recv_end: if (async) { int ret; /* Wait for all previously submitted records to be decrypted */ ret = tls_decrypt_async_wait(ctx); if (ret) { if (err >= 0 || err == -EINPROGRESS) err = ret; goto end; } /* Drain records from the rx_list & copy if required */ if (is_peek) err = process_rx_list(ctx, msg, &control, copied + peeked, decrypted - peeked, is_peek, NULL); else err = process_rx_list(ctx, msg, &control, 0, async_copy_bytes, is_peek, NULL); /* we could have copied less than we wanted, and possibly nothing */ decrypted += max(err, 0) - async_copy_bytes; } copied += decrypted; end: tls_rx_reader_unlock(sk, ctx); if (psock) sk_psock_put(sk, psock); return copied ? 
: err; }

/*
 * Splice the payload of one TLS data record into @pipe.
 *
 * The record is taken from ctx->rx_list when that queue is non-empty;
 * otherwise one record is waited for and decrypted. Control records are
 * rejected with -EINVAL. A record that is rejected, fails to splice or is
 * only partially spliced is put back at the head of ctx->rx_list (with
 * offset/full_len advanced in the partial case).
 *
 * @ppos is not referenced by this function.
 *
 * Returns @copied when it is non-zero (this includes a negative value
 * reported by skb_splice_bits()), otherwise @err.
 */
ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
			   struct pipe_inode_info *pipe,
			   size_t len, unsigned int flags)
{
	struct tls_context *tls_ctx = tls_get_ctx(sock->sk);
	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
	struct strp_msg *rxm = NULL;
	struct sock *sk = sock->sk;
	struct tls_msg *tlm;
	struct sk_buff *skb;
	ssize_t copied = 0;
	int chunk;
	int err;

	err = tls_rx_reader_lock(sk, ctx, flags & SPLICE_F_NONBLOCK);
	if (err < 0)
		return err;

	if (!skb_queue_empty(&ctx->rx_list)) {
		/* A previously decrypted record is pending; use it first */
		skb = __skb_dequeue(&ctx->rx_list);
	} else {
		struct tls_decrypt_arg darg;

		err = tls_rx_rec_wait(sk, NULL, flags & SPLICE_F_NONBLOCK,
				      true, false);
		if (err <= 0)
			goto splice_read_end;

		memset(&darg.inargs, 0, sizeof(darg.inargs));

		err = tls_rx_one_record(sk, NULL, &darg);
		if (err < 0) {
			tls_err_abort(sk, -EBADMSG);
			goto splice_read_end;
		}

		tls_rx_rec_done(ctx);
		skb = darg.skb;
	}

	rxm = strp_msg(skb);
	tlm = tls_msg(skb);

	/* splice does not support reading control messages */
	if (tlm->control != TLS_RECORD_TYPE_DATA) {
		err = -EINVAL;
		goto splice_requeue;
	}

	chunk = min_t(unsigned int, rxm->full_len, len);
	copied = skb_splice_bits(skb, sk, rxm->offset, pipe, chunk, flags);
	if (copied < 0)
		goto splice_requeue;

	/* Keep the unread remainder of the record for the next reader */
	if (copied < rxm->full_len) {
		rxm->offset += copied;
		rxm->full_len -= copied;
		goto splice_requeue;
	}

	consume_skb(skb);

splice_read_end:
	tls_rx_reader_unlock(sk, ctx);
	return copied ? : err;

splice_requeue:
	__skb_queue_head(&ctx->rx_list, skb);
	goto splice_read_end;
}

/*
 * Feed decrypted TLS data records to @read_actor.
 *
 * Fails with -EINVAL when a psock is attached to @sk. Records are taken from
 * ctx->rx_list first and otherwise decrypted one at a time. The loop stops
 * when the actor consumes nothing, when a control record is met (-EINVAL),
 * or when desc->count drops to zero; a record that was not fully consumed is
 * put back at the head of ctx->rx_list.
 *
 * Returns @copied when it is non-zero, otherwise @err.
 */
int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
		     sk_read_actor_t read_actor)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
	struct tls_prot_info *prot = &tls_ctx->prot_info;
	struct strp_msg *rxm = NULL;
	struct sk_buff *skb = NULL;
	struct sk_psock *psock;
	size_t flushed_at = 0;
	bool released = true;
	struct tls_msg *tlm;
	ssize_t copied = 0;
	ssize_t decrypted;
	int err, used;

	/* The reference is only taken to test for presence of a psock */
	psock = sk_psock_get(sk);
	if (psock) {
		sk_psock_put(sk, psock);
		return -EINVAL;
	}
	err = tls_rx_reader_acquire(sk, ctx, true);
	if (err < 0)
		return err;

	/* If crypto failed the connection is broken */
	err = ctx->async_wait.err;
	if (err)
		goto read_sock_end;

	decrypted = 0;
	do {
		if (!skb_queue_empty(&ctx->rx_list)) {
			skb = __skb_dequeue(&ctx->rx_list);
			rxm = strp_msg(skb);
			tlm = tls_msg(skb);
		} else {
			struct tls_decrypt_arg darg;

			err = tls_rx_rec_wait(sk, NULL, true, released, !!copied);
			if (err <= 0)
				goto read_sock_end;

			memset(&darg.inargs, 0, sizeof(darg.inargs));

			err = tls_rx_one_record(sk, NULL, &darg);
			if (err < 0) {
				tls_err_abort(sk, -EBADMSG);
				goto read_sock_end;
			}

			released = tls_read_flush_backlog(sk, prot, INT_MAX,
							  0, decrypted,
							  &flushed_at);
			skb = darg.skb;
			rxm = strp_msg(skb);
			tlm = tls_msg(skb);
			decrypted += rxm->full_len;

			tls_rx_rec_done(ctx);
		}

		/* read_sock does not support reading control messages */
		if (tlm->control != TLS_RECORD_TYPE_DATA) {
			err = -EINVAL;
			goto read_sock_requeue;
		}

		used = read_actor(desc, skb, rxm->offset, rxm->full_len);
		if (used <= 0) {
			/* Only report the actor's result if nothing was consumed yet */
			if (!copied)
				err = used;
			goto read_sock_requeue;
		}
		copied += used;
		if (used < rxm->full_len) {
			rxm->offset += used;
			rxm->full_len -= used;
			if (!desc->count)
				goto read_sock_requeue;
		} else {
			consume_skb(skb);
			/* Clearing skb terminates the loop */
			if (!desc->count)
				skb = NULL;
		}
	} while (skb);

read_sock_end:
	tls_rx_reader_release(sk, ctx);
	return copied ? : err;

read_sock_requeue:
	__skb_queue_head(&ctx->rx_list, skb);
	goto read_sock_end;
}

/*
 * Report whether a read on @sk would find data: true when the psock ingress
 * list is non-empty, when the strparser has a message ready, or when
 * ctx->rx_list holds records.
 */
bool tls_sw_sock_is_readable(struct sock *sk)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
	bool ingress_empty = true;
	struct sk_psock *psock;

	rcu_read_lock();
	psock = sk_psock(sk);
	if (psock)
		ingress_empty = list_empty(&psock->ingress_msg);
	rcu_read_unlock();

	return !ingress_empty || tls_strp_msg_ready(ctx) ||
		!skb_queue_empty(&ctx->rx_list);
}

/*
 * Parse the TLS record header found at strp->stm.offset in @skb.
 *
 * Returns 0 when the header is not completely available yet, the full record
 * size (payload length from the header plus TLS_HEADER_SIZE) on success, or
 * a negative errno after aborting the strparser when the header is invalid.
 */
int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb)
{
	struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
	struct tls_prot_info *prot = &tls_ctx->prot_info;
	char header[TLS_HEADER_SIZE + TLS_MAX_IV_SIZE];
	size_t cipher_overhead;
	size_t data_len = 0;
	int ret;

	/* Verify that we have a full TLS header, or wait for more data */
	if (strp->stm.offset + prot->prepend_size > skb->len)
		return 0;

	/* Sanity-check size of on-stack buffer. */
	if (WARN_ON(prot->prepend_size > sizeof(header))) {
		ret = -EINVAL;
		goto read_failure;
	}

	/* Linearize header to local buffer */
	ret = skb_copy_bits(skb, strp->stm.offset, header, prot->prepend_size);
	if (ret < 0)
		goto read_failure;

	/* Byte 0 of the header is stored as the strparser mark */
	strp->mark = header[0];

	/* Bytes 3 and 4 hold the length, most significant byte first */
	data_len = ((header[4] & 0xFF) | (header[3] << 8));

	cipher_overhead = prot->tag_size;
	if (prot->version != TLS_1_3_VERSION &&
	    prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305)
		cipher_overhead += prot->iv_size;

	if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead +
	    prot->tail_size) {
		ret = -EMSGSIZE;
		goto read_failure;
	}
	if (data_len < cipher_overhead) {
		ret = -EBADMSG;
		goto read_failure;
	}

	/* Note that both TLS1.3 and TLS1.2 use TLS_1_2 version here */
	if (header[1] != TLS_1_2_VERSION_MINOR ||
	    header[2] != TLS_1_2_VERSION_MAJOR) {
		ret = -EINVAL;
		goto read_failure;
	}

	tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE,
				     TCP_SKB_CB(skb)->seq + strp->stm.offset);
	return data_len + TLS_HEADER_SIZE;

read_failure:
	tls_strp_abort_strp(strp, ret);

	return ret;
}

/*
 * Invoke the data-ready callback that was saved in the RX context, for the
 * socket owning @strp.
 */
void tls_rx_msg_ready(struct tls_strparser *strp)
{
	struct tls_sw_context_rx *ctx;

	ctx = container_of(strp, struct tls_sw_context_rx, strp);
	ctx->saved_data_ready(strp->sk);
}

/*
 * Data-ready callback installed by tls_sw_strparser_arm(). Runs the TLS
 * strparser with sk->sk_allocation temporarily set to GFP_ATOMIC, then calls
 * the saved callback when an attached psock has queued ingress messages.
 */
static void tls_data_ready(struct sock *sk)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
	struct sk_psock *psock;
	gfp_t alloc_save;

	trace_sk_data_ready(sk);

	alloc_save = sk->sk_allocation;
	sk->sk_allocation = GFP_ATOMIC;
	tls_strp_data_ready(&ctx->strp);
	sk->sk_allocation = alloc_save;

	psock = sk_psock_get(sk);
	if (psock) {
		if (!list_empty(&psock->ingress_msg))
			ctx->saved_data_ready(sk);
		sk_psock_put(sk, psock);
	}
}

/*
 * Mark the TX context as closing and scheduled, then synchronously disable
 * its delayed TX work.
 */
void tls_sw_cancel_work_tx(struct tls_context *tls_ctx)
{
	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);

	set_bit(BIT_TX_CLOSING, &ctx->tx_bitmask);
	set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask);
	disable_delayed_work_sync(&ctx->tx_work.work);
}

/*
 * Release the software TX resources of @sk: wait for async encryption, push
 * out records, free whatever is left on tx_list, free the send AEAD and the
 * open record.
 */
void tls_sw_release_resources_tx(struct sock *sk)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
	struct tls_rec *rec, *tmp;

	/* Wait for any pending async encryptions to complete */
	tls_encrypt_async_wait(ctx);

	tls_tx_records(sk, -1);

	/* Free up un-sent records in tx_list. First, free
	 * the partially sent record if any at head of tx_list.
	 */
	if (tls_ctx->partially_sent_record) {
		tls_free_partial_record(sk, tls_ctx);
		rec = list_first_entry(&ctx->tx_list,
				       struct tls_rec, list);
		list_del(&rec->list);
		sk_msg_free(sk, &rec->msg_plaintext);
		kfree(rec);
	}

	list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
		list_del(&rec->list);
		sk_msg_free(sk, &rec->msg_encrypted);
		sk_msg_free(sk, &rec->msg_plaintext);
		kfree(rec);
	}

	crypto_free_aead(ctx->aead_send);
	tls_free_open_rec(sk);
}

/* Free the software TX context attached to @tls_ctx. */
void tls_sw_free_ctx_tx(struct tls_context *tls_ctx)
{
	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);

	kfree(ctx);
}

/*
 * Release the software RX resources of @sk. Nothing is done unless a receive
 * AEAD was allocated; in that case rx_list is purged, the AEAD is freed, the
 * strparser is stopped and the original sk_data_ready is restored when one
 * was saved.
 */
void tls_sw_release_resources_rx(struct sock *sk)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);

	if (ctx->aead_recv) {
		__skb_queue_purge(&ctx->rx_list);
		crypto_free_aead(ctx->aead_recv);
		tls_strp_stop(&ctx->strp);
		/* If tls_sw_strparser_arm() was not called (cleanup paths)
		 * we still want to tls_strp_stop(), but sk->sk_data_ready was
		 * never swapped.
		 */
		if (ctx->saved_data_ready) {
			write_lock_bh(&sk->sk_callback_lock);
			sk->sk_data_ready = ctx->saved_data_ready;
			write_unlock_bh(&sk->sk_callback_lock);
		}
	}
}

/* Call tls_strp_done() on the strparser embedded in the RX context. */
void tls_sw_strparser_done(struct tls_context *tls_ctx)
{
	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);

	tls_strp_done(&ctx->strp);
}

/* Free the software RX context attached to @tls_ctx. */
void tls_sw_free_ctx_rx(struct tls_context *tls_ctx)
{
	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);

	kfree(ctx);
}

/*
 * Full RX teardown: release the RX resources, finish the strparser with
 * __tls_strp_done() and free the RX context, in that order.
 */
void tls_sw_free_resources_rx(struct sock *sk)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context_rx *ctx;

	ctx = tls_sw_ctx_rx(tls_ctx);

	tls_sw_release_resources_rx(sk);
	__tls_strp_done(&ctx->strp);
	tls_sw_free_ctx_rx(tls_ctx);
}

/*
 * The work handler to transmit the encrypted records in tx_list.
 *
 * Does nothing when the socket has no TLS context, when BIT_TX_CLOSING is
 * set, or when BIT_TX_SCHEDULED was not set. If tx_lock cannot be taken
 * immediately, the work is re-armed instead of blocking.
 */
static void tx_work_handler(struct work_struct *work)
{
	struct delayed_work *delayed_work = to_delayed_work(work);
	struct tx_work *tx_work = container_of(delayed_work,
					       struct tx_work, work);
	struct sock *sk = tx_work->sk;
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context_tx *ctx;

	if (unlikely(!tls_ctx))
		return;

	ctx = tls_sw_ctx_tx(tls_ctx);
	if (test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask))
		return;

	if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
		return;

	if (mutex_trylock(&tls_ctx->tx_lock)) {
		lock_sock(sk);
		tls_tx_records(sk, -1);
		release_sock(sk);
		mutex_unlock(&tls_ctx->tx_lock);
	} else if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) {
		/* Someone is holding the tx_lock, they will likely run Tx
		 * and cancel the work on their way out of the lock section.
		 * Schedule a long delay just in case.
		 */
		schedule_delayed_work(&ctx->tx_work.work, msecs_to_jiffies(10));
	}
}

/*
 * Return the tx_ready flag of the first record on tx_list, or false when the
 * list is empty.
 */
static bool tls_is_tx_ready(struct tls_sw_context_tx *ctx)
{
	struct tls_rec *rec;

	rec = list_first_entry_or_null(&ctx->tx_list, struct tls_rec, list);
	if (!rec)
		return false;

	return READ_ONCE(rec->tx_ready);
}

/*
 * Write-space hook for the software TX path: queue the TX work immediately
 * when the head record is ready and the work is not already scheduled.
 */
void tls_sw_write_space(struct sock *sk, struct tls_context *ctx)
{
	struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx);

	/* Schedule the transmission if tx list is ready */
	if (tls_is_tx_ready(tx_ctx) &&
	    !test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask))
		schedule_delayed_work(&tx_ctx->tx_work.work, 0);
}

/*
 * Save the socket's current sk_data_ready in the RX context and install
 * tls_data_ready in its place, under sk_callback_lock.
 */
void tls_sw_strparser_arm(struct sock *sk, struct tls_context *tls_ctx)
{
	struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_ctx);

	write_lock_bh(&sk->sk_callback_lock);
	rx_ctx->saved_data_ready = sk->sk_data_ready;
	sk->sk_data_ready = tls_data_ready;
	write_unlock_bh(&sk->sk_callback_lock);
}

/*
 * Recompute rx_ctx->zc_capable: set when rx_no_pad is enabled or when the
 * protocol version is not TLS 1.3.
 */
void tls_update_rx_zc_capable(struct tls_context *tls_ctx)
{
	struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_ctx);

	rx_ctx->zc_capable = tls_ctx->rx_no_pad ||
		tls_ctx->prot_info.version != TLS_1_3_VERSION;
}

/*
 * Allocate the software TX context, or reuse ctx->priv_ctx_tx when it is
 * already set, and (re)initialise its wait object, pending counter, record
 * list and delayed work. Returns NULL when the allocation fails.
 */
static struct tls_sw_context_tx *init_ctx_tx(struct tls_context *ctx, struct sock *sk)
{
	struct tls_sw_context_tx *sw_ctx_tx;

	if (!ctx->priv_ctx_tx) {
		sw_ctx_tx = kzalloc_obj(*sw_ctx_tx);
		if (!sw_ctx_tx)
			return NULL;
	} else {
		sw_ctx_tx = ctx->priv_ctx_tx;
	}

	crypto_init_wait(&sw_ctx_tx->async_wait);
	atomic_set(&sw_ctx_tx->encrypt_pending, 1);
	INIT_LIST_HEAD(&sw_ctx_tx->tx_list);
	INIT_DELAYED_WORK(&sw_ctx_tx->tx_work.work, tx_work_handler);
	sw_ctx_tx->tx_work.sk = sk;

	return sw_ctx_tx;
}

/*
 * Allocate the software RX context, or reuse ctx->priv_ctx_rx when it is
 * already set, and (re)initialise its wait object, pending counter, wait
 * queue and skb queues. Returns NULL when the allocation fails.
 */
static struct tls_sw_context_rx *init_ctx_rx(struct tls_context *ctx)
{
	struct tls_sw_context_rx *sw_ctx_rx;

	if (!ctx->priv_ctx_rx) {
		sw_ctx_rx = kzalloc_obj(*sw_ctx_rx);
		if (!sw_ctx_rx)
			return NULL;
	} else {
		sw_ctx_rx = ctx->priv_ctx_rx;
	}

	crypto_init_wait(&sw_ctx_rx->async_wait);
	atomic_set(&sw_ctx_rx->decrypt_pending, 1);
	init_waitqueue_head(&sw_ctx_rx->wq);
	skb_queue_head_init(&sw_ctx_rx->rx_list);
	skb_queue_head_init(&sw_ctx_rx->async_hold);

	return sw_ctx_rx;
}

/*
 * Fill @prot from the negotiated version in @crypto_info and the sizes in
 * @cipher_desc.
 *
 * Returns 0 on success or -EINVAL when the nonce or AAD size exceeds the
 * compile-time maxima. Note that prot->aad_size and prot->tail_size have
 * already been written when the -EINVAL check runs.
 */
int init_prot_info(struct tls_prot_info *prot,
		   const struct tls_crypto_info *crypto_info,
		   const struct tls_cipher_desc *cipher_desc)
{
	u16 nonce_size = cipher_desc->nonce;

	if (crypto_info->version == TLS_1_3_VERSION) {
		nonce_size = 0;
		prot->aad_size = TLS_HEADER_SIZE;
		prot->tail_size = 1;
	} else {
		prot->aad_size = TLS_AAD_SPACE_SIZE;
		prot->tail_size = 0;
	}

	/* Sanity-check the sizes for stack allocations. */
	if (nonce_size > TLS_MAX_IV_SIZE || prot->aad_size > TLS_MAX_AAD_SIZE)
		return -EINVAL;

	prot->version = crypto_info->version;
	prot->cipher_type = crypto_info->cipher_type;
	prot->prepend_size = TLS_HEADER_SIZE + nonce_size;
	prot->tag_size = cipher_desc->tag;
	prot->overhead_size = prot->prepend_size + prot->tag_size + prot->tail_size;
	prot->iv_size = cipher_desc->iv;
	prot->salt_size = cipher_desc->salt;
	prot->rec_seq_size = cipher_desc->rec_seq;

	return 0;
}

/*
 * Complete an RX rekey: clear key_update_pending and invoke the saved
 * data-ready callback.
 */
static void tls_finish_key_update(struct sock *sk, struct tls_context *tls_ctx)
{
	struct tls_sw_context_rx *ctx = tls_ctx->priv_ctx_rx;

	WRITE_ONCE(ctx->key_update_pending, false);
	/* wake-up pre-existing poll() */
	ctx->saved_data_ready(sk);
}

/*
 * Configure the software crypto state of @sk for one direction.
 *
 * @tx selects the transmit (non-zero) or receive (zero) side. A NULL
 * @new_crypto_info is the initial setup and uses the crypto info already
 * stored in the TLS context; a non-NULL @new_crypto_info is a rekey, and on
 * success it is copied into the context and then zeroed.
 *
 * Returns 0 on success or a negative errno. On an initial-setup failure the
 * private context (and the AEAD, once it was allocated) is freed; a rekey
 * failure frees neither.
 */
int tls_set_sw_offload(struct sock *sk, int tx,
		       struct tls_crypto_info *new_crypto_info)
{
	struct tls_crypto_info *crypto_info, *src_crypto_info;
	struct tls_sw_context_tx *sw_ctx_tx = NULL;
	struct tls_sw_context_rx *sw_ctx_rx = NULL;
	const struct tls_cipher_desc *cipher_desc;
	char *iv, *rec_seq, *key, *salt;
	struct cipher_context *cctx;
	struct tls_prot_info *prot;
	struct crypto_aead **aead;
	struct tls_context *ctx;
	struct crypto_tfm *tfm;
	int rc = 0;

	ctx = tls_get_ctx(sk);
	prot = &ctx->prot_info;

	/* new_crypto_info != NULL means rekey */
	if (!new_crypto_info) {
		if (tx) {
			ctx->priv_ctx_tx = init_ctx_tx(ctx, sk);
			if (!ctx->priv_ctx_tx)
				return -ENOMEM;
		} else {
			ctx->priv_ctx_rx = init_ctx_rx(ctx);
			if (!ctx->priv_ctx_rx)
				return -ENOMEM;
		}
	}

	/* Pick the per-direction state the rest of the function works on */
	if (tx) {
		sw_ctx_tx = ctx->priv_ctx_tx;
		crypto_info = &ctx->crypto_send.info;
		cctx = &ctx->tx;
		aead = &sw_ctx_tx->aead_send;
	} else {
		sw_ctx_rx = ctx->priv_ctx_rx;
		crypto_info = &ctx->crypto_recv.info;
		cctx = &ctx->rx;
		aead = &sw_ctx_rx->aead_recv;
	}

	src_crypto_info = new_crypto_info ?: crypto_info;

	cipher_desc = get_cipher_desc(src_crypto_info->cipher_type);
	if (!cipher_desc) {
		rc = -EINVAL;
		goto free_priv;
	}

	rc = init_prot_info(prot, src_crypto_info, cipher_desc);
	if (rc)
		goto free_priv;

	iv = crypto_info_iv(src_crypto_info, cipher_desc);
	key = crypto_info_key(src_crypto_info, cipher_desc);
	salt = crypto_info_salt(src_crypto_info, cipher_desc);
	rec_seq = crypto_info_rec_seq(src_crypto_info, cipher_desc);

	/* An already allocated AEAD is reused */
	if (!*aead) {
		*aead = crypto_alloc_aead(cipher_desc->cipher_name, 0, 0);
		if (IS_ERR(*aead)) {
			rc = PTR_ERR(*aead);
			*aead = NULL;
			goto free_priv;
		}
	}

	ctx->push_pending_record = tls_sw_push_pending_record;

	/* setkey is the last operation that could fail during a
	 * rekey. if it succeeds, we can start modifying the
	 * context.
	 */
	rc = crypto_aead_setkey(*aead, key, cipher_desc->key);
	if (rc) {
		if (new_crypto_info)
			goto out;
		else
			goto free_aead;
	}

	if (!new_crypto_info) {
		rc = crypto_aead_setauthsize(*aead, prot->tag_size);
		if (rc)
			goto free_aead;
	}

	/* Initial RX setup only: capability flags and the strparser */
	if (!tx && !new_crypto_info) {
		tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv);

		tls_update_rx_zc_capable(ctx);
		sw_ctx_rx->async_capable =
			src_crypto_info->version != TLS_1_3_VERSION &&
			!!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC);

		rc = tls_strp_init(&sw_ctx_rx->strp, sk);
		if (rc)
			goto free_aead;
	}

	/* cctx->iv holds the salt followed by the IV */
	memcpy(cctx->iv, salt, cipher_desc->salt);
	memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv);
	memcpy(cctx->rec_seq, rec_seq, cipher_desc->rec_seq);

	if (new_crypto_info) {
		unsafe_memcpy(crypto_info, new_crypto_info,
			      cipher_desc->crypto_info,
			      /* size was checked in do_tls_setsockopt_conf */);
		memzero_explicit(new_crypto_info, cipher_desc->crypto_info);
		if (!tx)
			tls_finish_key_update(sk, ctx);
	}

	goto out;

free_aead:
	crypto_free_aead(*aead);
	*aead = NULL;
free_priv:
	if (!new_crypto_info) {
		if (tx) {
			kfree(ctx->priv_ctx_tx);
			ctx->priv_ctx_tx = NULL;
		} else {
			kfree(ctx->priv_ctx_rx);
			ctx->priv_ctx_rx = NULL;
		}
	}
out:
	return rc;
}
5 5 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 // SPDX-License-Identifier: GPL-2.0-or-later /* * LAPB release 002 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * History * LAPB 001 Jonathan Naylor Started Coding * LAPB 002 Jonathan Naylor New timer architecture. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/lapb.h> /* * This procedure is passed a buffer descriptor for an iframe. It builds * the rest of the control part of the frame and then writes it out. */ static void lapb_send_iframe(struct lapb_cb *lapb, struct sk_buff *skb, int poll_bit) { unsigned char *frame; if (!skb) return; if (lapb->mode & LAPB_EXTENDED) { frame = skb_push(skb, 2); frame[0] = LAPB_I; frame[0] |= lapb->vs << 1; frame[1] = poll_bit ? LAPB_EPF : 0; frame[1] |= lapb->vr << 1; } else { frame = skb_push(skb, 1); *frame = LAPB_I; *frame |= poll_bit ? 
LAPB_SPF : 0; *frame |= lapb->vr << 5; *frame |= lapb->vs << 1; } lapb_dbg(1, "(%p) S%d TX I(%d) S%d R%d\n", lapb->dev, lapb->state, poll_bit, lapb->vs, lapb->vr); lapb_transmit_buffer(lapb, skb, LAPB_COMMAND); } void lapb_kick(struct lapb_cb *lapb) { struct sk_buff *skb, *skbn; unsigned short modulus, start, end; modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS; start = !skb_peek(&lapb->ack_queue) ? lapb->va : lapb->vs; end = (lapb->va + lapb->window) % modulus; if (!(lapb->condition & LAPB_PEER_RX_BUSY_CONDITION) && start != end && skb_peek(&lapb->write_queue)) { lapb->vs = start; /* * Dequeue the frame and copy it. */ skb = skb_dequeue(&lapb->write_queue); do { skbn = skb_copy(skb, GFP_ATOMIC); if (!skbn) { skb_queue_head(&lapb->write_queue, skb); break; } if (skb->sk) skb_set_owner_w(skbn, skb->sk); /* * Transmit the frame copy. */ lapb_send_iframe(lapb, skbn, LAPB_POLLOFF); lapb->vs = (lapb->vs + 1) % modulus; /* * Requeue the original data frame. */ skb_queue_tail(&lapb->ack_queue, skb); } while (lapb->vs != end && (skb = skb_dequeue(&lapb->write_queue)) != NULL); lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; if (!lapb_t1timer_running(lapb)) lapb_start_t1timer(lapb); } } void lapb_transmit_buffer(struct lapb_cb *lapb, struct sk_buff *skb, int type) { unsigned char *ptr; ptr = skb_push(skb, 1); if (lapb->mode & LAPB_MLP) { if (lapb->mode & LAPB_DCE) { if (type == LAPB_COMMAND) *ptr = LAPB_ADDR_C; if (type == LAPB_RESPONSE) *ptr = LAPB_ADDR_D; } else { if (type == LAPB_COMMAND) *ptr = LAPB_ADDR_D; if (type == LAPB_RESPONSE) *ptr = LAPB_ADDR_C; } } else { if (lapb->mode & LAPB_DCE) { if (type == LAPB_COMMAND) *ptr = LAPB_ADDR_A; if (type == LAPB_RESPONSE) *ptr = LAPB_ADDR_B; } else { if (type == LAPB_COMMAND) *ptr = LAPB_ADDR_B; if (type == LAPB_RESPONSE) *ptr = LAPB_ADDR_A; } } lapb_dbg(2, "(%p) S%d TX %3ph\n", lapb->dev, lapb->state, skb->data); if (!lapb_data_transmit(lapb, skb)) kfree_skb(skb); } void lapb_establish_data_link(struct 
lapb_cb *lapb) { lapb->condition = 0x00; lapb->n2count = 0; if (lapb->mode & LAPB_EXTENDED) { lapb_dbg(1, "(%p) S%d TX SABME(1)\n", lapb->dev, lapb->state); lapb_send_control(lapb, LAPB_SABME, LAPB_POLLON, LAPB_COMMAND); } else { lapb_dbg(1, "(%p) S%d TX SABM(1)\n", lapb->dev, lapb->state); lapb_send_control(lapb, LAPB_SABM, LAPB_POLLON, LAPB_COMMAND); } lapb_start_t1timer(lapb); lapb_stop_t2timer(lapb); } void lapb_enquiry_response(struct lapb_cb *lapb) { lapb_dbg(1, "(%p) S%d TX RR(1) R%d\n", lapb->dev, lapb->state, lapb->vr); lapb_send_control(lapb, LAPB_RR, LAPB_POLLON, LAPB_RESPONSE); lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; } void lapb_timeout_response(struct lapb_cb *lapb) { lapb_dbg(1, "(%p) S%d TX RR(0) R%d\n", lapb->dev, lapb->state, lapb->vr); lapb_send_control(lapb, LAPB_RR, LAPB_POLLOFF, LAPB_RESPONSE); lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; } void lapb_check_iframes_acked(struct lapb_cb *lapb, unsigned short nr) { if (lapb->vs == nr) { lapb_frames_acked(lapb, nr); lapb_stop_t1timer(lapb); lapb->n2count = 0; } else if (lapb->va != nr) { lapb_frames_acked(lapb, nr); lapb_start_t1timer(lapb); } } void lapb_check_need_response(struct lapb_cb *lapb, int type, int pf) { if (type == LAPB_COMMAND && pf) lapb_enquiry_response(lapb); }
1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * TEA, XTEA, and XETA crypto algorithms * * The TEA and Xtended TEA algorithms were developed by David Wheeler * and Roger Needham at the Computer Laboratory of Cambridge University. * * Due to the order of evaluation in XTEA many people have incorrectly * implemented it. XETA (XTEA in the wrong order), exists for * compatibility with these implementations. 
* * Copyright (c) 2004 Aaron Grothe ajgrothe@yahoo.com */ #include <crypto/algapi.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/unaligned.h> #include <linux/types.h> #define TEA_KEY_SIZE 16 #define TEA_BLOCK_SIZE 8 #define TEA_ROUNDS 32 #define TEA_DELTA 0x9e3779b9 #define XTEA_KEY_SIZE 16 #define XTEA_BLOCK_SIZE 8 #define XTEA_ROUNDS 32 #define XTEA_DELTA 0x9e3779b9 struct tea_ctx { u32 KEY[4]; }; struct xtea_ctx { u32 KEY[4]; }; static int tea_setkey(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct tea_ctx *ctx = crypto_tfm_ctx(tfm); ctx->KEY[0] = get_unaligned_le32(&in_key[0]); ctx->KEY[1] = get_unaligned_le32(&in_key[4]); ctx->KEY[2] = get_unaligned_le32(&in_key[8]); ctx->KEY[3] = get_unaligned_le32(&in_key[12]); return 0; } static void tea_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { u32 y, z, n, sum = 0; u32 k0, k1, k2, k3; struct tea_ctx *ctx = crypto_tfm_ctx(tfm); y = get_unaligned_le32(&src[0]); z = get_unaligned_le32(&src[4]); k0 = ctx->KEY[0]; k1 = ctx->KEY[1]; k2 = ctx->KEY[2]; k3 = ctx->KEY[3]; n = TEA_ROUNDS; while (n-- > 0) { sum += TEA_DELTA; y += ((z << 4) + k0) ^ (z + sum) ^ ((z >> 5) + k1); z += ((y << 4) + k2) ^ (y + sum) ^ ((y >> 5) + k3); } put_unaligned_le32(y, &dst[0]); put_unaligned_le32(z, &dst[4]); } static void tea_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { u32 y, z, n, sum; u32 k0, k1, k2, k3; struct tea_ctx *ctx = crypto_tfm_ctx(tfm); y = get_unaligned_le32(&src[0]); z = get_unaligned_le32(&src[4]); k0 = ctx->KEY[0]; k1 = ctx->KEY[1]; k2 = ctx->KEY[2]; k3 = ctx->KEY[3]; sum = TEA_DELTA << 5; n = TEA_ROUNDS; while (n-- > 0) { z -= ((y << 4) + k2) ^ (y + sum) ^ ((y >> 5) + k3); y -= ((z << 4) + k0) ^ (z + sum) ^ ((z >> 5) + k1); sum -= TEA_DELTA; } put_unaligned_le32(y, &dst[0]); put_unaligned_le32(z, &dst[4]); } static int xtea_setkey(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct xtea_ctx *ctx = 
crypto_tfm_ctx(tfm); ctx->KEY[0] = get_unaligned_le32(&in_key[0]); ctx->KEY[1] = get_unaligned_le32(&in_key[4]); ctx->KEY[2] = get_unaligned_le32(&in_key[8]); ctx->KEY[3] = get_unaligned_le32(&in_key[12]); return 0; } static void xtea_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { u32 y, z, sum = 0; u32 limit = XTEA_DELTA * XTEA_ROUNDS; struct xtea_ctx *ctx = crypto_tfm_ctx(tfm); y = get_unaligned_le32(&src[0]); z = get_unaligned_le32(&src[4]); while (sum != limit) { y += ((z << 4 ^ z >> 5) + z) ^ (sum + ctx->KEY[sum&3]); sum += XTEA_DELTA; z += ((y << 4 ^ y >> 5) + y) ^ (sum + ctx->KEY[sum>>11 &3]); } put_unaligned_le32(y, &dst[0]); put_unaligned_le32(z, &dst[4]); } static void xtea_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { u32 y, z, sum; struct tea_ctx *ctx = crypto_tfm_ctx(tfm); y = get_unaligned_le32(&src[0]); z = get_unaligned_le32(&src[4]); sum = XTEA_DELTA * XTEA_ROUNDS; while (sum) { z -= ((y << 4 ^ y >> 5) + y) ^ (sum + ctx->KEY[sum>>11 & 3]); sum -= XTEA_DELTA; y -= ((z << 4 ^ z >> 5) + z) ^ (sum + ctx->KEY[sum & 3]); } put_unaligned_le32(y, &dst[0]); put_unaligned_le32(z, &dst[4]); } static void xeta_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { u32 y, z, sum = 0; u32 limit = XTEA_DELTA * XTEA_ROUNDS; struct xtea_ctx *ctx = crypto_tfm_ctx(tfm); y = get_unaligned_le32(&src[0]); z = get_unaligned_le32(&src[4]); while (sum != limit) { y += (z << 4 ^ z >> 5) + (z ^ sum) + ctx->KEY[sum&3]; sum += XTEA_DELTA; z += (y << 4 ^ y >> 5) + (y ^ sum) + ctx->KEY[sum>>11 &3]; } put_unaligned_le32(y, &dst[0]); put_unaligned_le32(z, &dst[4]); } static void xeta_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { u32 y, z, sum; struct tea_ctx *ctx = crypto_tfm_ctx(tfm); y = get_unaligned_le32(&src[0]); z = get_unaligned_le32(&src[4]); sum = XTEA_DELTA * XTEA_ROUNDS; while (sum) { z -= (y << 4 ^ y >> 5) + (y ^ sum) + ctx->KEY[sum>>11 & 3]; sum -= XTEA_DELTA; y -= (z << 4 ^ z >> 5) + (z ^ sum) + ctx->KEY[sum & 3]; } 
put_unaligned_le32(y, &dst[0]); put_unaligned_le32(z, &dst[4]); } static struct crypto_alg tea_algs[3] = { { .cra_name = "tea", .cra_driver_name = "tea-generic", .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = TEA_BLOCK_SIZE, .cra_ctxsize = sizeof (struct tea_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = TEA_KEY_SIZE, .cia_max_keysize = TEA_KEY_SIZE, .cia_setkey = tea_setkey, .cia_encrypt = tea_encrypt, .cia_decrypt = tea_decrypt } } }, { .cra_name = "xtea", .cra_driver_name = "xtea-generic", .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = XTEA_BLOCK_SIZE, .cra_ctxsize = sizeof (struct xtea_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = XTEA_KEY_SIZE, .cia_max_keysize = XTEA_KEY_SIZE, .cia_setkey = xtea_setkey, .cia_encrypt = xtea_encrypt, .cia_decrypt = xtea_decrypt } } }, { .cra_name = "xeta", .cra_driver_name = "xeta-generic", .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = XTEA_BLOCK_SIZE, .cra_ctxsize = sizeof (struct xtea_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = XTEA_KEY_SIZE, .cia_max_keysize = XTEA_KEY_SIZE, .cia_setkey = xtea_setkey, .cia_encrypt = xeta_encrypt, .cia_decrypt = xeta_decrypt } } } }; static int __init tea_mod_init(void) { return crypto_register_algs(tea_algs, ARRAY_SIZE(tea_algs)); } static void __exit tea_mod_fini(void) { crypto_unregister_algs(tea_algs, ARRAY_SIZE(tea_algs)); } MODULE_ALIAS_CRYPTO("tea"); MODULE_ALIAS_CRYPTO("xtea"); MODULE_ALIAS_CRYPTO("xeta"); module_init(tea_mod_init); module_exit(tea_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TEA, XTEA & XETA Cryptographic Algorithms");
465 26 75 650 301 145 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_EXTEND_H #define _NF_CONNTRACK_EXTEND_H #include <linux/slab.h> #include <net/netfilter/nf_conntrack.h> enum nf_ct_ext_id { NF_CT_EXT_HELPER, #if IS_ENABLED(CONFIG_NF_NAT) NF_CT_EXT_NAT, #endif NF_CT_EXT_SEQADJ, NF_CT_EXT_ACCT, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_CT_EXT_ECACHE, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP NF_CT_EXT_TSTAMP, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT NF_CT_EXT_TIMEOUT, #endif #ifdef CONFIG_NF_CONNTRACK_LABELS NF_CT_EXT_LABELS, #endif #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) NF_CT_EXT_SYNPROXY, #endif #if IS_ENABLED(CONFIG_NET_ACT_CT) NF_CT_EXT_ACT_CT, #endif NF_CT_EXT_NUM, }; /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { u8 offset[NF_CT_EXT_NUM]; u8 len; unsigned int gen_id; char data[] __aligned(8); }; static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id) { return !!ext->offset[id]; } static inline bool nf_ct_ext_exist(const struct nf_conn *ct, u8 id) { return (ct->ext && __nf_ct_ext_exist(ct->ext, id)); } void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id); static inline void *nf_ct_ext_find(const struct nf_conn *ct, u8 id) { struct nf_ct_ext *ext = ct->ext; if (!ext || !__nf_ct_ext_exist(ext, id)) return NULL; if (unlikely(ext->gen_id)) return __nf_ct_ext_find(ext, id); return (void *)ct->ext + ct->ext->offset[id]; } /* Add this type, returns pointer to data or NULL. */ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); /* ext genid. if ext->id != ext_genid, extensions cannot be used * anymore unless conntrack has CONFIRMED bit set. 
*/ extern atomic_t nf_conntrack_ext_genid; void nf_ct_ext_bump_genid(void); #endif /* _NF_CONNTRACK_EXTEND_H */
28 9 33 21 16 35 21 16 35 35 34 35 35 26 35 1 26 35 9 33 29 5 26 26 19 5 14 19 12 20 11 21 21 15 12 12 26 26 26 26 3 15 12 2 2 26 26 21 2 25 26 23 26 12 12 12 12 12 12 12 12 11 2 12 11 3 12 11 1 12 12 20 6 26 26 26 26 51 20 35 11 50 26 25 1 26 25 1 12 22 16 22 22 22 22 22 22 26 26 2 2 22 24 17 29 5 4 63 6 4 10 60 72 72 70 60 6 14 48 4 22 2 21 8 2 22 78 43 44 16 24 7 4 3 3 2 5 4 2 2 25 8 18 15 9 4 9 3 3 8 19 20 20 18 18 18 20 20 18 18 18 12 12 8 6 1 1 4 12 133 5 12 124 5 13 111 3 13 103 12 147 126 8 12 211 184 1 28 212 19 291 22 6 19 324 12 312 16 11 282 12 227 211 14 190 7 25 151 168 2 3 14 39 21 10 62 62 62 45 38 39 39 16 3 3 1 3 3 61 43 17 11 6 12 1 2 2 5 4 4 1 9 1 8 5 5 9 1 8 2 1 1 42 42 42 101 1 6 6 2 2 2 2 10 5 5 10 4 1 3 5 3 8 5 6 8 27 28 14 14 4 15 13 13 2 2 5 4 12 1 8 2 1 4 27 6 26 3 25 4 24 4 20 7 27 12 11 6 7 6 2 108 101 3 4 4 4 101 41 99 99 99 98 93 6 98 18 80 5 77 7 70 8 69 9 1 69 63 8 43 166 167 19 150 29 130 130 11 113 2 4 1 110 49 56 7 63 1 37 12 38 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 13 3 2 1 1 36 16 2 18 6 12 13 3 16 13 12 1 1 2 11 11 1 39 1 2 36 12 1 23 2 1 1 1 1 2 1 1 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 
752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 
1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 
1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 
2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 
2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 
2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 
3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 
3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 // SPDX-License-Identifier: GPL-2.0 /* Connection tracking via netlink socket. Allows for user space * protocol helpers and general trouble making from userspace. 
* * (C) 2001 by Jay Schulist <jschlst@samba.org> * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org> * (C) 2003 by Patrick Mchardy <kaber@trash.net> * (C) 2005-2012 by Pablo Neira Ayuso <pablo@netfilter.org> * * Initial connection tracking via netlink development funded and * generally made possible by Network Robots, Inc. (www.networkrobots.com) * * Further development of this code funded by Astaro AG (http://www.astaro.com) */ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/rculist.h> #include <linux/rculist_nulls.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/security.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netlink.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/siphash.h> #include <linux/netfilter.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_synproxy.h> #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #endif #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include "nf_internals.h" MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("List and change connection tracking table"); struct ctnetlink_list_dump_ctx { unsigned long last_id; unsigned int cpu; bool done; }; static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, const 
struct nf_conntrack_l4proto *l4proto) { int ret = 0; struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum)) goto nla_put_failure; if (likely(l4proto->tuple_to_nlattr)) ret = l4proto->tuple_to_nlattr(skb, tuple); nla_nest_end(skb, nest_parms); return ret; nla_put_failure: return -1; } static int ipv4_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) return -EMSGSIZE; return 0; } static int ipv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) || nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6)) return -EMSGSIZE; return 0; } static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { int ret = 0; struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, CTA_TUPLE_IP); if (!nest_parms) goto nla_put_failure; switch (tuple->src.l3num) { case NFPROTO_IPV4: ret = ipv4_tuple_to_nlattr(skb, tuple); break; case NFPROTO_IPV6: ret = ipv6_tuple_to_nlattr(skb, tuple); break; } nla_nest_end(skb, nest_parms); return ret; nla_put_failure: return -1; } static int ctnetlink_dump_tuples(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { const struct nf_conntrack_l4proto *l4proto; int ret; rcu_read_lock(); ret = ctnetlink_dump_tuples_ip(skb, tuple); if (ret >= 0) { l4proto = nf_ct_l4proto_find(tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); } rcu_read_unlock(); return ret; } static int ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype, const struct nf_conntrack_zone *zone, int dir) { if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir) return 0; if (nla_put_be16(skb, attrtype, htons(zone->id))) goto nla_put_failure; return 0; 
nla_put_failure: return -1; } static int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct, bool skip_zero) { long timeout; if (nf_ct_is_confirmed(ct)) timeout = nf_ct_expires(ct) / HZ; else timeout = ct->timeout / HZ; if (skip_zero && timeout == 0) return 0; if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct, bool destroy) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; int ret; l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (!l4proto->to_nlattr) return 0; nest_proto = nla_nest_start(skb, CTA_PROTOINFO); if (!nest_proto) goto nla_put_failure; ret = l4proto->to_nlattr(skb, nest_proto, ct, destroy); nla_nest_end(skb, nest_proto); return ret; nla_put_failure: return -1; } static int ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_helper; const struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_helper *helper; if (!help) return 0; rcu_read_lock(); helper = rcu_dereference(help->helper); if (!helper) goto out; nest_helper = nla_nest_start(skb, CTA_HELP); if (!nest_helper) goto nla_put_failure; if (nla_put_string(skb, CTA_HELP_NAME, helper->name)) goto nla_put_failure; if (helper->to_nlattr) helper->to_nlattr(skb, ct); nla_nest_end(skb, nest_helper); out: rcu_read_unlock(); return 0; nla_put_failure: rcu_read_unlock(); return -1; } static int dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct, enum ip_conntrack_dir dir, int type) { enum ctattr_type attr = dir ? 
CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; struct nf_conn_counter *counter = acct->counter; struct nlattr *nest_count; u64 pkts, bytes; if (type == IPCTNL_MSG_CT_GET_CTRZERO) { pkts = atomic64_xchg(&counter[dir].packets, 0); bytes = atomic64_xchg(&counter[dir].bytes, 0); } else { pkts = atomic64_read(&counter[dir].packets); bytes = atomic64_read(&counter[dir].bytes); } nest_count = nla_nest_start(skb, attr); if (!nest_count) goto nla_put_failure; if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts), CTA_COUNTERS_PAD) || nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes), CTA_COUNTERS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest_count); return 0; nla_put_failure: return -1; } static int ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type) { struct nf_conn_acct *acct = nf_conn_acct_find(ct); if (!acct) return 0; if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0) return -1; if (dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0) return -1; return 0; } static int ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_count; const struct nf_conn_tstamp *tstamp; tstamp = nf_conn_tstamp_find(ct); if (!tstamp) return 0; nest_count = nla_nest_start(skb, CTA_TIMESTAMP); if (!nest_count) goto nla_put_failure; if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start), CTA_TIMESTAMP_PAD) || (tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP, cpu_to_be64(tstamp->stop), CTA_TIMESTAMP_PAD))) goto nla_put_failure; nla_nest_end(skb, nest_count); return 0; nla_put_failure: return -1; } #ifdef CONFIG_NF_CONNTRACK_MARK static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct, bool dump) { u32 mark = READ_ONCE(ct->mark); if (!mark && !dump) return 0; if (nla_put_be32(skb, CTA_MARK, htonl(mark))) goto nla_put_failure; return 0; nla_put_failure: return -1; } #else #define ctnetlink_dump_mark(a, b, c) (0) #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK static int 
ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_secctx; struct lsm_context ctx; int ret; ret = security_secid_to_secctx(ct->secmark, &ctx); if (ret < 0) return 0; ret = -1; nest_secctx = nla_nest_start(skb, CTA_SECCTX); if (!nest_secctx) goto nla_put_failure; if (nla_put_string(skb, CTA_SECCTX_NAME, ctx.context)) goto nla_put_failure; nla_nest_end(skb, nest_secctx); ret = 0; nla_put_failure: security_release_secctx(&ctx); return ret; } #else #define ctnetlink_dump_secctx(a, b) (0) #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_dump_event_timestamp(struct sk_buff *skb, const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP const struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); if (e) { u64 ts = local64_read(&e->timestamp); if (ts) return nla_put_be64(skb, CTA_TIMESTAMP_EVENT, cpu_to_be64(ts), CTA_TIMESTAMP_PAD); } #endif return 0; } static inline int ctnetlink_label_size(const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); if (!labels) return 0; return nla_total_size(sizeof(labels->bits)); } #endif static int ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); unsigned int i; if (!labels) return 0; i = 0; do { if (labels->bits[i] != 0) return nla_put(skb, CTA_LABELS, sizeof(labels->bits), labels->bits); i++; } while (i < ARRAY_SIZE(labels->bits)); return 0; } #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) static int ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_parms; if (!(ct->status & IPS_EXPECTED)) return 0; nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); return 0; nla_put_failure: return -1; } static int dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj 
*seq, int type) { struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, type); if (!nest_parms) goto nla_put_failure; if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS, htonl(seq->correction_pos)) || nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE, htonl(seq->offset_before)) || nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER, htonl(seq->offset_after))) goto nla_put_failure; nla_nest_end(skb, nest_parms); return 0; nla_put_failure: return -1; } static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *seq; if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj) return 0; spin_lock_bh(&ct->lock); seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1) goto err; seq = &seqadj->seq[IP_CT_DIR_REPLY]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1) goto err; spin_unlock_bh(&ct->lock); return 0; err: spin_unlock_bh(&ct->lock); return -1; } static int ctnetlink_dump_ct_synproxy(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conn_synproxy *synproxy = nfct_synproxy(ct); struct nlattr *nest_parms; if (!synproxy) return 0; nest_parms = nla_nest_start(skb, CTA_SYNPROXY); if (!nest_parms) goto nla_put_failure; if (nla_put_be32(skb, CTA_SYNPROXY_ISN, htonl(synproxy->isn)) || nla_put_be32(skb, CTA_SYNPROXY_ITS, htonl(synproxy->its)) || nla_put_be32(skb, CTA_SYNPROXY_TSOFF, htonl(synproxy->tsoff))) goto nla_put_failure; nla_nest_end(skb, nest_parms); return 0; nla_put_failure: return -1; } static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) { __be32 id = (__force __be32)nf_ct_get_id(ct); if (nla_put_be32(skb, CTA_ID, id)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_USE, htonl(refcount_read(&ct->ct_general.use)))) goto nla_put_failure; return 0; nla_put_failure: return -1; } /* all these functions access 
ct->ext. Caller must either hold a reference * on ct or prevent its deletion by holding either the bucket spinlock or * pcpu dying list lock. */ static int ctnetlink_dump_extinfo(struct sk_buff *skb, struct nf_conn *ct, u32 type) { if (ctnetlink_dump_acct(skb, ct, type) < 0 || ctnetlink_dump_timestamp(skb, ct) < 0 || ctnetlink_dump_helpinfo(skb, ct) < 0 || ctnetlink_dump_labels(skb, ct) < 0 || ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || ctnetlink_dump_ct_synproxy(skb, ct) < 0) return -1; return 0; } static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) { if (ctnetlink_dump_status(skb, ct) < 0 || ctnetlink_dump_mark(skb, ct, true) < 0 || ctnetlink_dump_secctx(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0) return -1; if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && (ctnetlink_dump_timeout(skb, ct, false) < 0 || ctnetlink_dump_protoinfo(skb, ct, false) < 0)) return -1; return 0; } static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nf_conn *ct, bool extinfo, unsigned int flags) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; struct nlattr *nest_parms; unsigned int event; if (portid) flags |= NLM_F_MULTI; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); nlh = nfnl_msg_put(skb, portid, seq, event, flags, nf_ct_l3num(ct), NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_ORIG) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, 
CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_REPL) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_info(skb, ct) < 0) goto nla_put_failure; if (extinfo && ctnetlink_dump_extinfo(skb, ct, type) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static const struct nla_policy cta_ip_nla_policy[CTA_IP_MAX + 1] = { [CTA_IP_V4_SRC] = { .type = NLA_U32 }, [CTA_IP_V4_DST] = { .type = NLA_U32 }, [CTA_IP_V6_SRC] = { .len = sizeof(__be32) * 4 }, [CTA_IP_V6_DST] = { .len = sizeof(__be32) * 4 }, }; #if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS) static size_t ctnetlink_proto_size(const struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; size_t len, len4 = 0; len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1); len *= 3u; /* ORIG, REPLY, MASTER */ l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); len += l4proto->nlattr_size; if (l4proto->nlattr_tuple_size) { len4 = l4proto->nlattr_tuple_size(); len4 *= 3u; /* ORIG, REPLY, MASTER */ } return len + len4; } static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) { if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT)) return 0; return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */ + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */ + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */ ; } static inline int ctnetlink_secctx_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_SECMARK int ret; ret = security_secid_to_secctx(ct->secmark, NULL); if (ret < 0) return 0; return nla_total_size(0) /* CTA_SECCTX */ + nla_total_size(sizeof(char) * ret); /* CTA_SECCTX_NAME */ #else return 0; #endif } static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP if (!nf_ct_ext_exist(ct, 
NF_CT_EXT_TSTAMP))
		return 0;
	/* one nest header plus two 64-bit values */
	return nla_total_size(0) + 2 * nla_total_size_64bit(sizeof(uint64_t));
#else
	return 0;
#endif
}
#endif

#ifdef CONFIG_NF_CONNTRACK_EVENTS
/* Size passed to nlmsg_new() by ctnetlink_conntrack_event() when it
 * allocates the skb for an event message about @ct. Optional parts are
 * only accounted for when the matching config option is enabled.
 */
static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
{
	return NLMSG_ALIGN(sizeof(struct nfgenmsg))
	       + 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
	       + 3 * nla_total_size(0) /* CTA_TUPLE_IP */
	       + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */
	       + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
	       + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
	       + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
	       + ctnetlink_acct_size(ct)
	       + ctnetlink_timestamp_size(ct)
	       + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
	       + nla_total_size(0) /* CTA_PROTOINFO */
	       + nla_total_size(0) /* CTA_HELP */
	       + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
	       + ctnetlink_secctx_size(ct)
#if IS_ENABLED(CONFIG_NF_NAT)
	       + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
	       + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
#endif
#ifdef CONFIG_NF_CONNTRACK_MARK
	       + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
	       + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */
#endif
	       + ctnetlink_proto_size(ct)
	       + ctnetlink_label_size(ct)
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
	       + nla_total_size(sizeof(u64)) /* CTA_TIMESTAMP_EVENT */
#endif
	       ;
}

/* Build and send one netlink notification for item->ct.
 *
 * @events selects the message:
 *  - IPCT_DESTROY: IPCTNL_MSG_CT_DELETE to NFNLGRP_CONNTRACK_DESTROY;
 *  - IPCT_NEW or IPCT_RELATED: IPCTNL_MSG_CT_NEW with
 *    NLM_F_CREATE|NLM_F_EXCL to NFNLGRP_CONNTRACK_NEW;
 *  - any other non-zero mask: IPCTNL_MSG_CT_NEW to
 *    NFNLGRP_CONNTRACK_UPDATE.
 *
 * Returns 0 when @events is 0, when nobody listens and no report was
 * requested, or when nfnetlink_send() returned anything other than
 * -ENOBUFS/-EAGAIN. Returns -ENOBUFS when nfnetlink_send() returned
 * -ENOBUFS or -EAGAIN, or when the message could not be built and
 * nfnetlink_set_err() returned a positive value.
 */
static int
ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
{
	const struct nf_conntrack_zone *zone;
	struct net *net;
	struct nlmsghdr *nlh;
	struct nlattr *nest_parms;
	struct nf_conn *ct = item->ct;
	struct sk_buff *skb;
	unsigned int type;
	unsigned int flags = 0, group;
	int err;

	if (events & (1 << IPCT_DESTROY)) {
		type = IPCTNL_MSG_CT_DELETE;
		group = NFNLGRP_CONNTRACK_DESTROY;
	} else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) {
		type = IPCTNL_MSG_CT_NEW;
		flags = NLM_F_CREATE|NLM_F_EXCL;
		group = NFNLGRP_CONNTRACK_NEW;
	} else if (events) {
		type = IPCTNL_MSG_CT_NEW;
		group = NFNLGRP_CONNTRACK_UPDATE;
	} else
		return 0;

	net = nf_ct_net(ct);
	if (!item->report && !nfnetlink_has_listeners(net, group))
		return 0;

	skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

	/* from here on 'type' holds the combined subsystem/message value */
	type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type);
	nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, nf_ct_l3num(ct),
			   NFNETLINK_V0, 0);
	if (!nlh)
		goto nlmsg_failure;

	zone = nf_ct_zone(ct);

	nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG);
	if (!nest_parms)
		goto nla_put_failure;
	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
		goto nla_put_failure;
	if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
				   NF_CT_ZONE_DIR_ORIG) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest_parms);

	nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY);
	if (!nest_parms)
		goto nla_put_failure;
	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
		goto nla_put_failure;
	if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
				   NF_CT_ZONE_DIR_REPL) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest_parms);

	if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
				   NF_CT_DEFAULT_ZONE_DIR) < 0)
		goto nla_put_failure;

	if (ctnetlink_dump_id(skb, ct) < 0)
		goto nla_put_failure;

	if (ctnetlink_dump_status(skb, ct) < 0)
		goto nla_put_failure;

	if (events & (1 << IPCT_DESTROY)) {
		/* destroy: timeout, counters, timestamps and protoinfo are
		 * always attempted, independent of the other event bits
		 */
		if (ctnetlink_dump_timeout(skb, ct, true) < 0)
			goto nla_put_failure;

		if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
		    ctnetlink_dump_timestamp(skb, ct) < 0 ||
		    ctnetlink_dump_protoinfo(skb, ct, true) < 0)
			goto nla_put_failure;
	} else {
		/* new/update: most attributes are gated on their event bit */
		if (ctnetlink_dump_timeout(skb, ct, false) < 0)
			goto nla_put_failure;

		if (events & (1 << IPCT_PROTOINFO) &&
		    ctnetlink_dump_protoinfo(skb, ct, false) < 0)
			goto nla_put_failure;

		if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
		    && ctnetlink_dump_helpinfo(skb, ct) < 0)
			goto nla_put_failure;

#ifdef CONFIG_NF_CONNTRACK_SECMARK
		if ((events & (1 << IPCT_SECMARK) || ct->secmark)
		    && ctnetlink_dump_secctx(skb, ct) < 0)
			goto nla_put_failure;
#endif
		if (events & (1 << IPCT_LABEL) &&
		    ctnetlink_dump_labels(skb, ct) < 0)
			goto nla_put_failure;

		if (events & (1 << IPCT_RELATED) &&
		    ctnetlink_dump_master(skb, ct) < 0)
			goto nla_put_failure;

		if (events & (1 << IPCT_SEQADJ) &&
		    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
			goto nla_put_failure;

		if (events & (1 << IPCT_SYNPROXY) &&
		    ctnetlink_dump_ct_synproxy(skb, ct) < 0)
			goto nla_put_failure;
	}

#ifdef CONFIG_NF_CONNTRACK_MARK
	if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK)))
		goto nla_put_failure;
#endif

	if (ctnetlink_dump_event_timestamp(skb, ct))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	err = nfnetlink_send(skb, net, item->portid, group, item->report,
			     GFP_ATOMIC);
	if (err == -ENOBUFS || err == -EAGAIN)
		return -ENOBUFS;

	return 0;

	/* error ladder: each label undoes one more step and falls through */
nla_put_failure:
	nlmsg_cancel(skb, nlh);
nlmsg_failure:
	kfree_skb(skb);
errout:
	if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0)
		return -ENOBUFS;

	return 0;
}
#endif /* CONFIG_NF_CONNTRACK_EVENTS */

/* Dump .done callback: release whatever ctnetlink_start() stored in
 * cb->data (a filter or NULL).
 */
static int ctnetlink_done(struct netlink_callback *cb)
{
	kfree(cb->data);
	return 0;
}

/* A value/mask pair; an entry matches when (field & mask) == val. */
struct ctnetlink_filter_u32 {
	u32 val;
	u32 mask;
};

/* Parsed dump/flush filter, see ctnetlink_alloc_filter(). */
struct ctnetlink_filter {
	u8 family;		/* L3 protocol to match, 0 matches all */
	bool zone_filter;	/* set when CTA_ZONE was supplied */

	u_int32_t orig_flags;	/* from CTA_FILTER_ORIG_FLAGS */
	u_int32_t reply_flags;	/* from CTA_FILTER_REPLY_FLAGS */

	struct nf_conntrack_tuple orig;
	struct nf_conntrack_tuple reply;
	struct nf_conntrack_zone zone;

	struct ctnetlink_filter_u32 mark;
	struct ctnetlink_filter_u32 status;
};

/* Both flag words are u32 values restricted to the CTA_FILTER_F_ALL mask. */
static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = {
	[CTA_FILTER_ORIG_FLAGS]		= NLA_POLICY_MASK(NLA_U32, CTA_FILTER_F_ALL),
	[CTA_FILTER_REPLY_FLAGS]	= NLA_POLICY_MASK(NLA_U32, CTA_FILTER_F_ALL),
};

/* Parse the nested CTA_FILTER attribute @attr into filter->orig_flags and
 * filter->reply_flags. A flag word whose attribute is absent is left
 * untouched. Returns 0 or the nla_parse_nested() error.
 */
static int ctnetlink_parse_filter(const struct nlattr *attr,
				  struct ctnetlink_filter *filter)
{
	struct nlattr *tb[CTA_FILTER_MAX + 1];
	int ret = 0;

	ret = nla_parse_nested(tb, CTA_FILTER_MAX, attr, cta_filter_nla_policy,
			       NULL);
	if (ret)
		return ret;

	if (tb[CTA_FILTER_ORIG_FLAGS])
		filter->orig_flags = nla_get_u32(tb[CTA_FILTER_ORIG_FLAGS]);

	if (tb[CTA_FILTER_REPLY_FLAGS])
		filter->reply_flags = nla_get_u32(tb[CTA_FILTER_REPLY_FLAGS]);

	return 0;
}
/* Forward declarations: both are defined further down but are needed by
 * ctnetlink_alloc_filter().
 */
static int ctnetlink_parse_zone(const struct nlattr *attr,
				struct nf_conntrack_zone *zone);
static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
					 struct nf_conntrack_tuple *tuple,
					 u32 type, u_int8_t l3num,
					 struct nf_conntrack_zone *zone,
					 u_int32_t flags);

/* Fill @mark from CTA_MARK / CTA_MARK_MASK.
 *
 * Without CTA_MARK_MASK the mask defaults to all ones. A mask without a
 * value is rejected with -EINVAL. When CONFIG_NF_CONNTRACK_MARK is
 * disabled this is a no-op that returns 0.
 */
static int ctnetlink_filter_parse_mark(struct ctnetlink_filter_u32 *mark,
				       const struct nlattr * const cda[])
{
#ifdef CONFIG_NF_CONNTRACK_MARK
	if (cda[CTA_MARK]) {
		mark->val = ntohl(nla_get_be32(cda[CTA_MARK]));

		if (cda[CTA_MARK_MASK])
			mark->mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
		else
			mark->mask = 0xffffffff;
	} else if (cda[CTA_MARK_MASK]) {
		return -EINVAL;
	}
#endif
	return 0;
}

/* Fill @status from CTA_STATUS / CTA_STATUS_MASK.
 *
 * Without CTA_STATUS_MASK the value doubles as the mask. A resulting mask
 * of 0, or a mask without a value, is rejected with -EINVAL.
 */
static int ctnetlink_filter_parse_status(struct ctnetlink_filter_u32 *status,
					 const struct nlattr * const cda[])
{
	if (cda[CTA_STATUS]) {
		status->val = ntohl(nla_get_be32(cda[CTA_STATUS]));
		if (cda[CTA_STATUS_MASK])
			status->mask = ntohl(nla_get_be32(cda[CTA_STATUS_MASK]));
		else
			status->mask = status->val;

		/* status->val == 0? always true, else always false. */
		if (status->mask == 0)
			return -EINVAL;
	} else if (cda[CTA_STATUS_MASK]) {
		return -EINVAL;
	}

	/* CTA_STATUS is NLA_U32, if this fires UAPI needs to be extended */
	BUILD_BUG_ON(__IPS_MAX_BIT >= 32);
	return 0;
}

/* Allocate a zeroed filter and populate it from the request attributes.
 *
 * Mark, status and zone are always parsed. The tuple filters are parsed
 * only when CTA_FILTER is present, and each direction only when its flag
 * word is non-zero, in which case the matching tuple attribute must be
 * present as well (-EINVAL otherwise).
 *
 * Returns the filter, to be released with kfree(), or an ERR_PTR():
 * -EOPNOTSUPP for mark attributes on kernels without
 * CONFIG_NF_CONNTRACK_MARK, -ENOMEM on allocation failure, otherwise the
 * error of the failing parser.
 */
static struct ctnetlink_filter *
ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
{
	struct ctnetlink_filter *filter;
	int err;

#ifndef CONFIG_NF_CONNTRACK_MARK
	if (cda[CTA_MARK] || cda[CTA_MARK_MASK])
		return ERR_PTR(-EOPNOTSUPP);
#endif

	filter = kzalloc_obj(*filter);
	if (filter == NULL)
		return ERR_PTR(-ENOMEM);

	filter->family = family;

	err = ctnetlink_filter_parse_mark(&filter->mark, cda);
	if (err)
		goto err_filter;

	err = ctnetlink_filter_parse_status(&filter->status, cda);
	if (err)
		goto err_filter;

	if (cda[CTA_ZONE]) {
		err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone);
		if (err < 0)
			goto err_filter;
		filter->zone_filter = true;
	}

	/* no tuple filtering requested: the filter is complete */
	if (!cda[CTA_FILTER])
		return filter;

	err = ctnetlink_parse_filter(cda[CTA_FILTER], filter);
	if (err < 0)
		goto err_filter;

	if (filter->orig_flags) {
		if (!cda[CTA_TUPLE_ORIG]) {
			err = -EINVAL;
			goto err_filter;
		}

		err = ctnetlink_parse_tuple_filter(cda, &filter->orig,
						   CTA_TUPLE_ORIG,
						   filter->family,
						   &filter->zone,
						   filter->orig_flags);
		if (err < 0)
			goto err_filter;
	}

	if (filter->reply_flags) {
		if (!cda[CTA_TUPLE_REPLY]) {
			err = -EINVAL;
			goto err_filter;
		}

		err = ctnetlink_parse_tuple_filter(cda, &filter->reply,
						   CTA_TUPLE_REPLY,
						   filter->family,
						   &filter->zone,
						   filter->reply_flags);
		if (err < 0)
			goto err_filter;
	}

	return filter;

err_filter:
	kfree(filter);

	return ERR_PTR(err);
}

/* True when the request carries anything a filter has to be built for:
 * a non-zero family or one of CTA_MARK, CTA_FILTER, CTA_STATUS, CTA_ZONE.
 */
static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda)
{
	return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS] || cda[CTA_ZONE];
}

/* Dump .start callback. cb->data arrives holding the request's attribute
 * array and is replaced by the allocated filter, or by NULL when no
 * filter is needed. Returns 0 or the ctnetlink_alloc_filter() error.
 */
static int ctnetlink_start(struct netlink_callback *cb)
{
	const struct nlattr * const *cda = cb->data;
	struct ctnetlink_filter *filter = NULL;
	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
	u8 family = nfmsg->nfgen_family;

	if (ctnetlink_needs_filter(family, cda)) {
		filter =
ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); } cb->data = filter; return 0; } static int ctnetlink_filter_match_tuple(struct nf_conntrack_tuple *filter_tuple, struct nf_conntrack_tuple *ct_tuple, u_int32_t flags, int family) { switch (family) { case NFPROTO_IPV4: if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && filter_tuple->src.u3.ip != ct_tuple->src.u3.ip) return 0; if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && filter_tuple->dst.u3.ip != ct_tuple->dst.u3.ip) return 0; break; case NFPROTO_IPV6: if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && !ipv6_addr_cmp(&filter_tuple->src.u3.in6, &ct_tuple->src.u3.in6)) return 0; if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && !ipv6_addr_cmp(&filter_tuple->dst.u3.in6, &ct_tuple->dst.u3.in6)) return 0; break; } if ((flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) && filter_tuple->dst.protonum != ct_tuple->dst.protonum) return 0; switch (ct_tuple->dst.protonum) { case IPPROTO_TCP: case IPPROTO_UDP: if ((flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) && filter_tuple->src.u.tcp.port != ct_tuple->src.u.tcp.port) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) && filter_tuple->dst.u.tcp.port != ct_tuple->dst.u.tcp.port) return 0; break; case IPPROTO_ICMP: if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) && filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) && filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) && filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) return 0; break; case IPPROTO_ICMPV6: if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) && filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) && filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) && filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) return 0; 
break; } return 1; } static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; struct nf_conntrack_tuple *tuple; u32 status; if (filter == NULL) goto out; /* Match entries of a given L3 protocol number. * If it is not specified, ie. l3proto == 0, * then match everything. */ if (filter->family && nf_ct_l3num(ct) != filter->family) goto ignore_entry; if (filter->zone_filter && !nf_ct_zone_equal_any(ct, &filter->zone)) goto ignore_entry; if (filter->orig_flags) { tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL); if (!ctnetlink_filter_match_tuple(&filter->orig, tuple, filter->orig_flags, filter->family)) goto ignore_entry; } if (filter->reply_flags) { tuple = nf_ct_tuple(ct, IP_CT_DIR_REPLY); if (!ctnetlink_filter_match_tuple(&filter->reply, tuple, filter->reply_flags, filter->family)) goto ignore_entry; } #ifdef CONFIG_NF_CONNTRACK_MARK if ((READ_ONCE(ct->mark) & filter->mark.mask) != filter->mark.val) goto ignore_entry; #endif status = (u32)READ_ONCE(ct->status); if ((status & filter->status.mask) != filter->status.val) goto ignore_entry; out: return 1; ignore_entry: return 0; } static unsigned long ctnetlink_get_id(const struct nf_conn *ct) { unsigned long id = nf_ct_get_id(ct); return id ? id : 1; } static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { unsigned int flags = cb->data ? 
NLM_F_DUMP_FILTERED : 0;
	struct net *net = sock_net(skb->sk);
	unsigned long last_id = cb->args[1];
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct nf_conn *nf_ct_evict[8];
	struct nf_conn *ct;
	int res, i;
	spinlock_t *lockp;

	i = 0;

	local_bh_disable();
	for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
restart:
		/* Outside the bucket lock: release the references taken on
		 * expired entries and kill those nf_ct_should_gc() accepts.
		 */
		while (i) {
			i--;
			if (nf_ct_should_gc(nf_ct_evict[i]))
				nf_ct_kill(nf_ct_evict[i]);
			nf_ct_put(nf_ct_evict[i]);
		}

		lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
		nf_conntrack_lock(lockp);
		/* Bound re-checked with the lock held.
		 * NOTE(review): presumably guards against a concurrent hash
		 * resize shrinking the table - confirm.
		 */
		if (cb->args[0] >= nf_conntrack_htable_size) {
			spin_unlock(lockp);
			goto out;
		}
		hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]],
					   hnnode) {
			ct = nf_ct_tuplehash_to_ctrack(h);
			if (nf_ct_is_expired(ct)) {
				/* need to defer nf_ct_kill() until lock is released */
				if (i < ARRAY_SIZE(nf_ct_evict) &&
				    refcount_inc_not_zero(&ct->ct_general.use))
					nf_ct_evict[i++] = ct;
				continue;
			}

			if (!net_eq(net, nf_ct_net(ct)))
				continue;

			/* each conntrack is reported once, via its original
			 * direction hash entry
			 */
			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
				continue;

			/* resuming: skip until the remembered entry is seen */
			if (cb->args[1]) {
				if (ctnetlink_get_id(ct) != last_id)
					continue;
				cb->args[1] = 0;
			}
			if (!ctnetlink_filter_match(ct, cb->data))
				continue;

			res =
			ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
					    cb->nlh->nlmsg_seq,
					    NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
					    ct, true, flags);
			if (res < 0) {
				/* remember this entry for the next call */
				cb->args[1] = ctnetlink_get_id(ct);
				spin_unlock(lockp);
				goto out;
			}
		}
		spin_unlock(lockp);
		/* The remembered entry was not found in this bucket: forget
		 * it and walk the same bucket again from its head.
		 */
		if (cb->args[1]) {
			cb->args[1] = 0;
			goto restart;
		}
	}
out:
	local_bh_enable();
	if (last_id) {
		/* nf ct hash resize happened, now clear the leftover. */
		if (cb->args[1] == last_id)
			cb->args[1] = 0;
	}

	/* same deferred cleanup as at the top of the bucket loop */
	while (i) {
		i--;
		if (nf_ct_should_gc(nf_ct_evict[i]))
			nf_ct_kill(nf_ct_evict[i]);
		nf_ct_put(nf_ct_evict[i]);
	}

	return skb->len;
}

/* Copy the IPv4 addresses selected by @flags from @tb into @t.
 * Returns -EINVAL when a selected address attribute is missing.
 */
static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
				struct nf_conntrack_tuple *t,
				u_int32_t flags)
{
	if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) {
		if (!tb[CTA_IP_V4_SRC])
			return -EINVAL;

		t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
	}

	if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) {
		if (!tb[CTA_IP_V4_DST])
			return -EINVAL;

		t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
	}

	return 0;
}

/* Copy the IPv6 addresses selected by @flags from @tb into @t.
 * Returns -EINVAL when a selected address attribute is missing.
 */
static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
				struct nf_conntrack_tuple *t,
				u_int32_t flags)
{
	if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) {
		if (!tb[CTA_IP_V6_SRC])
			return -EINVAL;

		t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
	}

	if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) {
		if (!tb[CTA_IP_V6_DST])
			return -EINVAL;

		t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
	}

	return 0;
}

/* Parse the nested address attribute @attr into @tuple, dispatching on
 * tuple->src.l3num, which the caller must have set. For an l3num other
 * than IPv4/IPv6 nothing is copied and 0 is returned.
 */
static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
				    struct nf_conntrack_tuple *tuple,
				    u_int32_t flags)
{
	struct nlattr *tb[CTA_IP_MAX+1];
	int ret = 0;

	ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr,
					  cta_ip_nla_policy, NULL);
	if (ret < 0)
		return ret;

	switch (tuple->src.l3num) {
	case NFPROTO_IPV4:
		ret = ipv4_nlattr_to_tuple(tb, tuple, flags);
		break;
	case NFPROTO_IPV6:
		ret = ipv6_nlattr_to_tuple(tb, tuple, flags);
		break;
	}

	return ret;
}

/* Only the protocol number is validated here; the per-protocol attributes
 * are validated against the L4 tracker's own policy in
 * ctnetlink_parse_tuple_proto().
 */
static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = {
	[CTA_PROTO_NUM]	= { .type = NLA_U8 },
};

/* Parse the nested protocol attribute @attr into @tuple.
 *
 * Does nothing (returns 0) unless @flags selects CTA_PROTO_NUM; returns
 * -EINVAL when it is selected but the attribute is missing. The remaining
 * attributes are handed to the L4 tracker's nlattr_to_tuple() hook, if it
 * has one.
 */
static int ctnetlink_parse_tuple_proto(struct nlattr *attr,
				       struct nf_conntrack_tuple *tuple,
				       u_int32_t flags)
{
	const struct nf_conntrack_l4proto *l4proto;
	struct nlattr *tb[CTA_PROTO_MAX+1];
	int ret = 0;

	ret = nla_parse_nested_deprecated(tb, CTA_PROTO_MAX, attr,
					  proto_nla_policy, NULL);
	if (ret < 0)
		return ret;

	if (!(flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)))
		return 0;

	if (!tb[CTA_PROTO_NUM])
		return -EINVAL;

	tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]);

	rcu_read_lock();
l4proto = nf_ct_l4proto_find(tuple->dst.protonum);

	/* validate against the tracker's policy before letting it parse */
	if (likely(l4proto->nlattr_to_tuple)) {
		ret = nla_validate_nested_deprecated(attr, CTA_PROTO_MAX,
						     l4proto->nla_policy,
						     NULL);
		if (ret == 0)
			ret = l4proto->nlattr_to_tuple(tb, tuple, flags);
	}

	rcu_read_unlock();

	return ret;
}

/* Initialise @zone to the default id and direction, then take the id from
 * @attr (a big-endian u16) if one is given. Without
 * CONFIG_NF_CONNTRACK_ZONES any @attr is rejected with -EOPNOTSUPP.
 * @attr may be NULL.
 */
static int
ctnetlink_parse_zone(const struct nlattr *attr,
		     struct nf_conntrack_zone *zone)
{
	nf_ct_zone_init(zone, NF_CT_DEFAULT_ZONE_ID,
			NF_CT_DEFAULT_ZONE_DIR, 0);
#ifdef CONFIG_NF_CONNTRACK_ZONES
	if (attr)
		zone->id = ntohs(nla_get_be16(attr));
#else
	if (attr)
		return -EOPNOTSUPP;
#endif
	return 0;
}

/* Parse a per-tuple zone attribute into @zone and set its direction from
 * @type (CTA_TUPLE_REPLY selects the reply direction, anything else the
 * original one). Returns -EINVAL if @zone already holds a non-default id.
 */
static int
ctnetlink_parse_tuple_zone(struct nlattr *attr, enum ctattr_type type,
			   struct nf_conntrack_zone *zone)
{
	int ret;

	if (zone->id != NF_CT_DEFAULT_ZONE_ID)
		return -EINVAL;

	ret = ctnetlink_parse_zone(attr, zone);
	if (ret < 0)
		return ret;

	if (type == CTA_TUPLE_REPLY)
		zone->dir = NF_CT_ZONE_DIR_REPL;
	else
		zone->dir = NF_CT_ZONE_DIR_ORIG;

	return 0;
}

/* Policy for the attributes nested inside a tuple attribute. */
static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
	[CTA_TUPLE_IP]		= { .type = NLA_NESTED },
	[CTA_TUPLE_PROTO]	= { .type = NLA_NESTED },
	[CTA_TUPLE_ZONE]	= { .type = NLA_U16 },
};

/* All filter flags that refer to per-protocol fields (ports, ICMP and
 * ICMPv6 type/code/id).
 */
#define CTA_FILTER_F_ALL_CTA_PROTO \
  (CTA_FILTER_F_CTA_PROTO_SRC_PORT | \
   CTA_FILTER_F_CTA_PROTO_DST_PORT | \
   CTA_FILTER_F_CTA_PROTO_ICMP_TYPE | \
   CTA_FILTER_F_CTA_PROTO_ICMP_CODE | \
   CTA_FILTER_F_CTA_PROTO_ICMP_ID | \
   CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \
   CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \
   CTA_FILTER_F_CTA_PROTO_ICMPV6_ID)

/* Parse the nested tuple attribute cda[@type] into @tuple, taking only
 * the parts selected by @flags. @tuple is zeroed first.
 *
 * Returns -EOPNOTSUPP when @l3num is neither IPv4 nor IPv6, and -EINVAL
 * when a selected part has no attribute, when per-protocol flags are set
 * without the CTA_PROTO_NUM flag, or when a tuple zone is present but
 * @zone is NULL. Errors of the nested parsers are passed through.
 */
static int
ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
			      struct nf_conntrack_tuple *tuple, u32 type,
			      u_int8_t l3num, struct nf_conntrack_zone *zone,
			      u_int32_t flags)
{
	struct nlattr *tb[CTA_TUPLE_MAX+1];
	int err;

	memset(tuple, 0, sizeof(*tuple));

	err = nla_parse_nested_deprecated(tb, CTA_TUPLE_MAX, cda[type],
					  tuple_nla_policy, NULL);
	if (err < 0)
		return err;

	if (l3num != NFPROTO_IPV4 && l3num != NFPROTO_IPV6)
		return -EOPNOTSUPP;
	tuple->src.l3num = l3num;

	if (flags & CTA_FILTER_FLAG(CTA_IP_DST) ||
	    flags & CTA_FILTER_FLAG(CTA_IP_SRC)) {
		if (!tb[CTA_TUPLE_IP])
			return -EINVAL;

		err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple, flags);
		if (err < 0)
			return err;
	}

	if (flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) {
		if (!tb[CTA_TUPLE_PROTO])
			return -EINVAL;

		err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple, flags);
		if (err < 0)
			return err;
	} else if (flags & CTA_FILTER_FLAG(ALL_CTA_PROTO)) {
		/* Can't manage proto flags without a protonum */
		return -EINVAL;
	}

	/* a missing CTA_TUPLE_ZONE is not an error, it is simply skipped */
	if ((flags & CTA_FILTER_FLAG(CTA_TUPLE_ZONE)) && tb[CTA_TUPLE_ZONE]) {
		if (!zone)
			return -EINVAL;

		err = ctnetlink_parse_tuple_zone(tb[CTA_TUPLE_ZONE],
						 type, zone);
		if (err < 0)
			return err;
	}

	/* orig and expect tuples get DIR_ORIGINAL */
	if (type == CTA_TUPLE_REPLY)
		tuple->dst.dir = IP_CT_DIR_REPLY;
	else
		tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return 0;
}

/* Parse a complete tuple: ctnetlink_parse_tuple_filter() with every
 * filter flag set.
 */
static int
ctnetlink_parse_tuple(const struct nlattr * const cda[],
		      struct nf_conntrack_tuple *tuple, u32 type,
		      u_int8_t l3num, struct nf_conntrack_zone *zone)
{
	return ctnetlink_parse_tuple_filter(cda, tuple, type, l3num, zone,
					    CTA_FILTER_FLAG(ALL));
}

/* The helper name is a NUL-terminated string of at most
 * NF_CT_HELPER_NAME_LEN - 1 characters.
 */
static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
	[CTA_HELP_NAME]		= { .type = NLA_NUL_STRING,
				    .len = NF_CT_HELPER_NAME_LEN - 1 },
};

/* Parse the nested CTA_HELP attribute @attr.
 *
 * On success *@helper_name points into the attribute payload (no copy is
 * made). *@helpinfo is written only when CTA_HELP_INFO is present, so the
 * caller has to initialise it. Returns -EINVAL without CTA_HELP_NAME.
 */
static int ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
				struct nlattr **helpinfo)
{
	int err;
	struct nlattr *tb[CTA_HELP_MAX+1];

	err = nla_parse_nested_deprecated(tb, CTA_HELP_MAX, attr,
					  help_nla_policy, NULL);
	if (err < 0)
		return err;

	if (!tb[CTA_HELP_NAME])
		return -EINVAL;

	*helper_name = nla_data(tb[CTA_HELP_NAME]);

	if (tb[CTA_HELP_INFO])
		*helpinfo = tb[CTA_HELP_INFO];

	return 0;
}

/* Policy for the top-level conntrack attributes. */
static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
	[CTA_TUPLE_ORIG]	= { .type = NLA_NESTED },
	[CTA_TUPLE_REPLY]	= { .type = NLA_NESTED },
	[CTA_STATUS]		= { .type = NLA_U32 },
	[CTA_PROTOINFO]		= { .type = NLA_NESTED },
	[CTA_HELP]		= { .type = NLA_NESTED },
	[CTA_NAT_SRC]		= { .type = NLA_NESTED },
	[CTA_TIMEOUT]		= { .type = NLA_U32 },
	[CTA_MARK]		= { .type =
NLA_U32 },
	[CTA_ID]		= { .type = NLA_U32 },
	[CTA_NAT_DST]		= { .type = NLA_NESTED },
	[CTA_TUPLE_MASTER]	= { .type = NLA_NESTED },
	[CTA_NAT_SEQ_ADJ_ORIG]  = { .type = NLA_NESTED },
	[CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED },
	[CTA_ZONE]		= { .type = NLA_U16 },
	[CTA_MARK_MASK]		= { .type = NLA_U32 },
	[CTA_LABELS]		= { .type = NLA_BINARY,
				    .len = NF_CT_LABELS_MAX_SIZE },
	[CTA_LABELS_MASK]	= { .type = NLA_BINARY,
				    .len = NF_CT_LABELS_MAX_SIZE },
	[CTA_FILTER]		= { .type = NLA_NESTED },
	[CTA_STATUS_MASK]	= { .type = NLA_U32 },
	/* NLA_REJECT: this attribute is not accepted from userspace */
	[CTA_TIMESTAMP_EVENT]	= { .type = NLA_REJECT },
};

/* Iterator callback for a flush: returns the ctnetlink_filter_match()
 * verdict for @ct (1 for every entry when @data is NULL).
 */
static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
{
	return ctnetlink_filter_match(ct, data);
}

/* Run nf_ct_iterate_cleanup_net() over @net with ctnetlink_flush_iterate()
 * as the callback. A filter is built only when ctnetlink_needs_filter()
 * says so and is freed afterwards. Returns 0, or the filter allocation
 * error.
 */
static int ctnetlink_flush_conntrack(struct net *net,
				     const struct nlattr * const cda[],
				     u32 portid, int report, u8 family)
{
	struct ctnetlink_filter *filter = NULL;
	struct nf_ct_iter_data iter = {
		.net		= net,
		.portid		= portid,
		.report		= report,
	};

	if (ctnetlink_needs_filter(family, cda)) {
		filter = ctnetlink_alloc_filter(cda, family);
		if (IS_ERR(filter))
			return PTR_ERR(filter);

		iter.data = filter;
	}

	nf_ct_iterate_cleanup_net(ctnetlink_flush_iterate, &iter);
	kfree(filter);

	return 0;
}

/* Handler for a conntrack delete request.
 *
 * With CTA_TUPLE_ORIG or CTA_TUPLE_REPLY and no CTA_FILTER, a single
 * entry is looked up and deleted; if CTA_ID is given it must equal the
 * entry's id. In every other case the request is treated as a flush.
 *
 * Returns 0 on success, -ENOENT when the entry does not exist or its id
 * differs, or the error of the parser / flush.
 */
static int ctnetlink_del_conntrack(struct sk_buff *skb,
				   const struct nfnl_info *info,
				   const struct nlattr * const cda[])
{
	u8 family = info->nfmsg->nfgen_family;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_zone zone;
	struct nf_conn *ct;
	int err;

	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
	if (err < 0)
		return err;

	if (cda[CTA_TUPLE_ORIG] && !cda[CTA_FILTER])
		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
					    family, &zone);
	else if (cda[CTA_TUPLE_REPLY] && !cda[CTA_FILTER])
		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
					    family, &zone);
	else {
		/* The request's family restricts the flush only when the
		 * nfgenmsg version is non-zero or CTA_FILTER is present;
		 * otherwise AF_UNSPEC is passed on.
		 */
		u8 u3 = info->nfmsg->version || cda[CTA_FILTER] ? family : AF_UNSPEC;

		return ctnetlink_flush_conntrack(info->net, cda,
						 NETLINK_CB(skb).portid,
						 nlmsg_report(info->nlh), u3);
	}

	if (err < 0)
		return err;

	/* on success this returns with a reference held on the conntrack */
	h = nf_conntrack_find_get(info->net, &zone, &tuple);
	if (!h)
		return -ENOENT;

	ct = nf_ct_tuplehash_to_ctrack(h);

	if (cda[CTA_ID]) {
		__be32 id = nla_get_be32(cda[CTA_ID]);

		if (id != (__force __be32)nf_ct_get_id(ct)) {
			nf_ct_put(ct);
			return -ENOENT;
		}
	}

	nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh));
	nf_ct_put(ct);

	return 0;
}

/* Handler for a conntrack get request.
 *
 * With NLM_F_DUMP a table dump is started, handing the attribute array to
 * ctnetlink_start() through .data. Otherwise the entry named by
 * CTA_TUPLE_ORIG or CTA_TUPLE_REPLY is looked up and sent back to the
 * requester in a freshly allocated skb.
 *
 * Returns -EINVAL without a tuple, -ENOENT when no entry matches and
 * -ENOMEM when the reply cannot be allocated or built.
 */
static int ctnetlink_get_conntrack(struct sk_buff *skb,
				   const struct nfnl_info *info,
				   const struct nlattr * const cda[])
{
	u_int8_t u3 = info->nfmsg->nfgen_family;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_zone zone;
	struct sk_buff *skb2;
	struct nf_conn *ct;
	int err;

	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
		struct netlink_dump_control c = {
			.start = ctnetlink_start,
			.dump = ctnetlink_dump_table,
			.done = ctnetlink_done,
			.data = (void *)cda,
		};

		return netlink_dump_start(info->sk, skb, info->nlh, &c);
	}

	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
	if (err < 0)
		return err;

	if (cda[CTA_TUPLE_ORIG])
		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
					    u3, &zone);
	else if (cda[CTA_TUPLE_REPLY])
		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
					    u3, &zone);
	else
		return -EINVAL;

	if (err < 0)
		return err;

	h = nf_conntrack_find_get(info->net, &zone, &tuple);
	if (!h)
		return -ENOENT;

	ct = nf_ct_tuplehash_to_ctrack(h);

	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!skb2) {
		nf_ct_put(ct);
		return -ENOMEM;
	}

	err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid,
				  info->nlh->nlmsg_seq,
				  NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct,
				  true, 0);
	/* the reference from the lookup is no longer needed */
	nf_ct_put(ct);
	if (err <= 0) {
		kfree_skb(skb2);
		return -ENOMEM;
	}

	return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}

#ifdef CONFIG_NF_CONNTRACK_EVENTS
/* Emit one entry of a list dump, honouring the family in the request
 * header and the resume id kept in the dump context. @dying is passed to
 * ctnetlink_fill_info() as its extinfo argument.
 *
 * Returns 0 when the entry is skipped, otherwise the
 * ctnetlink_fill_info() result; on failure the entry's id is stored in
 * the context so the next call can resume from it.
 */
static int ctnetlink_dump_one_entry(struct sk_buff *skb,
				    struct netlink_callback *cb,
				    struct nf_conn *ct,
				    bool dying)
{
	struct
ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
	u8 l3proto = nfmsg->nfgen_family;
	int res;

	/* a family of 0 in the request matches every entry */
	if (l3proto && nf_ct_l3num(ct) != l3proto)
		return 0;

	if (ctx->last_id) {
		if (ctnetlink_get_id(ct) != ctx->last_id)
			return 0;

		ctx->last_id = 0;
	}

	/* We can't dump extension info for the unconfirmed
	 * list because unconfirmed conntracks can have
	 * ct->ext reallocated (and thus freed).
	 *
	 * In the dying list case ct->ext can't be free'd
	 * until after we drop pcpu->lock.
	 */
	res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq,
				  NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
				  ct, dying, 0);
	if (res < 0)
		ctx->last_id = ctnetlink_get_id(ct);

	return res;
}
#endif

/* Dump callback for the unconfirmed list: emits nothing and returns 0. */
static int
ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
{
	return 0;
}

/* Dump callback for the per-netns dying list.
 *
 * The list is walked under ecache_net->dying_lock. When an entry does not
 * fit, the walk stops and the next call skips ahead to that entry using
 * the id ctnetlink_dump_one_entry() stored in the context. Once the whole
 * list has been walked ctx->done is set and later calls return 0.
 * Without CONFIG_NF_CONNTRACK_EVENTS nothing is dumped.
 */
static int
ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	const struct net *net = sock_net(skb->sk);
	struct nf_conntrack_net_ecache *ecache_net;
	unsigned long last_id = ctx->last_id;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
#endif

	if (ctx->done)
		return 0;

	/* the resume id now lives in the local last_id (if compiled in) */
	ctx->last_id = 0;

#ifdef CONFIG_NF_CONNTRACK_EVENTS
	ecache_net = nf_conn_pernet_ecache(net);
	spin_lock_bh(&ecache_net->dying_lock);

	hlist_nulls_for_each_entry(h, n, &ecache_net->dying_list, hnnode) {
		struct nf_conn *ct;
		int res;

		ct = nf_ct_tuplehash_to_ctrack(h);
		if (last_id && last_id != ctnetlink_get_id(ct))
			continue;

		res = ctnetlink_dump_one_entry(skb, cb, ct, true);
		if (res < 0) {
			spin_unlock_bh(&ecache_net->dying_lock);
			return skb->len;
		}

		last_id = 0;
	}

	spin_unlock_bh(&ecache_net->dying_lock);
#endif
	ctx->done = true;

	return skb->len;
}

/* Request handler for the dying list: only NLM_F_DUMP requests are
 * supported, anything else returns -EOPNOTSUPP.
 */
static int ctnetlink_get_ct_dying(struct sk_buff *skb,
				  const struct nfnl_info *info,
				  const struct nlattr * const cda[])
{
	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
		struct netlink_dump_control c = {
			.dump = ctnetlink_dump_dying,
		};
		return netlink_dump_start(info->sk, skb, info->nlh, &c);
	}

	return -EOPNOTSUPP;
}

/* Request handler for the unconfirmed list: only NLM_F_DUMP requests are
 * supported, anything else returns -EOPNOTSUPP.
 */
static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
					const struct nfnl_info *info,
					const struct nlattr * const cda[])
{
	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
		struct netlink_dump_control c = {
			.dump = ctnetlink_dump_unconfirmed,
		};
		return netlink_dump_start(info->sk, skb, info->nlh, &c);
	}

	return -EOPNOTSUPP;
}

#if IS_ENABLED(CONFIG_NF_NAT)
/* Hand a NAT setup attribute to the registered NAT hook.
 *
 * Annotated __must_hold(RCU). With CONFIG_MODULES, if the hook is missing
 * or asks for a retry with -EAGAIN, the RCU read lock and the ctnetlink
 * subsystem lock are dropped around request_module() and re-taken, in the
 * reverse order, before returning on every path.
 *
 * Returns the hook's result. -EAGAIN is returned after a module was
 * loaded successfully (NOTE(review): presumably so that the caller
 * retries - confirm). -EOPNOTSUPP is returned when NAT support cannot be
 * made available.
 */
static int
ctnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
	__must_hold(RCU)
{
	const struct nf_nat_hook *nat_hook;
	int err;

	nat_hook = rcu_dereference(nf_nat_hook);
	if (!nat_hook) {
#ifdef CONFIG_MODULES
		rcu_read_unlock();
		nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
		if (request_module("nf-nat") < 0) {
			nfnl_lock(NFNL_SUBSYS_CTNETLINK);
			rcu_read_lock();
			return -EOPNOTSUPP;
		}
		nfnl_lock(NFNL_SUBSYS_CTNETLINK);
		rcu_read_lock();
		nat_hook = rcu_dereference(nf_nat_hook);
		if (nat_hook)
			return -EAGAIN;
#endif
		return -EOPNOTSUPP;
	}

	err = nat_hook->parse_nat_setup(ct, manip, attr);
	if (err == -EAGAIN) {
#ifdef CONFIG_MODULES
		rcu_read_unlock();
		nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
		if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) {
			nfnl_lock(NFNL_SUBSYS_CTNETLINK);
			rcu_read_lock();
			return -EOPNOTSUPP;
		}
		nfnl_lock(NFNL_SUBSYS_CTNETLINK);
		rcu_read_lock();
		/* err is still -EAGAIN here */
#else
		err = -EOPNOTSUPP;
#endif
	}
	return err;
}
#endif

/* Apply the CTA_STATUS value (converted to host byte order) to @ct.
 * The caller must have checked that cda[CTA_STATUS] is present.
 */
static int
ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
{
	return nf_ct_change_status_common(ct, ntohl(nla_get_be32(cda[CTA_STATUS])));
}

/* Configure NAT for @ct from CTA_NAT_DST and CTA_NAT_SRC, destination
 * first. Returns 0 when neither attribute is present. Without
 * CONFIG_NF_NAT any NAT attribute yields -EOPNOTSUPP.
 */
static int
ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
{
#if IS_ENABLED(CONFIG_NF_NAT)
	int ret;

	if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
		return 0;

	ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST,
					cda[CTA_NAT_DST]);
	if (ret < 0)
		return ret;

	return ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC,
					 cda[CTA_NAT_SRC]);
#else
	if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
		return 0;
	return -EOPNOTSUPP;
#endif
}
/*
 * Handle CTA_HELP for an already existing conntrack.
 *
 * Returns:
 *   a negative error from ctnetlink_parse_help();
 *   for a conntrack with a master: 0 if the named helper is the one already
 *     attached, -EBUSY otherwise;
 *   0 after detaching the helper when the name is the empty string;
 *   -EOPNOTSUPP if the named helper is not found or ct has no helper
 *     extension;
 *   -EBUSY if a different helper is attached;
 *   0 if the same helper is attached (its from_nlattr callback, when set,
 *     is invoked with the helper info attribute).
 */
static int ctnetlink_change_helper(struct nf_conn *ct,
				   const struct nlattr * const cda[])
{
	struct nf_conntrack_helper *helper;
	struct nf_conn_help *help = nfct_help(ct);
	char *helpname = NULL;
	struct nlattr *helpinfo = NULL;
	int err;

	err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
	if (err < 0)
		return err;

	/* don't change helper of sibling connections */
	if (ct->master) {
		/* If we try to change the helper to the same thing twice,
		 * treat the second attempt as a no-op instead of returning
		 * an error.
		 */
		err = -EBUSY;
		if (help) {
			rcu_read_lock();
			helper = rcu_dereference(help->helper);
			if (helper && !strcmp(helper->name, helpname))
				err = 0;
			rcu_read_unlock();
		}

		return err;
	}

	/* an empty helper name means "remove the helper" */
	if (!strcmp(helpname, "")) {
		if (help && help->helper) {
			/* we had a helper before ... */
			nf_ct_remove_expectations(ct);
			RCU_INIT_POINTER(help->helper, NULL);
		}

		return 0;
	}

	rcu_read_lock();
	helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
					    nf_ct_protonum(ct));
	if (helper == NULL) {
		rcu_read_unlock();
		return -EOPNOTSUPP;
	}

	if (help) {
		if (rcu_access_pointer(help->helper) == helper) {
			/* update private helper data if allowed. */
			if (helper->from_nlattr)
				helper->from_nlattr(helpinfo, ct);
			err = 0;
		} else
			err = -EBUSY;
	} else {
		/* we cannot set a helper for an existing conntrack */
		err = -EOPNOTSUPP;
	}

	rcu_read_unlock();
	return err;
}

/*
 * Apply CTA_TIMEOUT: the big-endian u32 value is converted to host order,
 * widened to u64 and multiplied by HZ before being handed to
 * __nf_ct_change_timeout().
 */
static int ctnetlink_change_timeout(struct nf_conn *ct,
				    const struct nlattr * const cda[])
{
	return __nf_ct_change_timeout(ct, (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ);
}

#if defined(CONFIG_NF_CONNTRACK_MARK)
/*
 * Update ct->mark from CTA_MARK and the optional CTA_MARK_MASK.
 *
 * The local mask is the complement of CTA_MARK_MASK (0 when the attribute
 * is absent), so the new mark is (old & ~CTA_MARK_MASK) ^ CTA_MARK; with no
 * mask attribute this is simply CTA_MARK. The store is skipped when the
 * value would not change.
 */
static void ctnetlink_change_mark(struct nf_conn *ct,
				  const struct nlattr * const cda[])
{
	u32 mark, newmark, mask = 0;

	if (cda[CTA_MARK_MASK])
		mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK]));

	mark = ntohl(nla_get_be32(cda[CTA_MARK]));
	newmark = (READ_ONCE(ct->mark) & mask) ^ mark;
	if (newmark != READ_ONCE(ct->mark))
		WRITE_ONCE(ct->mark, newmark);
}
#endif

/* Validation policy for the attributes nested inside CTA_PROTOINFO. */
static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
	[CTA_PROTOINFO_TCP]	= { .type = NLA_NESTED },
	[CTA_PROTOINFO_SCTP]	= { .type = NLA_NESTED },
};

/*
 * Parse the nested CTA_PROTOINFO attribute and, if the conntrack's l4
 * protocol provides a from_nlattr callback, let it apply the parsed
 * attributes. Returns the parse error, the callback's result, or 0 when
 * there is no callback.
 */
static int ctnetlink_change_protoinfo(struct nf_conn *ct,
				      const struct nlattr * const cda[])
{
	const struct nlattr *attr = cda[CTA_PROTOINFO];
	const struct nf_conntrack_l4proto *l4proto;
	struct nlattr *tb[CTA_PROTOINFO_MAX+1];
	int err = 0;

	err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_MAX, attr,
					  protoinfo_policy, NULL);
	if (err < 0)
		return err;

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (l4proto->from_nlattr)
		err = l4proto->from_nlattr(tb, ct);

	return err;
}

/* Validation policy for the attributes nested inside CTA_SEQ_ADJ_*. */
static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = {
	[CTA_SEQADJ_CORRECTION_POS]	= { .type = NLA_U32 },
	[CTA_SEQADJ_OFFSET_BEFORE]	= { .type = NLA_U32 },
	[CTA_SEQADJ_OFFSET_AFTER]	= { .type = NLA_U32 },
};

/*
 * Fill one struct nf_ct_seqadj from a nested sequence-adjustment attribute.
 * The three attributes are checked and stored in order, so fields assigned
 * before a missing attribute is detected keep their new value even though
 * -EINVAL is returned.
 */
static int change_seq_adj(struct nf_ct_seqadj *seq,
			  const struct nlattr * const attr)
{
	int err;
	struct nlattr *cda[CTA_SEQADJ_MAX+1];

	err = nla_parse_nested_deprecated(cda, CTA_SEQADJ_MAX, attr,
					  seqadj_policy, NULL);
	if (err < 0)
		return err;

	if (!cda[CTA_SEQADJ_CORRECTION_POS])
		return -EINVAL;

	seq->correction_pos =
ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS]));

	if (!cda[CTA_SEQADJ_OFFSET_BEFORE])
		return -EINVAL;

	seq->offset_before =
		ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE]));

	if (!cda[CTA_SEQADJ_OFFSET_AFTER])
		return -EINVAL;

	seq->offset_after =
		ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER]));

	return 0;
}

/*
 * Apply CTA_SEQ_ADJ_ORIG / CTA_SEQ_ADJ_REPLY to the conntrack's seqadj
 * extension under ct->lock. IPS_SEQ_ADJUST_BIT is set after each direction
 * that was parsed successfully.
 *
 * Returns 0 on success, and also 0 when ct has no seqadj extension (the
 * attributes are then ignored); otherwise the error from change_seq_adj().
 */
static int
ctnetlink_change_seq_adj(struct nf_conn *ct,
			 const struct nlattr * const cda[])
{
	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
	int ret = 0;

	if (!seqadj)
		return 0;

	spin_lock_bh(&ct->lock);
	if (cda[CTA_SEQ_ADJ_ORIG]) {
		ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL],
				     cda[CTA_SEQ_ADJ_ORIG]);
		if (ret < 0)
			goto err;

		set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
	}

	if (cda[CTA_SEQ_ADJ_REPLY]) {
		ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY],
				     cda[CTA_SEQ_ADJ_REPLY]);
		if (ret < 0)
			goto err;

		set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
	}

	spin_unlock_bh(&ct->lock);
	return 0;
err:
	spin_unlock_bh(&ct->lock);
	return ret;
}

/* Validation policy for the attributes nested inside CTA_SYNPROXY. */
static const struct nla_policy synproxy_policy[CTA_SYNPROXY_MAX + 1] = {
	[CTA_SYNPROXY_ISN]	= { .type = NLA_U32 },
	[CTA_SYNPROXY_ITS]	= { .type = NLA_U32 },
	[CTA_SYNPROXY_TSOFF]	= { .type = NLA_U32 },
};

/*
 * Apply the nested CTA_SYNPROXY attribute to the synproxy extension.
 * All three of ISN, ITS and TSOFF are required (-EINVAL otherwise); nothing
 * is written unless all are present. Returns 0 without doing anything when
 * ct has no synproxy extension.
 */
static int ctnetlink_change_synproxy(struct nf_conn *ct,
				     const struct nlattr * const cda[])
{
	struct nf_conn_synproxy *synproxy = nfct_synproxy(ct);
	struct nlattr *tb[CTA_SYNPROXY_MAX + 1];
	int err;

	if (!synproxy)
		return 0;

	err = nla_parse_nested_deprecated(tb, CTA_SYNPROXY_MAX,
					  cda[CTA_SYNPROXY], synproxy_policy,
					  NULL);
	if (err < 0)
		return err;

	if (!tb[CTA_SYNPROXY_ISN] ||
	    !tb[CTA_SYNPROXY_ITS] ||
	    !tb[CTA_SYNPROXY_TSOFF])
		return -EINVAL;

	synproxy->isn = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ISN]));
	synproxy->its = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ITS]));
	synproxy->tsoff = ntohl(nla_get_be32(tb[CTA_SYNPROXY_TSOFF]));

	return 0;
}

/*
 * Replace the conntrack labels from CTA_LABELS, optionally restricted by
 * CTA_LABELS_MASK.
 *
 * The label payload length must be a multiple of sizeof(u32); a mask, when
 * given, must be non-empty and exactly as long as the labels. The length is
 * passed to nf_connlabels_replace() in u32 words.
 * Without CONFIG_NF_CONNTRACK_LABELS this returns -EOPNOTSUPP.
 */
static int
ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[])
{
#ifdef CONFIG_NF_CONNTRACK_LABELS
	size_t len = nla_len(cda[CTA_LABELS]);
	const void *mask = cda[CTA_LABELS_MASK];

	if
(len & (sizeof(u32)-1)) /* must be multiple of u32 */
		return -EINVAL;

	if (mask) {
		if (nla_len(cda[CTA_LABELS_MASK]) == 0 ||
		    nla_len(cda[CTA_LABELS_MASK]) != len)
			return -EINVAL;
		/* from here on 'mask' points at the attribute payload */
		mask = nla_data(cda[CTA_LABELS_MASK]);
	}

	len /= sizeof(u32);

	return nf_connlabels_replace(ct, nla_data(cda[CTA_LABELS]), mask, len);
#else
	return -EOPNOTSUPP;
#endif
}

/*
 * Update an existing conntrack from the attributes in cda[].
 *
 * NAT and master-tuple attributes are refused with -EOPNOTSUPP. The
 * remaining attributes are applied in a fixed order (helper, timeout,
 * status, protoinfo, mark, seqadj, synproxy, labels); the first failure is
 * returned and changes already applied by earlier steps are not undone.
 */
static int
ctnetlink_change_conntrack(struct nf_conn *ct,
			   const struct nlattr * const cda[])
{
	int err;

	/* only allow NAT changes and master assignation for new conntracks */
	if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST] || cda[CTA_TUPLE_MASTER])
		return -EOPNOTSUPP;

	if (cda[CTA_HELP]) {
		err = ctnetlink_change_helper(ct, cda);
		if (err < 0)
			return err;
	}

	if (cda[CTA_TIMEOUT]) {
		err = ctnetlink_change_timeout(ct, cda);
		if (err < 0)
			return err;
	}

	if (cda[CTA_STATUS]) {
		err = ctnetlink_change_status(ct, cda);
		if (err < 0)
			return err;
	}

	if (cda[CTA_PROTOINFO]) {
		err = ctnetlink_change_protoinfo(ct, cda);
		if (err < 0)
			return err;
	}

#if defined(CONFIG_NF_CONNTRACK_MARK)
	if (cda[CTA_MARK])
		ctnetlink_change_mark(ct, cda);
#endif

	if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
		err = ctnetlink_change_seq_adj(ct, cda);
		if (err < 0)
			return err;
	}

	if (cda[CTA_SYNPROXY]) {
		err = ctnetlink_change_synproxy(ct, cda);
		if (err < 0)
			return err;
	}

	if (cda[CTA_LABELS]) {
		err = ctnetlink_attach_labels(ct, cda);
		if (err < 0)
			return err;
	}

	return 0;
}

/*
 * Allocate a conntrack for otuple/rtuple, configure it from cda[] and
 * insert it into the conntrack hash.
 *
 * CTA_TIMEOUT is mandatory (-EINVAL without it). Returns the new conntrack
 * on success or an ERR_PTR() on failure; every failure path frees the
 * conntrack through nf_conntrack_free().
 *
 * Error labels: err3 drops the master reference, err2 leaves the RCU read
 * section, err1 frees the conntrack.
 */
static struct nf_conn *
ctnetlink_create_conntrack(struct net *net,
			   const struct nf_conntrack_zone *zone,
			   const struct nlattr * const cda[],
			   struct nf_conntrack_tuple *otuple,
			   struct nf_conntrack_tuple *rtuple,
			   u8 u3)
{
	struct nf_conn *ct;
	int err = -EINVAL;
	struct nf_conntrack_helper *helper;
	struct nf_conn_tstamp *tstamp;
	u64 timeout;

	ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC);
	if (IS_ERR(ct))
		return ERR_PTR(-ENOMEM);

	/* err is still -EINVAL here */
	if (!cda[CTA_TIMEOUT])
		goto err1;

	rcu_read_lock();
	if (cda[CTA_HELP]) {
		char *helpname = NULL;
		struct nlattr *helpinfo = NULL;

		err =
ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
		if (err < 0)
			goto err2;

		helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
						    nf_ct_protonum(ct));
		if (helper == NULL) {
			/* RCU is not held on the err1 exits of this branch */
			rcu_read_unlock();
#ifdef CONFIG_MODULES
			if (request_module("nfct-helper-%s", helpname) < 0) {
				err = -EOPNOTSUPP;
				goto err1;
			}

			rcu_read_lock();
			helper = __nf_conntrack_helper_find(helpname,
							    nf_ct_l3num(ct),
							    nf_ct_protonum(ct));
			/* helper appeared after the module request: retry */
			if (helper) {
				err = -EAGAIN;
				goto err2;
			}
			rcu_read_unlock();
#endif
			err = -EOPNOTSUPP;
			goto err1;
		} else {
			struct nf_conn_help *help;

			help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
			if (help == NULL) {
				err = -ENOMEM;
				goto err2;
			}
			/* set private helper data if allowed. */
			if (helper->from_nlattr)
				helper->from_nlattr(helpinfo, ct);

			/* disable helper auto-assignment for this entry */
			ct->status |= IPS_HELPER;
			RCU_INIT_POINTER(help->helper, helper);
		}
	}

	err = ctnetlink_setup_nat(ct, cda);
	if (err < 0)
		goto err2;

	/* return values of the extension helpers are not checked here */
	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
	nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);
	nfct_seqadj_ext_add(ct);
	nfct_synproxy_ext_add(ct);

	/* we must add conntrack extensions before confirmation. */
	ct->status |= IPS_CONFIRMED;

	timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
	__nf_ct_set_timeout(ct, timeout);

	if (cda[CTA_STATUS]) {
		err = ctnetlink_change_status(ct, cda);
		if (err < 0)
			goto err2;
	}

	if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
		err = ctnetlink_change_seq_adj(ct, cda);
		if (err < 0)
			goto err2;
	}

	/* protocol state is zeroed before any CTA_PROTOINFO is applied */
	memset(&ct->proto, 0, sizeof(ct->proto));
	if (cda[CTA_PROTOINFO]) {
		err = ctnetlink_change_protoinfo(ct, cda);
		if (err < 0)
			goto err2;
	}

	if (cda[CTA_SYNPROXY]) {
		err = ctnetlink_change_synproxy(ct, cda);
		if (err < 0)
			goto err2;
	}

#if defined(CONFIG_NF_CONNTRACK_MARK)
	if (cda[CTA_MARK])
		ctnetlink_change_mark(ct, cda);
#endif

	/* setup master conntrack: this is a confirmed expectation */
	if (cda[CTA_TUPLE_MASTER]) {
		struct nf_conntrack_tuple master;
		struct nf_conntrack_tuple_hash *master_h;
		struct nf_conn *master_ct;

		err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER,
					    u3, NULL);
		if (err < 0)
			goto err2;

		master_h = nf_conntrack_find_get(net, zone, &master);
		if (master_h == NULL) {
			err = -ENOENT;
			goto err2;
		}
		master_ct = nf_ct_tuplehash_to_ctrack(master_h);
		__set_bit(IPS_EXPECTED_BIT, &ct->status);
		/* the reference from the lookup is kept in ct->master */
		ct->master = master_ct;
	}
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();

	err = nf_conntrack_hash_check_insert(ct);
	if (err < 0)
		goto err3;

	rcu_read_unlock();

	return ct;

err3:
	if (ct->master)
		nf_ct_put(ct->master);
err2:
	rcu_read_unlock();
err1:
	nf_conntrack_free(ct);
	return ERR_PTR(err);
}

/*
 * NEW handler for conntrack entries.
 *
 * Looks up an existing entry by the original tuple (or by the reply tuple
 * when no original tuple is given). If none is found, the entry is created
 * when NLM_F_CREATE is set (-ENOENT otherwise). If one is found, it is
 * updated unless NLM_F_EXCL is set (-EEXIST in that case).
 */
static int ctnetlink_new_conntrack(struct sk_buff *skb,
				   const struct nfnl_info *info,
				   const struct nlattr * const cda[])
{
	struct nf_conntrack_tuple otuple, rtuple;
	struct nf_conntrack_tuple_hash *h = NULL;
	u_int8_t u3 = info->nfmsg->nfgen_family;
	struct nf_conntrack_zone zone;
	struct nf_conn *ct;
	int err;

	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
	if (err < 0)
		return err;

	if (cda[CTA_TUPLE_ORIG]) {
		err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG,
					    u3, &zone);
		if (err < 0)
			return err;
	}

	if
(cda[CTA_TUPLE_REPLY]) {
		err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY,
					    u3, &zone);
		if (err < 0)
			return err;
	}

	if (cda[CTA_TUPLE_ORIG])
		h = nf_conntrack_find_get(info->net, &zone, &otuple);
	else if (cda[CTA_TUPLE_REPLY])
		h = nf_conntrack_find_get(info->net, &zone, &rtuple);

	if (h == NULL) {
		err = -ENOENT;
		if (info->nlh->nlmsg_flags & NLM_F_CREATE) {
			enum ip_conntrack_events events;

			/* creation needs both tuples with the same l4 protocol */
			if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
				return -EINVAL;
			if (otuple.dst.protonum != rtuple.dst.protonum)
				return -EINVAL;

			ct = ctnetlink_create_conntrack(info->net, &zone, cda,
							&otuple, &rtuple, u3);
			if (IS_ERR(ct))
				return PTR_ERR(ct);

			err = 0;
			if (test_bit(IPS_EXPECTED_BIT, &ct->status))
				events = 1 << IPCT_RELATED;
			else
				events = 1 << IPCT_NEW;

			/* a label failure is not fatal: the event bit is just omitted */
			if (cda[CTA_LABELS] &&
			    ctnetlink_attach_labels(ct, cda) == 0)
				events |= (1 << IPCT_LABEL);

			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
						      (1 << IPCT_ASSURED) |
						      (1 << IPCT_HELPER) |
						      (1 << IPCT_PROTOINFO) |
						      (1 << IPCT_SEQADJ) |
						      (1 << IPCT_MARK) |
						      (1 << IPCT_SYNPROXY) |
						      events,
						      ct, NETLINK_CB(skb).portid,
						      nlmsg_report(info->nlh));
			nf_ct_put(ct);
		}

		return err;
	}
	/* implicit 'else' */

	err = -EEXIST;
	ct = nf_ct_tuplehash_to_ctrack(h);
	if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) {
		err = ctnetlink_change_conntrack(ct, cda);
		if (err == 0) {
			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
						      (1 << IPCT_ASSURED) |
						      (1 << IPCT_HELPER) |
						      (1 << IPCT_LABEL) |
						      (1 << IPCT_PROTOINFO) |
						      (1 << IPCT_SEQADJ) |
						      (1 << IPCT_MARK) |
						      (1 << IPCT_SYNPROXY),
						      ct, NETLINK_CB(skb).portid,
						      nlmsg_report(info->nlh));
		}
	}

	/* drop the reference taken by nf_conntrack_find_get() */
	nf_ct_put(ct);
	return err;
}

/*
 * Emit one IPCTNL_MSG_CT_GET_STATS_CPU message carrying the counters of
 * *st. The cpu number is passed to nfnl_msg_put() as htons(cpu).
 * NLM_F_MULTI is set when portid is non-zero.
 * Returns skb->len on success, -1 if the message or an attribute did not
 * fit (the partial message is cancelled).
 */
static int
ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
				__u16 cpu, const struct ip_conntrack_stat *st)
{
	struct nlmsghdr *nlh;
	unsigned int flags = portid ?
NLM_F_MULTI : 0, event;

	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
			      IPCTNL_MSG_CT_GET_STATS_CPU);
	nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
			   NFNETLINK_V0, htons(cpu));
	if (!nlh)
		goto nlmsg_failure;

	if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
	    nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
	    nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
	    nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
				htonl(st->insert_failed)) ||
	    nla_put_be32(skb, CTA_STATS_DROP, htonl(st->drop)) ||
	    nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) ||
	    nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) ||
	    nla_put_be32(skb, CTA_STATS_SEARCH_RESTART,
				htonl(st->search_restart)) ||
	    nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE,
				htonl(st->clash_resolve)) ||
	    nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG,
			 htonl(st->chaintoolong)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return skb->len;

nla_put_failure:
nlmsg_failure:
	nlmsg_cancel(skb, nlh);
	return -1;
}

/*
 * Dump callback for per-cpu conntrack statistics.
 * cb->args[0] holds the next cpu to dump; the dump is finished once it
 * equals nr_cpu_ids. When a message does not fit, the loop stops and the
 * current cpu is stored so the next call resumes there.
 */
static int
ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	int cpu;
	struct net *net = sock_net(skb->sk);

	if (cb->args[0] == nr_cpu_ids)
		return 0;

	for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
		const struct ip_conntrack_stat *st;

		if (!cpu_possible(cpu))
			continue;

		st = per_cpu_ptr(net->ct.stat, cpu);
		if (ctnetlink_ct_stat_cpu_fill_info(skb,
						    NETLINK_CB(cb->skb).portid,
						    cb->nlh->nlmsg_seq,
						    cpu, st) < 0)
				break;
	}
	cb->args[0] = cpu;

	return skb->len;
}

/*
 * GET handler for per-cpu statistics: starts a dump for NLM_F_DUMP
 * requests; any other request returns 0 without producing a reply.
 */
static int ctnetlink_stat_ct_cpu(struct sk_buff *skb,
				 const struct nfnl_info *info,
				 const struct nlattr * const cda[])
{
	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
		struct netlink_dump_control c = {
			.dump = ctnetlink_ct_stat_cpu_dump,
		};
		return netlink_dump_start(info->sk, skb, info->nlh, &c);
	}

	return 0;
}

/*
 * Emit one IPCTNL_MSG_CT_GET_STATS message with the current conntrack
 * count of @net and the nf_conntrack_max limit.
 * Returns skb->len on success, -1 on failure. The @type argument is not
 * used by the body.
 */
static int
ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
			    struct net *net)
{
	unsigned int flags = portid ?
NLM_F_MULTI : 0, event;
	unsigned int nr_conntracks;
	struct nlmsghdr *nlh;

	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS);
	nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
			   NFNETLINK_V0, 0);
	if (!nlh)
		goto nlmsg_failure;

	nr_conntracks = nf_conntrack_count(net);
	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
		goto nla_put_failure;

	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return skb->len;

nla_put_failure:
nlmsg_failure:
	nlmsg_cancel(skb, nlh);
	return -1;
}

/*
 * GET handler for global conntrack statistics: builds the reply in a
 * freshly allocated skb and unicasts it to the requesting portid.
 * Returns -ENOMEM if the skb cannot be allocated or filled.
 */
static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info,
			     const struct nlattr * const cda[])
{
	struct sk_buff *skb2;
	int err;

	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (skb2 == NULL)
		return -ENOMEM;

	err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid,
					  info->nlh->nlmsg_seq,
					  NFNL_MSG_TYPE(info->nlh->nlmsg_type),
					  sock_net(skb->sk));
	if (err <= 0) {
		kfree_skb(skb2);
		return -ENOMEM;
	}

	return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}

/* Validation policy for expectation (CTA_EXPECT_*) attributes. */
static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
	[CTA_EXPECT_MASTER]	= { .type = NLA_NESTED },
	[CTA_EXPECT_TUPLE]	= { .type = NLA_NESTED },
	[CTA_EXPECT_MASK]	= { .type = NLA_NESTED },
	[CTA_EXPECT_TIMEOUT]	= { .type = NLA_U32 },
	[CTA_EXPECT_ID]		= { .type = NLA_U32 },
	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING,
				    .len = NF_CT_HELPER_NAME_LEN - 1 },
	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 },
	[CTA_EXPECT_FLAGS]	= NLA_POLICY_MASK(NLA_BE32, NF_CT_EXPECT_MASK),
	[CTA_EXPECT_CLASS]	= { .type = NLA_U32 },
	[CTA_EXPECT_NAT]	= { .type = NLA_NESTED },
	[CTA_EXPECT_FN]		= { .type = NLA_NUL_STRING },
};

/* Forward declaration: used by the glue code below, defined further down. */
static struct nf_conntrack_expect *
ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct,
		       const struct nf_conntrack_helper *assign_helper,
		       struct nf_conntrack_tuple *tuple,
		       struct nf_conntrack_tuple *mask);

#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
/*
 * Size estimate for the attributes written by the glue build callback
 * (registered as .build_size in ctnetlink_glue_hook).
 */
static size_t
ctnetlink_glue_build_size(const struct nf_conn *ct)
{
	/* sum of fixed attribute sizes plus the per-conntrack variable parts */
	return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
	       + 3 * nla_total_size(0) /* CTA_TUPLE_IP */
	       + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */
	       + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
	       + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
	       + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
	       + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
	       + nla_total_size(0) /* CTA_PROTOINFO */
	       + nla_total_size(0) /* CTA_HELP */
	       + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
	       + ctnetlink_secctx_size(ct)
	       + ctnetlink_acct_size(ct)
	       + ctnetlink_timestamp_size(ct)
#if IS_ENABLED(CONFIG_NF_NAT)
	       + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
	       + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
#endif
#ifdef CONFIG_NF_CONNTRACK_MARK
	       + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
	       + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */
#endif
	       + ctnetlink_proto_size(ct)
	       ;
}

/*
 * Write the conntrack attributes of @ct into @skb: both tuples (each with
 * its zone id), zone, id, status, timeout, protoinfo, accounting,
 * timestamp, helper info, and the optional secctx/master/seqadj/synproxy/
 * mark/labels parts.
 * Returns 0 on success, -ENOSPC as soon as any attribute fails to be added.
 */
static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct nlattr *nest_parms;

	zone = nf_ct_zone(ct);

	nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG);
	if (!nest_parms)
		goto nla_put_failure;
	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
		goto nla_put_failure;
	if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
				   NF_CT_ZONE_DIR_ORIG) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest_parms);

	nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY);
	if (!nest_parms)
		goto nla_put_failure;
	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
		goto nla_put_failure;
	if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
				   NF_CT_ZONE_DIR_REPL) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest_parms);

	if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
				   NF_CT_DEFAULT_ZONE_DIR) < 0)
		goto nla_put_failure;

	if (ctnetlink_dump_id(skb, ct) < 0)
		goto nla_put_failure;

	if
(ctnetlink_dump_status(skb, ct) < 0)
		goto nla_put_failure;

	if (ctnetlink_dump_timeout(skb, ct, false) < 0)
		goto nla_put_failure;

	if (ctnetlink_dump_protoinfo(skb, ct, false) < 0)
		goto nla_put_failure;

	if (ctnetlink_dump_acct(skb, ct, IPCTNL_MSG_CT_GET) < 0 ||
	    ctnetlink_dump_timestamp(skb, ct) < 0)
		goto nla_put_failure;

	if (ctnetlink_dump_helpinfo(skb, ct) < 0)
		goto nla_put_failure;

#ifdef CONFIG_NF_CONNTRACK_SECMARK
	if (ct->secmark && ctnetlink_dump_secctx(skb, ct) < 0)
		goto nla_put_failure;
#endif
	if (ct->master && ctnetlink_dump_master(skb, ct) < 0)
		goto nla_put_failure;

	if ((ct->status & IPS_SEQ_ADJUST) &&
	    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
		goto nla_put_failure;

	if (ctnetlink_dump_ct_synproxy(skb, ct) < 0)
		goto nla_put_failure;

#ifdef CONFIG_NF_CONNTRACK_MARK
	if (ctnetlink_dump_mark(skb, ct, true) < 0)
		goto nla_put_failure;
#endif
	if (ctnetlink_dump_labels(skb, ct) < 0)
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -ENOSPC;
}

/*
 * Glue .build callback: nest the conntrack attributes under @ct_attr and
 * append @ctinfo as a big-endian u32 under @ct_info_attr.
 * Returns 0 on success, -ENOSPC on failure.
 */
static int
ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct,
		     enum ip_conntrack_info ctinfo,
		     u_int16_t ct_attr, u_int16_t ct_info_attr)
{
	struct nlattr *nest_parms;

	nest_parms = nla_nest_start(skb, ct_attr);
	if (!nest_parms)
		goto nla_put_failure;

	if (__ctnetlink_glue_build(skb, ct) < 0)
		goto nla_put_failure;

	nla_nest_end(skb, nest_parms);

	if (nla_put_be32(skb, ct_info_attr, htonl(ctinfo)))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -ENOSPC;
}

/*
 * Status update used by the glue parser. Rejects (-EBUSY) attempts to
 * clear IPS_SEEN_REPLY or IPS_ASSURED; everything else is passed to
 * __nf_ct_change_status() with 'status' as the bits to set and '~status'
 * as the bits to clear.
 */
static int
ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
{
	unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
	/* d: bits that differ between the current and the requested status */
	unsigned long d = ct->status ^ status;

	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
		/* SEEN_REPLY bit can only be set */
		return -EBUSY;

	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
		/* ASSURED bit can only be set */
		return -EBUSY;

	/* This check is less strict than ctnetlink_change_status()
	 * because callers often flip IPS_EXPECTED bits when sending
	 * an NFQA_CT attribute to the kernel.  So ignore the
	 * unchangeable bits but do not error out. Also user programs
	 * are allowed to clear the bits that they are allowed to change.
	 */
	__nf_ct_change_status(ct, status, ~status);
	return 0;
}

/*
 * Apply the already parsed conntrack attributes to @ct in the order
 * timeout, status, helper, labels, mark. The first negative result is
 * returned; earlier changes are not undone.
 */
static int
ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
{
	int err;

	if (cda[CTA_TIMEOUT]) {
		err = ctnetlink_change_timeout(ct, cda);
		if (err < 0)
			return err;
	}
	if (cda[CTA_STATUS]) {
		err = ctnetlink_update_status(ct, cda);
		if (err < 0)
			return err;
	}
	if (cda[CTA_HELP]) {
		err = ctnetlink_change_helper(ct, cda);
		if (err < 0)
			return err;
	}
	if (cda[CTA_LABELS]) {
		err = ctnetlink_attach_labels(ct, cda);
		if (err < 0)
			return err;
	}
#if defined(CONFIG_NF_CONNTRACK_MARK)
	if (cda[CTA_MARK]) {
		ctnetlink_change_mark(ct, cda);
	}
#endif
	return 0;
}

/*
 * Glue .parse callback: parse the nested attribute against ct_nla_policy
 * and apply the result with ctnetlink_glue_parse_ct().
 */
static int
ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct)
{
	struct nlattr *cda[CTA_MAX+1];
	int ret;

	ret = nla_parse_nested_deprecated(cda, CTA_MAX, attr, ct_nla_policy,
					  NULL);
	if (ret < 0)
		return ret;

	return ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct);
}

/*
 * Parse CTA_EXPECT_TUPLE into @tuple and CTA_EXPECT_MASK into @mask, using
 * the l3 protocol number of @ct. Returns the first parse error, if any.
 */
static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda,
				    const struct nf_conn *ct,
				    struct nf_conntrack_tuple *tuple,
				    struct nf_conntrack_tuple *mask)
{
	int err;

	err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE,
				    nf_ct_l3num(ct), NULL);
	if (err < 0)
		return err;

	return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK,
				     nf_ct_l3num(ct), NULL);
}

/*
 * Glue .attach_expect callback: build an expectation for @ct from the
 * nested attribute and register it.
 * CTA_EXPECT_TUPLE and CTA_EXPECT_MASK are mandatory (-EINVAL);
 * an unknown CTA_EXPECT_HELP_NAME yields -EOPNOTSUPP.
 */
static int
ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
			     u32 portid, u32 report)
{
	struct nf_conntrack_helper *assign_helper = NULL;
	struct nlattr *cda[CTA_EXPECT_MAX+1];
	struct nf_conntrack_tuple tuple, mask;
	struct nf_conntrack_expect *exp;
	int err;

	err = nla_parse_nested_deprecated(cda, CTA_EXPECT_MAX, attr,
					  exp_nla_policy, NULL);
	if (err < 0)
		return err;

	if (!cda[CTA_EXPECT_TUPLE] || !cda[CTA_EXPECT_MASK])
		return -EINVAL;

	err = ctnetlink_glue_exp_parse((const struct nlattr * const *)cda,
				       ct, &tuple, &mask);
	if (err < 0)
		return err;

	if
(cda[CTA_EXPECT_HELP_NAME]) {
		const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);

		assign_helper = __nf_conntrack_helper_find(helpname,
							   nf_ct_l3num(ct),
							   tuple.dst.protonum);
		if (!assign_helper)
			return -EOPNOTSUPP;
	}

	exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct,
				     assign_helper, &tuple, &mask);
	if (IS_ERR(exp))
		return PTR_ERR(exp);

	err = nf_ct_expect_related_report(exp, portid, report, 0);
	/* the local reference is dropped whether or not registration succeeded */
	nf_ct_expect_put(exp);
	return err;
}

/*
 * Glue .seq_adjust callback: forwards to nf_ct_tcp_seqadj_set(), but only
 * for conntracks that have a bit of IPS_NAT_MASK set in their status.
 */
static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
				  enum ip_conntrack_info ctinfo, int diff)
{
	if (!(ct->status & IPS_NAT_MASK))
		return;

	nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff);
}

/* Callback table bundling the glue functions defined above. */
static const struct nfnl_ct_hook ctnetlink_glue_hook = {
	.build_size	= ctnetlink_glue_build_size,
	.build		= ctnetlink_glue_build,
	.parse		= ctnetlink_glue_parse,
	.attach_expect	= ctnetlink_glue_attach_expect,
	.seq_adjust	= ctnetlink_glue_seqadj,
};
#endif /* CONFIG_NETFILTER_NETLINK_GLUE_CT */

/***********************************************************************
 * EXPECT
 ***********************************************************************/

/*
 * Dump @tuple nested under attribute @type.
 * Returns 0 on success, -1 if the nest or the tuple did not fit.
 */
static int ctnetlink_exp_dump_tuple(struct sk_buff *skb,
				    const struct nf_conntrack_tuple *tuple,
				    u32 type)
{
	struct nlattr *nest_parms;

	nest_parms = nla_nest_start(skb, type);
	if (!nest_parms)
		goto nla_put_failure;
	if (ctnetlink_dump_tuples(skb, tuple) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest_parms);

	return 0;

nla_put_failure:
	return -1;
}

/*
 * Dump the expectation mask as a CTA_EXPECT_MASK nest.
 * A temporary tuple is built: all bytes 0xFF, then the source address and
 * source port/id taken from @mask, and l3num/protonum taken from @tuple.
 * Returns 0 on success, -1 on failure.
 */
static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
				   const struct nf_conntrack_tuple *tuple,
				   const struct nf_conntrack_tuple_mask *mask)
{
	const struct nf_conntrack_l4proto *l4proto;
	struct nf_conntrack_tuple m;
	struct nlattr *nest_parms;
	int ret;

	memset(&m, 0xFF, sizeof(m));
	memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3));
	m.src.u.all = mask->src.u.all;
	m.src.l3num = tuple->src.l3num;
	m.dst.protonum = tuple->dst.protonum;

	nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK);
	if (!nest_parms)
		goto nla_put_failure;
rcu_read_lock();
	ret = ctnetlink_dump_tuples_ip(skb, &m);
	if (ret >= 0) {
		l4proto = nf_ct_l4proto_find(tuple->dst.protonum);
		ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto);
	}
	rcu_read_unlock();

	if (unlikely(ret < 0))
		goto nla_put_failure;

	nla_nest_end(skb, nest_parms);

	return 0;

nla_put_failure:
	return -1;
}

#if IS_ENABLED(CONFIG_NF_NAT)
/* All-zero address (static storage), compared against exp->saved_addr. */
static const union nf_inet_addr any_addr;
#endif

/*
 * Compute the id reported for an expectation: a siphash over the
 * expectation pointer, its helper pointer, its master pointer and a
 * siphash of its tuple, keyed with a seed that is initialised once with
 * random data. The result is truncated to 32 bits by the cast.
 */
static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp)
{
	static siphash_aligned_key_t exp_id_seed;
	unsigned long a, b, c, d;

	net_get_random_once(&exp_id_seed, sizeof(exp_id_seed));

	a = (unsigned long)exp;
	b = (unsigned long)exp->helper;
	c = (unsigned long)exp->master;
	d = (unsigned long)siphash(&exp->tuple, sizeof(exp->tuple), &exp_id_seed);

#ifdef CONFIG_64BIT
	return (__force __be32)siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &exp_id_seed);
#else
	return (__force __be32)siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &exp_id_seed);
#endif
}

/*
 * Write all attributes describing @exp into @skb: tuple, mask, master
 * tuple, optional NAT info, timeout, id, flags, class, helper name and
 * expectfn name.
 * Returns 0 on success, -1 as soon as an attribute does not fit.
 */
static int
ctnetlink_exp_dump_expect(struct sk_buff *skb,
			  const struct nf_conntrack_expect *exp)
{
	struct nf_conn *master = exp->master;
	/* remaining lifetime in seconds; clamped to 0 below if already past */
	long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ;
	struct nf_conntrack_helper *helper;
#if IS_ENABLED(CONFIG_NF_NAT)
	struct nlattr *nest_parms;
	struct nf_conntrack_tuple nat_tuple = {};
#endif
	struct nf_ct_helper_expectfn *expfn;

	if (timeout < 0)
		timeout = 0;

	if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
		goto nla_put_failure;
	if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0)
		goto nla_put_failure;
	if (ctnetlink_exp_dump_tuple(skb,
				 &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				 CTA_EXPECT_MASTER) < 0)
		goto nla_put_failure;

#if IS_ENABLED(CONFIG_NF_NAT)
	/* NAT nest is only emitted when a saved address or proto is set */
	if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) ||
	    exp->saved_proto.all) {
		nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT);
		if (!nest_parms)
			goto nla_put_failure;

		if (nla_put_be32(skb, CTA_EXPECT_NAT_DIR, htonl(exp->dir)))
			goto nla_put_failure;

		nat_tuple.src.l3num = nf_ct_l3num(master);
nat_tuple.src.u3 = exp->saved_addr;
		nat_tuple.dst.protonum = nf_ct_protonum(master);
		nat_tuple.src.u = exp->saved_proto;

		if (ctnetlink_exp_dump_tuple(skb, &nat_tuple,
						CTA_EXPECT_NAT_TUPLE) < 0)
	                goto nla_put_failure;
	        nla_nest_end(skb, nest_parms);
	}
#endif
	if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) ||
	    nla_put_be32(skb, CTA_EXPECT_ID, nf_expect_get_id(exp)) ||
	    nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) ||
	    nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class)))
		goto nla_put_failure;

	/* helper name is only emitted when a helper is assigned */
	helper = rcu_dereference(exp->helper);
	if (helper &&
	    nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name))
		goto nla_put_failure;

	/* likewise for the expectfn name, when a matching entry is found */
	expfn = nf_ct_helper_expectfn_find_by_symbol(exp->expectfn);
	if (expfn != NULL &&
	    nla_put_string(skb, CTA_EXPECT_FN, expfn->name))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -1;
}

/*
 * Emit one complete expectation message of type @event (within the
 * NFNL_SUBSYS_CTNETLINK_EXP subsystem) for @exp.
 * NLM_F_MULTI is set when portid is non-zero.
 * Returns skb->len on success, -1 on failure (message cancelled).
 */
static int
ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
			int event, const struct nf_conntrack_expect *exp)
{
	struct nlmsghdr *nlh;
	unsigned int flags = portid ?
NLM_F_MULTI : 0;

	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event);
	nlh = nfnl_msg_put(skb, portid, seq, event, flags,
			   exp->tuple.src.l3num, NFNETLINK_V0, 0);
	if (!nlh)
		goto nlmsg_failure;

	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -1;
}

#ifdef CONFIG_NF_CONNTRACK_EVENTS
/*
 * Expectation event notifier: turns IPEXP_DESTROY / IPEXP_NEW events into
 * netlink messages sent to the matching multicast group. Other events are
 * ignored. IPEXP_DESTROY takes precedence when both bits are set.
 *
 * Always returns 0; on allocation or build failure the error is signalled
 * to listeners through nfnetlink_set_err(-ENOBUFS) instead.
 */
static int
ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item)
{
	struct nf_conntrack_expect *exp = item->exp;
	struct net *net = nf_ct_exp_net(exp);
	struct nlmsghdr *nlh;
	struct sk_buff *skb;
	unsigned int type, group;
	int flags = 0;

	if (events & (1 << IPEXP_DESTROY)) {
		type = IPCTNL_MSG_EXP_DELETE;
		group = NFNLGRP_CONNTRACK_EXP_DESTROY;
	} else if (events & (1 << IPEXP_NEW)) {
		type = IPCTNL_MSG_EXP_NEW;
		flags = NLM_F_CREATE|NLM_F_EXCL;
		group = NFNLGRP_CONNTRACK_EXP_NEW;
	} else
		return 0;

	/* nothing to do when no report is requested and nobody listens */
	if (!item->report && !nfnetlink_has_listeners(net, group))
		return 0;

	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

	type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type);
	nlh = nfnl_msg_put(skb, item->portid, 0, type, flags,
			   exp->tuple.src.l3num, NFNETLINK_V0, 0);
	if (!nlh)
		goto nlmsg_failure;

	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
nlmsg_failure:
	kfree_skb(skb);
errout:
	nfnetlink_set_err(net, 0, 0, -ENOBUFS);
	return 0;
}
#endif

/*
 * Resume cookie for expectation dumps: the expectation's address plus the
 * master conntrack id plus the expectation class. Never returns 0 (a zero
 * sum is mapped to 1), so that 0 can mean "no cookie" in cb->args[1].
 */
static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp)
{
	unsigned long id = (unsigned long)exp;

	id += nf_ct_get_id(exp->master);
	id += exp->class;

	return id ?
id : 1;
}

/*
 * Dump callback for the global expectation table.
 *
 * cb->args[0] is the hash bucket being walked, cb->args[1] the cookie
 * (ctnetlink_exp_id) of the expectation that did not fit into the previous
 * skb, or 0. On resume, entries are skipped until the cookie matches; if
 * the bucket ends without a match, the cookie is cleared and the bucket is
 * walked again from its start.
 */
static int
ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
	u_int8_t l3proto = nfmsg->nfgen_family;
	unsigned long last_id = cb->args[1];
	struct nf_conntrack_expect *exp;

	rcu_read_lock();
	for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
		hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]],
					 hnode) {
			/* l3proto == 0 means "any family" */
			if (l3proto && exp->tuple.src.l3num != l3proto)
				continue;

			if (!net_eq(nf_ct_exp_net(exp), net))
				continue;

			if (cb->args[1]) {
				if (ctnetlink_exp_id(exp) != last_id)
					continue;
				cb->args[1] = 0;
			}
			if (ctnetlink_exp_fill_info(skb,
						    NETLINK_CB(cb->skb).portid,
						    cb->nlh->nlmsg_seq,
						    IPCTNL_MSG_EXP_NEW,
						    exp) < 0) {
				/* remember where to resume on the next call */
				cb->args[1] = ctnetlink_exp_id(exp);
				goto out;
			}
		}
		if (cb->args[1]) {
			cb->args[1] = 0;
			goto restart;
		}
	}
out:
	rcu_read_unlock();
	return skb->len;
}

/*
 * Dump callback for the expectations of a single conntrack (cb->data).
 *
 * cb->args[0] becomes 1 once the whole list has been dumped; cb->args[1]
 * is the resume cookie, used the same way as in ctnetlink_exp_dump_table().
 */
static int
ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
	struct nf_conn *ct = cb->data;
	struct nf_conn_help *help;
	u_int8_t l3proto = nfmsg->nfgen_family;
	unsigned long last_id = cb->args[1];
	struct nf_conntrack_expect *exp;

	if (cb->args[0])
		return 0;

	help = nfct_help(ct);
	if (!help)
		return 0;

	rcu_read_lock();

restart:
	hlist_for_each_entry_rcu(exp, &help->expectations, lnode) {
		if (l3proto && exp->tuple.src.l3num != l3proto)
			continue;
		if (cb->args[1]) {
			if (ctnetlink_exp_id(exp) != last_id)
				continue;
			cb->args[1] = 0;
		}
		if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid,
					    cb->nlh->nlmsg_seq,
					    IPCTNL_MSG_EXP_NEW,
					    exp) < 0) {
			cb->args[1] = ctnetlink_exp_id(exp);
			goto out;
		}
	}
	if (cb->args[1]) {
		cb->args[1] = 0;
		goto restart;
	}
	cb->args[0] = 1;
out:
	rcu_read_unlock();
	return skb->len;
}

/*
 * Dump .start callback: take a reference on the conntrack stored in
 * cb->data for the duration of the dump. Fails with -ENOENT if its
 * refcount has already dropped to zero.
 */
static int ctnetlink_dump_exp_ct_start(struct netlink_callback *cb)
{
	struct nf_conn *ct = cb->data;

	if (!refcount_inc_not_zero(&ct->ct_general.use))
		return -ENOENT;
	return 0;
}

/*
 * Dump .done callback: release the conntrack reference held in cb->data,
 * if any.
 */
static int
ctnetlink_dump_exp_ct_done(struct netlink_callback *cb)
{
	struct nf_conn *ct = cb->data;

	if (ct)
		nf_ct_put(ct);
	return 0;
}

/*
 * Start a dump of the expectations that belong to the conntrack identified
 * by CTA_EXPECT_MASTER (and the optional CTA_EXPECT_ZONE).
 * Returns -ENOENT if the conntrack is not found and 0 without dumping if
 * it has no helper extension.
 */
static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
				 struct sk_buff *skb,
				 const struct nlmsghdr *nlh,
				 const struct nlattr * const cda[],
				 struct netlink_ext_ack *extack)
{
	int err;
	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
	u_int8_t u3 = nfmsg->nfgen_family;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conntrack_zone zone;
	struct netlink_dump_control c = {
		.dump = ctnetlink_exp_ct_dump_table,
		.start = ctnetlink_dump_exp_ct_start,
		.done = ctnetlink_dump_exp_ct_done,
	};

	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
				    u3, NULL);
	if (err < 0)
		return err;

	err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
	if (err < 0)
		return err;

	h = nf_conntrack_find_get(net, &zone, &tuple);
	if (!h)
		return -ENOENT;

	ct = nf_ct_tuplehash_to_ctrack(h);
	/* No expectation linked to this connection tracking. */
	if (!nfct_help(ct)) {
		nf_ct_put(ct);
		return 0;
	}

	c.data = ct;

	err = netlink_dump_start(ctnl, skb, nlh, &c);
	/* drop the lookup reference; the .start callback takes its own */
	nf_ct_put(ct);

	return err;
}

/*
 * GET handler for expectations.
 *
 * With NLM_F_DUMP: dumps the expectations of one conntrack when
 * CTA_EXPECT_MASTER is given, otherwise the whole table.
 * Without NLM_F_DUMP: looks up a single expectation by CTA_EXPECT_TUPLE
 * (or, failing that, CTA_EXPECT_MASTER), optionally verifies
 * CTA_EXPECT_ID, and unicasts it back to the requester.
 *
 * The lookup, id check, message fill and the final put all happen with
 * nf_conntrack_expect_lock held; the reply skb is allocated beforehand
 * with GFP_KERNEL.
 */
static int ctnetlink_get_expect(struct sk_buff *skb,
				const struct nfnl_info *info,
				const struct nlattr * const cda[])
{
	u_int8_t u3 = info->nfmsg->nfgen_family;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_expect *exp;
	struct nf_conntrack_zone zone;
	struct sk_buff *skb2;
	int err;

	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
		if (cda[CTA_EXPECT_MASTER])
			return ctnetlink_dump_exp_ct(info->net, info->sk, skb,
						     info->nlh, cda,
						     info->extack);
		else {
			struct netlink_dump_control c = {
				.dump = ctnetlink_exp_dump_table,
			};
			return netlink_dump_start(info->sk, skb, info->nlh, &c);
		}
	}

	err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
	if (err < 0)
		return err;

	if (cda[CTA_EXPECT_TUPLE])
		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
					    u3, NULL);
	else if (cda[CTA_EXPECT_MASTER])
		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
					    u3, NULL);
	else
		return -EINVAL;

	if (err < 0)
		return err;

	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!skb2)
		return -ENOMEM;

	spin_lock_bh(&nf_conntrack_expect_lock);
	exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
	if (!exp) {
		spin_unlock_bh(&nf_conntrack_expect_lock);
		kfree_skb(skb2);
		return -ENOENT;
	}

	if (cda[CTA_EXPECT_ID]) {
		__be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);

		/* a mismatching id is reported as "not found" */
		if (id != nf_expect_get_id(exp)) {
			nf_ct_expect_put(exp);
			spin_unlock_bh(&nf_conntrack_expect_lock);
			kfree_skb(skb2);
			return -ENOENT;
		}
	}

	rcu_read_lock();
	err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
				      info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
				      exp);
	rcu_read_unlock();
	nf_ct_expect_put(exp);
	spin_unlock_bh(&nf_conntrack_expect_lock);

	if (err <= 0) {
		kfree_skb(skb2);
		return -ENOMEM;
	}

	return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}

/*
 * Iterator predicate: true when the expectation has a helper whose name
 * equals the string passed in @data.
 */
static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data)
{
	struct nf_conntrack_helper *helper;
	const char *name = data;
helper = rcu_dereference(exp->helper);
	/* An expectation without a helper never matches a name. */
	if (!helper)
		return false;

	return strcmp(helper->name, name) == 0;
}

/* Iteration predicate that accepts every expectation. */
static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data)
{
	return true;
}

/*
 * Netlink handler for expectation DELETE requests.
 *
 * Three modes, selected by the attributes present:
 *  - CTA_EXPECT_TUPLE: delete the single expectation matching the tuple
 *    (and CTA_EXPECT_ID, when given);
 *  - CTA_EXPECT_HELP_NAME: delete every expectation whose helper has the
 *    given name;
 *  - neither: delete all expectations of the network namespace.
 *
 * Returns 0 on success, a negative errno from attribute parsing, or -ENOENT
 * when the tuple lookup finds nothing or the ID does not match.
 */
static int ctnetlink_del_expect(struct sk_buff *skb,
				const struct nfnl_info *info,
				const struct nlattr * const cda[])
{
	u_int8_t u3 = info->nfmsg->nfgen_family;
	struct nf_conntrack_expect *exp;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_zone zone;
	int err;

	if (cda[CTA_EXPECT_TUPLE]) {
		/* delete a single expect by tuple */
		err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
		if (err < 0)
			return err;

		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
					    u3, NULL);
		if (err < 0)
			return err;

		spin_lock_bh(&nf_conntrack_expect_lock);

		/* bump usage count to 2 */
		exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
		if (!exp) {
			spin_unlock_bh(&nf_conntrack_expect_lock);
			return -ENOENT;
		}

		if (cda[CTA_EXPECT_ID]) {
			__be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);

			if (id != nf_expect_get_id(exp)) {
				nf_ct_expect_put(exp);
				spin_unlock_bh(&nf_conntrack_expect_lock);
				return -ENOENT;
			}
		}

		/* after list removal, usage count == 1 */
		if (timer_delete(&exp->timeout)) {
			nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
						   nlmsg_report(info->nlh));
			nf_ct_expect_put(exp);
		}
		spin_unlock_bh(&nf_conntrack_expect_lock);
		/* have to put what we 'get' above.
* after this line usage count == 0 */ nf_ct_expect_put(exp); } else if (cda[CTA_EXPECT_HELP_NAME]) { char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]); nf_ct_expect_iterate_net(info->net, expect_iter_name, name, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); } else { /* This basically means we have to flush everything*/ nf_ct_expect_iterate_net(info->net, expect_iter_all, NULL, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); } return 0; } static int ctnetlink_change_expect(struct nf_conntrack_expect *x, const struct nlattr * const cda[]) { if (cda[CTA_EXPECT_TIMEOUT]) { if (!timer_delete(&x->timeout)) return -ETIME; x->timeout.expires = jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; add_timer(&x->timeout); } return 0; } #if IS_ENABLED(CONFIG_NF_NAT) static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { [CTA_EXPECT_NAT_DIR] = NLA_POLICY_MAX(NLA_BE32, IP_CT_DIR_REPLY), [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED }, }; #endif static int ctnetlink_parse_expect_nat(const struct nlattr *attr, struct nf_conntrack_expect *exp, u_int8_t u3) { #if IS_ENABLED(CONFIG_NF_NAT) struct nlattr *tb[CTA_EXPECT_NAT_MAX+1]; struct nf_conntrack_tuple nat_tuple = {}; int err; err = nla_parse_nested_deprecated(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_EXPECT_NAT_DIR] || !tb[CTA_EXPECT_NAT_TUPLE]) return -EINVAL; err = ctnetlink_parse_tuple((const struct nlattr * const *)tb, &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3, NULL); if (err < 0) return err; exp->saved_addr = nat_tuple.src.u3; exp->saved_proto = nat_tuple.src.u; exp->dir = ntohl(nla_get_be32(tb[CTA_EXPECT_NAT_DIR])); return 0; #else return -EOPNOTSUPP; #endif } static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, const struct nf_conntrack_helper *assign_helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { struct net *net = read_pnet(&ct->ct_net); struct 
nf_conntrack_helper *helper;
	struct nf_conntrack_expect *exp;
	struct nf_conn_help *help;
	u32 class = 0;
	int err;

	/* The master conntrack must carry a helper extension ... */
	help = nfct_help(ct);
	if (!help)
		return ERR_PTR(-EOPNOTSUPP);

	/* ... with a helper actually attached to it. */
	helper = rcu_dereference(help->helper);
	if (!helper)
		return ERR_PTR(-EOPNOTSUPP);

	/* The requested class is bounded by what the helper declares. */
	if (cda[CTA_EXPECT_CLASS]) {
		class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS]));
		if (class > helper->expect_class_max)
			return ERR_PTR(-EINVAL);
	}
	exp = nf_ct_expect_alloc(ct);
	if (!exp)
		return ERR_PTR(-ENOMEM);

	if (cda[CTA_EXPECT_FLAGS]) {
		exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
		/* NF_CT_EXPECT_USERSPACE is always cleared from user input. */
		exp->flags &= ~NF_CT_EXPECT_USERSPACE;
	} else {
		exp->flags = 0;
	}
	if (cda[CTA_EXPECT_FN]) {
		const char *name = nla_data(cda[CTA_EXPECT_FN]);
		struct nf_ct_helper_expectfn *expfn;

		/* Resolve the callback by name; unknown names are rejected. */
		expfn = nf_ct_helper_expectfn_find_by_name(name);
		if (expfn == NULL) {
			err = -EINVAL;
			goto err_out;
		}
		exp->expectfn = expfn->expectfn;
	} else
		exp->expectfn = NULL;

	exp->class = class;
	exp->master = ct;
	write_pnet(&exp->net, net);
#ifdef CONFIG_NF_CONNTRACK_ZONES
	exp->zone = ct->zone;
#endif
	rcu_assign_pointer(exp->helper, helper);
	rcu_assign_pointer(exp->assign_helper, assign_helper);
	exp->tuple = *tuple;
	/* Only the source address and source protocol part of the mask are copied. */
	exp->mask.src.u3 = mask->src.u3;
	exp->mask.src.u.all = mask->src.u.all;

	if (cda[CTA_EXPECT_NAT]) {
		err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT],
						 exp, nf_ct_l3num(ct));
		if (err < 0)
			goto err_out;
#if IS_ENABLED(CONFIG_NF_NAT)
	} else {
		/* No NAT attribute: clear the NAT-related fields. */
		memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
		memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
		exp->dir = 0;
#endif
	}
	return exp;
err_out:
	/* Release the expectation allocated above. */
	nf_ct_expect_put(exp);
	return ERR_PTR(err);
}

/*
 * Create a new expectation from netlink attributes.
 *
 * Parses CTA_EXPECT_TUPLE, CTA_EXPECT_MASK and CTA_EXPECT_MASTER, looks up
 * the master conntrack in @zone, allocates the expectation and registers it
 * with nf_ct_expect_related_report(), passing @portid and @report along.
 *
 * Returns 0 or a negative errno; -ENOENT when the master conntrack does not
 * exist.
 */
static int
ctnetlink_create_expect(struct net *net,
			const struct nf_conntrack_zone *zone,
			const struct nlattr * const cda[],
			u_int8_t u3, u32 portid, int report)
{
	struct nf_conntrack_tuple tuple, mask, master_tuple;
	struct nf_conntrack_tuple_hash *h = NULL;
	struct nf_conntrack_expect *exp;
	struct nf_conn *ct;
	int err;

	/* caller guarantees that those three CTA_EXPECT_* exist */
	err = ctnetlink_parse_tuple(cda,
&tuple, CTA_EXPECT_TUPLE,
				    u3, NULL);
	if (err < 0)
		return err;
	err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK,
				    u3, NULL);
	if (err < 0)
		return err;
	err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER,
				    u3, NULL);
	if (err < 0)
		return err;

	/* Look for master conntrack of this expectation */
	h = nf_conntrack_find_get(net, zone, &master_tuple);
	if (!h)
		return -ENOENT;
	ct = nf_ct_tuplehash_to_ctrack(h);

	/*
	 * ctnetlink_alloc_expect() uses rcu_dereference() on the helper
	 * pointer, so it is called inside an RCU read-side section.
	 */
	rcu_read_lock();
	exp = ctnetlink_alloc_expect(cda, ct, NULL, &tuple, &mask);
	if (IS_ERR(exp)) {
		err = PTR_ERR(exp);
		goto err_rcu;
	}

	err = nf_ct_expect_related_report(exp, portid, report, 0);
	/* Drop the local reference whether or not registration succeeded. */
	nf_ct_expect_put(exp);
err_rcu:
	rcu_read_unlock();
	nf_ct_put(ct);

	return err;
}

/*
 * Netlink handler for expectation NEW requests.
 *
 * CTA_EXPECT_TUPLE, CTA_EXPECT_MASK and CTA_EXPECT_MASTER are mandatory.
 * If no expectation matches the tuple, one is created when NLM_F_CREATE is
 * set, otherwise -ENOENT is returned.  If one already exists it is updated
 * through ctnetlink_change_expect() unless NLM_F_EXCL is set, in which case
 * -EEXIST is returned.
 */
static int ctnetlink_new_expect(struct sk_buff *skb,
				const struct nfnl_info *info,
				const struct nlattr * const cda[])
{
	u_int8_t u3 = info->nfmsg->nfgen_family;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_expect *exp;
	struct nf_conntrack_zone zone;
	int err;

	if (!cda[CTA_EXPECT_TUPLE]
	    || !cda[CTA_EXPECT_MASK]
	    || !cda[CTA_EXPECT_MASTER])
		return -EINVAL;

	err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
	if (err < 0)
		return err;

	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
				    u3, NULL);
	if (err < 0)
		return err;

	spin_lock_bh(&nf_conntrack_expect_lock);
	exp = __nf_ct_expect_find(info->net, &zone, &tuple);
	if (!exp) {
		/* The lock is released before the creation path runs. */
		spin_unlock_bh(&nf_conntrack_expect_lock);
		err = -ENOENT;
		if (info->nlh->nlmsg_flags & NLM_F_CREATE) {
			err = ctnetlink_create_expect(info->net, &zone, cda, u3,
						      NETLINK_CB(skb).portid,
						      nlmsg_report(info->nlh));
		}
		return err;
	}

	/* The existing expectation is modified while the lock is still held. */
	err = -EEXIST;
	if (!(info->nlh->nlmsg_flags & NLM_F_EXCL))
		err = ctnetlink_change_expect(exp, cda);
	spin_unlock_bh(&nf_conntrack_expect_lock);

	return err;
}

/*
 * Append one per-CPU expectation statistics message to @skb.
 * NLM_F_MULTI is set in the message flags when @portid is non-zero.
 */
static int
ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu,
			     const struct ip_conntrack_stat *st)
{
	struct nlmsghdr *nlh;
	unsigned int flags = portid ?
NLM_F_MULTI : 0, event;

	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
			      IPCTNL_MSG_EXP_GET_STATS_CPU);
	/* The CPU index is handed to nfnl_msg_put() in network byte order. */
	nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
			   NFNETLINK_V0, htons(cpu));
	if (!nlh)
		goto nlmsg_failure;

	if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) ||
	    nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) ||
	    nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return skb->len;

nla_put_failure:
nlmsg_failure:
	/* Both failure labels share the same unwind and return -1. */
	nlmsg_cancel(skb, nlh);
	return -1;
}

/*
 * Dump callback emitting one statistics message per possible CPU.
 *
 * cb->args[0] holds the CPU index at which to resume; once it equals
 * nr_cpu_ids the dump is complete and 0 is returned.  Otherwise the loop
 * stops at the first CPU whose message cannot be added, records that index
 * for the next invocation and returns skb->len.
 */
static int
ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	int cpu;
	struct net *net = sock_net(skb->sk);

	if (cb->args[0] == nr_cpu_ids)
		return 0;

	for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
		const struct ip_conntrack_stat *st;

		if (!cpu_possible(cpu))
			continue;

		st = per_cpu_ptr(net->ct.stat, cpu);
		if (ctnetlink_exp_stat_fill_info(skb, NETLINK_CB(cb->skb).portid,
						 cb->nlh->nlmsg_seq,
						 cpu, st) < 0)
			break;
	}
	cb->args[0] = cpu;

	return skb->len;
}

/*
 * Netlink handler for per-CPU expectation statistics.  Only dump requests
 * (NLM_F_DUMP) do any work; anything else returns 0 without a reply.
 */
static int ctnetlink_stat_exp_cpu(struct sk_buff *skb,
				  const struct nfnl_info *info,
				  const struct nlattr * const cda[])
{
	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
		struct netlink_dump_control c = {
			.dump = ctnetlink_exp_stat_cpu_dump,
		};
		return netlink_dump_start(info->sk, skb, info->nlh, &c);
	}

	return 0;
}

#ifdef CONFIG_NF_CONNTRACK_EVENTS
/* Event callbacks registered per network namespace in ctnetlink_net_init(). */
static struct nf_ct_event_notifier ctnl_notifier = {
	.ct_event = ctnetlink_conntrack_event,
	.exp_event = ctnetlink_expect_event,
};
#endif

/* Message handlers of the conntrack subsystem, indexed by message type. */
static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
	[IPCTNL_MSG_CT_NEW]	= {
		.call		= ctnetlink_new_conntrack,
		.type		= NFNL_CB_MUTEX,
		.attr_count	= CTA_MAX,
		.policy		= ct_nla_policy
	},
	[IPCTNL_MSG_CT_GET]	= {
		.call		= ctnetlink_get_conntrack,
		.type		= NFNL_CB_MUTEX,
		.attr_count	= CTA_MAX,
		.policy		= ct_nla_policy
	},
	[IPCTNL_MSG_CT_DELETE]	= {
		.call		= ctnetlink_del_conntrack,
		.type		= NFNL_CB_MUTEX,
		.attr_count	= CTA_MAX,
		.policy		= ct_nla_policy
	},
	[IPCTNL_MSG_CT_GET_CTRZERO]
= {
		/* Shares its handler with IPCTNL_MSG_CT_GET. */
		.call		= ctnetlink_get_conntrack,
		.type		= NFNL_CB_MUTEX,
		.attr_count	= CTA_MAX,
		.policy		= ct_nla_policy
	},
	/* The remaining entries declare no attribute policy. */
	[IPCTNL_MSG_CT_GET_STATS_CPU] = {
		.call		= ctnetlink_stat_ct_cpu,
		.type		= NFNL_CB_MUTEX,
	},
	[IPCTNL_MSG_CT_GET_STATS] = {
		.call		= ctnetlink_stat_ct,
		.type		= NFNL_CB_MUTEX,
	},
	[IPCTNL_MSG_CT_GET_DYING] = {
		.call		= ctnetlink_get_ct_dying,
		.type		= NFNL_CB_MUTEX,
	},
	[IPCTNL_MSG_CT_GET_UNCONFIRMED]	= {
		.call		= ctnetlink_get_ct_unconfirmed,
		.type		= NFNL_CB_MUTEX,
	},
};

/* Message handlers of the expectation subsystem, indexed by message type. */
static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
	[IPCTNL_MSG_EXP_GET]	= {
		.call		= ctnetlink_get_expect,
		.type		= NFNL_CB_MUTEX,
		.attr_count	= CTA_EXPECT_MAX,
		.policy		= exp_nla_policy
	},
	[IPCTNL_MSG_EXP_NEW]	= {
		.call		= ctnetlink_new_expect,
		.type		= NFNL_CB_MUTEX,
		.attr_count	= CTA_EXPECT_MAX,
		.policy		= exp_nla_policy
	},
	[IPCTNL_MSG_EXP_DELETE]	= {
		.call		= ctnetlink_del_expect,
		.type		= NFNL_CB_MUTEX,
		.attr_count	= CTA_EXPECT_MAX,
		.policy		= exp_nla_policy
	},
	[IPCTNL_MSG_EXP_GET_STATS_CPU] = {
		.call		= ctnetlink_stat_exp_cpu,
		.type		= NFNL_CB_MUTEX,
	},
};

/* nfnetlink subsystem descriptor for conntrack messages. */
static const struct nfnetlink_subsystem ctnl_subsys = {
	.name				= "conntrack",
	.subsys_id			= NFNL_SUBSYS_CTNETLINK,
	.cb_count			= IPCTNL_MSG_MAX,
	.cb				= ctnl_cb,
};

/* nfnetlink subsystem descriptor for expectation messages. */
static const struct nfnetlink_subsystem ctnl_exp_subsys = {
	.name				= "conntrack_expect",
	.subsys_id			= NFNL_SUBSYS_CTNETLINK_EXP,
	.cb_count			= IPCTNL_MSG_EXP_MAX,
	.cb				= ctnl_exp_cb,
};

MODULE_ALIAS("ip_conntrack_netlink");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);

/*
 * Per-namespace init: registers the event notifier when conntrack events
 * are configured; otherwise does nothing.  Always returns 0.
 */
static int __net_init ctnetlink_net_init(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	nf_conntrack_register_notifier(net, &ctnl_notifier);
#endif
	return 0;
}

/* Per-namespace pre-exit: unregisters the event notifier, if configured. */
static void ctnetlink_net_pre_exit(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	nf_conntrack_unregister_notifier(net);
#endif
}

static struct pernet_operations ctnetlink_net_ops = {
	.init		= ctnetlink_net_init,
	.pre_exit	= ctnetlink_net_pre_exit,
};

/*
 * Module init: registers both nfnetlink subsystems and the pernet
 * operations, unwinding earlier registrations when a later one fails.
 */
static int __init ctnetlink_init(void)
{
int ret;

	NL_ASSERT_CTX_FITS(struct ctnetlink_list_dump_ctx);

	/* Registration order: conntrack subsystem, expectation subsystem, pernet ops. */
	ret = nfnetlink_subsys_register(&ctnl_subsys);
	if (ret < 0) {
		pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
		goto err_out;
	}

	ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
	if (ret < 0) {
		pr_err("ctnetlink_init: cannot register exp with nfnetlink.\n");
		goto err_unreg_subsys;
	}

	ret = register_pernet_subsys(&ctnetlink_net_ops);
	if (ret < 0) {
		pr_err("ctnetlink_init: cannot register pernet operations\n");
		goto err_unreg_exp_subsys;
	}
#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
	/* setup interaction between nf_queue and nf_conntrack_netlink. */
	RCU_INIT_POINTER(nfnl_ct_hook, &ctnetlink_glue_hook);
#endif
	return 0;

	/* Error unwinding undoes the registrations in reverse order. */
err_unreg_exp_subsys:
	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
err_unreg_subsys:
	nfnetlink_subsys_unregister(&ctnl_subsys);
err_out:
	return ret;
}

/*
 * Module exit: unregisters the pernet operations and both subsystems, then
 * clears the glue hook (when configured) and calls synchronize_rcu().
 * NOTE(review): the synchronize_rcu() presumably waits for RCU readers of
 * nfnl_ct_hook before the module text goes away -- confirm.
 */
static void __exit ctnetlink_exit(void)
{
	unregister_pernet_subsys(&ctnetlink_net_ops);
	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
	nfnetlink_subsys_unregister(&ctnl_subsys);
#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
	RCU_INIT_POINTER(nfnl_ct_hook, NULL);
#endif
	synchronize_rcu();
}

module_init(ctnetlink_init);
module_exit(ctnetlink_exit);
16 26 76 24 24 24 58 1718 1719 1718 1718 1715 1752 44 1719 712 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 // SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- * sysctl_net.c: sysctl interface to net subsystem. * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net directories for each protocol family. [MS] * * Revision 1.2 1996/05/08 20:24:40 shaver * Added bits for NET_BRIDGE and the NET_IPV4_ARP stuff and * NET_IPV4_IP_FORWARD. * * */ #include <linux/mm.h> #include <linux/export.h> #include <linux/sysctl.h> #include <linux/nsproxy.h> #include <net/sock.h> #ifdef CONFIG_INET #include <net/ip.h> #endif #ifdef CONFIG_NET #include <linux/if_ether.h> #endif static struct ctl_table_set * net_ctl_header_lookup(struct ctl_table_root *root) { return &current->nsproxy->net_ns->sysctls; } static int is_seen(struct ctl_table_set *set) { return &current->nsproxy->net_ns->sysctls == set; } /* Return standard mode bits for table entry. */ static int net_ctl_permissions(struct ctl_table_header *head, const struct ctl_table *table) { struct net *net = container_of(head->set, struct net, sysctls); /* Allow network administrator to have same access as root. 
*/ if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN)) { int mode = (table->mode >> 6) & 7; return (mode << 6) | (mode << 3) | mode; } return table->mode; } static void net_ctl_set_ownership(struct ctl_table_header *head, kuid_t *uid, kgid_t *gid) { struct net *net = container_of(head->set, struct net, sysctls); kuid_t ns_root_uid; kgid_t ns_root_gid; ns_root_uid = make_kuid(net->user_ns, 0); if (uid_valid(ns_root_uid)) *uid = ns_root_uid; ns_root_gid = make_kgid(net->user_ns, 0); if (gid_valid(ns_root_gid)) *gid = ns_root_gid; } static struct ctl_table_root net_sysctl_root = { .lookup = net_ctl_header_lookup, .permissions = net_ctl_permissions, .set_ownership = net_ctl_set_ownership, }; static int __net_init sysctl_net_init(struct net *net) { setup_sysctl_set(&net->sysctls, &net_sysctl_root, is_seen); return 0; } static void __net_exit sysctl_net_exit(struct net *net) { retire_sysctl_set(&net->sysctls); } static struct pernet_operations sysctl_pernet_ops = { .init = sysctl_net_init, .exit = sysctl_net_exit, }; static struct ctl_table_header *net_header; __init int net_sysctl_init(void) { static struct ctl_table empty[1]; int ret = -ENOMEM; /* Avoid limitations in the sysctl implementation by * registering "/proc/sys/net" as an empty directory not in a * network namespace. */ net_header = register_sysctl_sz("net", empty, 0); if (!net_header) goto out; ret = register_pernet_subsys(&sysctl_pernet_ops); if (ret) goto out1; out: return ret; out1: unregister_sysctl_table(net_header); net_header = NULL; goto out; } /* Verify that sysctls for non-init netns are safe by either: * 1) being read-only, or * 2) having a data pointer which points outside of the global kernel/module * data segment, and rather into the heap where a per-net object was * allocated. 
*/
static void ensure_safe_net_sysctl(struct net *net, const char *path,
				   struct ctl_table *table, size_t table_size)
{
	size_t idx;

	pr_debug("Registering net sysctl (net %p): %s\n", net, path);

	for (idx = 0; idx < table_size; idx++) {
		struct ctl_table *entry = &table[idx];
		unsigned long data_addr;
		const char *segment;

		pr_debug(" procname=%s mode=%o proc_handler=%ps data=%p\n",
			 entry->procname, entry->mode, entry->proc_handler,
			 entry->data);

		/* Entries nobody can write to cannot leak across namespaces. */
		if (!(entry->mode & 0222)) {
			pr_debug(" Not writable by anyone\n");
			continue;
		}

		/* Classify the data pointer; anything else is left alone. */
		data_addr = (unsigned long)entry->data;
		if (is_module_address(data_addr))
			segment = "module";
		else if (is_kernel_core_data(data_addr))
			segment = "kernel";
		else
			continue;

		/* Writable and backed by kernel/module global data:
		 * most likely state shared between namespaces.
		 */
		WARN(1, "sysctl %s/%s: data points to %s global data: %ps\n",
		     path, entry->procname, segment, entry->data);

		/* Strip every write bit so the entry becomes harmless. */
		entry->mode &= ~0222;
	}
}

/*
 * Register @table under @path in the sysctl set of @net.  Tables for any
 * namespace other than init_net are first passed through
 * ensure_safe_net_sysctl().
 */
struct ctl_table_header *register_net_sysctl_sz(struct net *net,
						const char *path,
						struct ctl_table *table,
						size_t table_size)
{
	const bool is_init_ns = net_eq(net, &init_net);

	if (!is_init_ns)
		ensure_safe_net_sysctl(net, path, table, table_size);

	return __register_sysctl_table(&net->sysctls, path, table, table_size);
}
EXPORT_SYMBOL_GPL(register_net_sysctl_sz);

/* Counterpart of register_net_sysctl_sz(): unregister @header. */
void unregister_net_sysctl_table(struct ctl_table_header *header)
{
	unregister_sysctl_table(header);
}
EXPORT_SYMBOL_GPL(unregister_net_sysctl_table);
13 14 13 14 14 14 14 14 14 14 13 14 14 14 26 27 27 22 10 25 9 25 24 27 1 1 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 
1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 
1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 
1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 
2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.c * * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> * Modified by * Allison Henderson <achender@linux.vnet.ibm.com> * Hugh Dickins <hughd@google.com> * Zheng Liu <wenqing.lz@taobao.com> * * Ext4 extents status tree core functions. */ #include <linux/list_sort.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include "ext4.h" #include <trace/events/ext4.h> #include <kunit/static_stub.h> /* * According to previous discussion in Ext4 Developer Workshop, we * will introduce a new structure called io tree to track all extent * status in order to solve some problems that we have met * (e.g. Reservation space warning), and provide extent-level locking. * Delay extent tree is the first step to achieve this goal. It is * original built by Yongqiang Yang. At that time it is called delay * extent tree, whose goal is only track delayed extents in memory to * simplify the implementation of fiemap and bigalloc, and introduce * lseek SEEK_DATA/SEEK_HOLE support. 
That is why it is still called * delay extent tree at the first commit. But to better convey * what it does, it has been renamed to extent status tree. * * Step1: * Currently the first step has been done. All delayed extents are * tracked in the tree. It maintains the delayed extent when a delayed * allocation is issued, and the delayed extent is written out or * invalidated. Therefore the implementation of fiemap and bigalloc * is simplified, and SEEK_DATA/SEEK_HOLE are introduced. * * The following comment describes the implementation of extent * status tree and future works. * * Step2: * In this step all extent status are tracked by extent status tree. * Thus, we can first try to lookup a block mapping in this tree before * finding it in extent tree. Hence, single extent cache can be removed * because extent status tree can do a better job. Extents in status * tree are loaded on-demand. Therefore, the extent status tree may not * contain all of the extents in a file. Meanwhile we define a shrinker * to reclaim memory from extent status tree because fragmented extent * tree will make status tree cost too much memory. written/unwritten/- * hole extents in the tree will be reclaimed by this shrinker when we * are under high memory pressure. Delayed extents will not be * reclaimed because fiemap, bigalloc, and seek_data/hole need them. */ /* * Extent status tree implementation for ext4. * * * ========================================================================== * Extent status tree tracks all extent status. * * 1. Why we need to implement extent status tree? * * Without extent status tree, ext4 identifies a delayed extent by looking * up page cache, this has several deficiencies - complicated, buggy, * and inefficient code. * * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a * block or a range of blocks belongs to a delayed extent. * * Let us have a look at how they do without extent status tree. 
* -- FIEMAP * FIEMAP looks up page cache to identify delayed allocations from holes. * * -- SEEK_HOLE/DATA * SEEK_HOLE/DATA has the same problem as FIEMAP. * * -- bigalloc * bigalloc looks up page cache to figure out if a block is * already under delayed allocation or not to determine whether * quota reserving is needed for the cluster. * * -- writeout * Writeout looks up whole page cache to see if a buffer is * mapped. If there are not very many delayed buffers, then it is * time consuming. * * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA, * bigalloc and writeout can figure out if a block or a range of * blocks is under delayed allocation (belongs to a delayed extent) or * not by searching the extent tree. * * * ========================================================================== * 2. Ext4 extent status tree implementation * * -- extent * An extent is a range of blocks which are contiguous logically and * physically. Unlike extent in extent tree, this extent in ext4 is * an in-memory struct, there is no corresponding on-disk data. There * is no limit on length of extent, so an extent can contain as many * blocks as they are contiguous logically and physically. * * -- extent status tree * Every inode has an extent status tree and all allocation blocks * are added to the tree with different status. The extents in the * tree are ordered by logical block no. * * -- operations on an extent status tree * There are three important operations on a delayed extent tree: find * next extent, adding an extent (a range of blocks) and removing an extent. * * -- race on an extent status tree * Extent status tree is protected by inode->i_es_lock. * * -- memory consumption * Fragmented extent tree will make extent status tree cost too much * memory. Hence, we will reclaim written/unwritten/hole extents from * the tree under a heavy memory pressure. * * ========================================================================== * 3. 
Assurance of Ext4 extent status tree consistency * * When mapping blocks, Ext4 queries the extent status tree first and should * always trust that the extent status tree is consistent and up to date. * Therefore, it is important to adhere to the following rules when creating, * modifying and removing extents. * * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings, * the extent information should always be processed through the extent * status tree instead of being organized manually through the on-disk * extent tree. * * 2. When updating the extent tree, Ext4 should acquire the i_data_sem * exclusively and update the extent status tree atomically. If the extents * to be modified are large enough to exceed the range that a single * i_data_sem can process (as ext4_datasem_ensure_credits() may drop * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole() * does): * * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures * exclusion against page faults, as well as reads and writes that may * concurrently modify the extent status tree. * b) Evict all page cache in the affected range and recommend rebuilding * or dropping the extent status tree after modifying the on-disk * extent tree. This ensures exclusion against concurrent writebacks * that do not hold those locks but only hold a folio lock. * * 3. Based on the rules above, when querying block mappings, Ext4 should at * least hold the i_rwsem or invalidate_lock or folio lock(s) for the * specified querying range. * * ========================================================================== * 4. Performance analysis * * -- overhead * 1. There is a cache extent for write access, so if writes are * not very random, adding space operations are in O(1) time. * * -- gain * 2. Code is much simpler, more readable, more maintainable and * more efficient. * * * ========================================================================== * 5. 
TODO list
 *
 * -- Refactor delayed space reservation
 *
 * -- Extent-level locking
 */

/* Slab caches used for struct extent_status and struct pending_reservation. */
static struct kmem_cache *ext4_es_cachep;
static struct kmem_cache *ext4_pending_cachep;

/* Forward declarations of file-local helpers. */
static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
			      struct extent_status *prealloc);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end, unsigned int status,
			      int *reserved, struct extent_status *res,
			      struct extent_status *prealloc);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
		       struct ext4_inode_info *locked_ei);
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
			    ext4_lblk_t len,
			    struct pending_reservation **prealloc);

/*
 * Create the slab cache for struct extent_status.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
int __init ext4_init_es(void)
{
	ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
	if (ext4_es_cachep == NULL)
		return -ENOMEM;
	return 0;
}

/* Destroy the extent_status slab cache created by ext4_init_es(). */
void ext4_exit_es(void)
{
	kmem_cache_destroy(ext4_es_cachep);
}

/* Reset @tree to an empty rb-tree with no cached extent. */
void ext4_es_init_tree(struct ext4_es_tree *tree)
{
	tree->root = RB_ROOT;
	tree->cache_es = NULL;
}

#ifdef ES_DEBUG__
/* Debug aid: print every extent of @inode's status tree in tree order. */
static void ext4_es_print_tree(struct inode *inode)
{
	struct ext4_es_tree *tree;
	struct rb_node *node;

	printk(KERN_DEBUG "status extents for inode %llu:", inode->i_ino);
	tree = &EXT4_I(inode)->i_es_tree;
	node = rb_first(&tree->root);
	while (node) {
		struct extent_status *es;
		es = rb_entry(node, struct extent_status, rb_node);
		printk(KERN_DEBUG " [%u/%u) %llu %x",
		       es->es_lblk, es->es_len,
		       ext4_es_pblock(es), ext4_es_status(es));
		node = rb_next(node);
	}
	printk(KERN_DEBUG "\n");
}
#else
/* Without ES_DEBUG__ the tree dump expands to nothing. */
#define ext4_es_print_tree(inode)
#endif

/*
 * Return the last logical block covered by @es (es_lblk + es_len - 1).
 * BUGs if es_lblk + es_len compares below es_lblk, i.e. the sum wrapped.
 */
static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
	return es->es_lblk + es->es_len - 1;
}

/* Advance the inode's i_es_seq counter by one, stored with WRITE_ONCE(). */
static inline void ext4_es_inc_seq(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1);
}

/*
 * Check that the type of @es has at least one bit in common with @status.
 *
 * Returns 0 when it does.  Otherwise returns -EINVAL and, if @res is not
 * NULL, copies es_lblk, es_len and es_pblk of @es into @res so the caller
 * can report the mismatching extent.
 */
static inline int __es_check_extent_status(struct extent_status *es,
					   unsigned int status,
					   struct extent_status *res)
{
	if (ext4_es_type(es) & status)
		return 0;

	if (res) {
		res->es_lblk = es->es_lblk;
		res->es_len = es->es_len;
		res->es_pblk = es->es_pblk;
	}
	return -EINVAL;
}

/*
 * Search the tree for the extent that contains block @lblk.  If there is
 * none, return the next extent in tree order after @lblk, or NULL when no
 * such extent exists (or the tree is empty).
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
					      ext4_lblk_t lblk)
{
	struct rb_node *node = root->rb_node;
	struct extent_status *es = NULL;

	while (node) {
		es = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es))
			node = node->rb_right;
		else
			return es;
	}

	/* No hit: es is the last node visited on the way down, if any. */
	if (es && lblk < es->es_lblk)
		return es;

	if (es && lblk > ext4_es_end(es)) {
		node = rb_next(&es->rb_node);
		return node ? rb_entry(node, struct extent_status, rb_node) :
			      NULL;
	}

	return NULL;
}

/*
 * ext4_es_find_extent_range - find extent with specified status within block
 *                             range or next extent following block range in
 *                             extents status tree
 *
 * @inode - file containing the range
 * @matching_fn - pointer to function that matches extents with desired status
 * @lblk - logical block defining start of range
 * @end - logical block defining end of range
 * @es - extent found, if any
 *
 * Find the first extent within the block range specified by @lblk and @end
 * in the extents status tree that satisfies @matching_fn.  If a match
 * is found, it's returned in @es.  If not, and a matching extent is found
 * beyond the block range, it's returned in @es.  If no match is found, an
 * extent is returned in @es whose es_lblk, es_len, and es_pblk components
 * are 0.
*/
static void __es_find_extent_range(struct inode *inode,
				   int (*matching_fn)(struct extent_status *es),
				   ext4_lblk_t lblk, ext4_lblk_t end,
				   struct extent_status *es)
{
	struct ext4_es_tree *tree = NULL;
	struct extent_status *es1 = NULL;
	struct rb_node *node;

	WARN_ON(es == NULL);
	WARN_ON(end < lblk);

	tree = &EXT4_I(inode)->i_es_tree;

	/* see if the extent has been cached */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	es1 = READ_ONCE(tree->cache_es);
	if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
		es_debug("%u cached by [%u/%u) %llu %x\n",
			 lblk, es1->es_lblk, es1->es_len,
			 ext4_es_pblock(es1), ext4_es_status(es1));
		goto out;
	}

	es1 = __es_tree_search(&tree->root, lblk);

out:
	/*
	 * The starting extent does not match: walk forward in tree order
	 * until one matches.  The walk gives up with no result as soon as
	 * an extent starts beyond @end.
	 */
	if (es1 && !matching_fn(es1)) {
		while ((node = rb_next(&es1->rb_node)) != NULL) {
			es1 = rb_entry(node, struct extent_status, rb_node);
			if (es1->es_lblk > end) {
				es1 = NULL;
				break;
			}
			if (matching_fn(es1))
				break;
		}
	}

	/*
	 * Test again: if the walk ran off the end of the tree, es1 still
	 * points at the last non-matching extent and must not be reported.
	 * A match is also remembered as the tree's cached extent.
	 */
	if (es1 && matching_fn(es1)) {
		WRITE_ONCE(tree->cache_es, es1);
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
	}

}

/*
 * Locking for __es_find_extent_range() for external use
 *
 * @es is always zeroed first, and stays zeroed when the filesystem is in
 * fast-commit replay (EXT4_FC_REPLAY), in which case the tree is not
 * consulted at all.  Otherwise the search runs under the read side of
 * i_es_lock.
 */
void ext4_es_find_extent_range(struct inode *inode,
			       int (*matching_fn)(struct extent_status *es),
			       ext4_lblk_t lblk, ext4_lblk_t end,
			       struct extent_status *es)
{
	es->es_lblk = es->es_len = es->es_pblk = 0;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return;

	trace_ext4_es_find_extent_range_enter(inode, lblk);

	read_lock(&EXT4_I(inode)->i_es_lock);
	__es_find_extent_range(inode, matching_fn, lblk, end, es);
	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_find_extent_range_exit(inode, es);
}

/*
 * __es_scan_range - search block range for block with specified status
 *                   in extents status tree
 *
 * @inode - file containing the range
 * @matching_fn - pointer to function that matches extents with desired status
 * @lblk - logical block defining start of range
 * @end - logical block defining end of range
 *
 * Returns true if at least one block in the specified block
range satisfies * the criterion specified by @matching_fn, and false if not. If at least * one extent has the specified status, then there is at least one block * in the cluster with that status. Should only be called by code that has * taken i_es_lock. */ static bool __es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t start, ext4_lblk_t end) { struct extent_status es; __es_find_extent_range(inode, matching_fn, start, end, &es); if (es.es_len == 0) return false; /* no matching extent in the tree */ else if (es.es_lblk <= start && start < es.es_lblk + es.es_len) return true; else if (start <= es.es_lblk && es.es_lblk <= end) return true; else return false; } /* * Locking for __es_scan_range() for external use */ bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end) { bool ret; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return false; read_lock(&EXT4_I(inode)->i_es_lock); ret = __es_scan_range(inode, matching_fn, lblk, end); read_unlock(&EXT4_I(inode)->i_es_lock); return ret; } /* * __es_scan_clu - search cluster for block with specified status in * extents status tree * * @inode - file containing the cluster * @matching_fn - pointer to function that matches extents with desired status * @lblk - logical block in cluster to be searched * * Returns true if at least one extent in the cluster containing @lblk * satisfies the criterion specified by @matching_fn, and false if not. If at * least one extent has the specified status, then there is at least one block * in the cluster with that status. Should only be called by code that has * taken i_es_lock. 
*/ static bool __es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; lblk_start = EXT4_LBLK_CMASK(sbi, lblk); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; return __es_scan_range(inode, matching_fn, lblk_start, lblk_end); } /* * Locking for __es_scan_clu() for external use */ bool ext4_es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk) { bool ret; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return false; read_lock(&EXT4_I(inode)->i_es_lock); ret = __es_scan_clu(inode, matching_fn, lblk); read_unlock(&EXT4_I(inode)->i_es_lock); return ret; } static void ext4_es_list_add(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (!list_empty(&ei->i_es_list)) return; spin_lock(&sbi->s_es_lock); if (list_empty(&ei->i_es_list)) { list_add_tail(&ei->i_es_list, &sbi->s_es_list); sbi->s_es_nr_inode++; } spin_unlock(&sbi->s_es_lock); } static void ext4_es_list_del(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); spin_lock(&sbi->s_es_lock); if (!list_empty(&ei->i_es_list)) { list_del_init(&ei->i_es_list); sbi->s_es_nr_inode--; WARN_ON_ONCE(sbi->s_es_nr_inode < 0); } spin_unlock(&sbi->s_es_lock); } static inline struct pending_reservation *__alloc_pending(bool nofail) { if (!nofail) return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL); } static inline void __free_pending(struct pending_reservation *pr) { kmem_cache_free(ext4_pending_cachep, pr); } /* * Returns true if we cannot fail to allocate memory for this extent_status * entry and cannot reclaim it until its status changes. 
*/ static inline bool ext4_es_must_keep(struct extent_status *es) { /* fiemap, bigalloc, and seek_data/hole need to use it. */ if (ext4_es_is_delayed(es)) return true; return false; } static inline struct extent_status *__es_alloc_extent(bool nofail) { if (!nofail) return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL); } static void ext4_es_init_extent(struct inode *inode, struct extent_status *es, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) { es->es_lblk = lblk; es->es_len = len; es->es_pblk = pblk; /* We never try to reclaim a must kept extent, so we don't count it. */ if (!ext4_es_must_keep(es)) { if (!EXT4_I(inode)->i_es_shk_nr++) ext4_es_list_add(inode); percpu_counter_inc(&EXT4_SB(inode->i_sb)-> s_es_stats.es_stats_shk_cnt); } EXT4_I(inode)->i_es_all_nr++; percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); } static inline void __es_free_extent(struct extent_status *es) { kmem_cache_free(ext4_es_cachep, es); } static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) { EXT4_I(inode)->i_es_all_nr--; percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); /* Decrease the shrink counter when we can reclaim the extent. */ if (!ext4_es_must_keep(es)) { BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); if (!--EXT4_I(inode)->i_es_shk_nr) ext4_es_list_del(inode); percpu_counter_dec(&EXT4_SB(inode->i_sb)-> s_es_stats.es_stats_shk_cnt); } __es_free_extent(es); } /* * Check whether or not two extents can be merged * Condition: * - logical block number is contiguous * - physical block number is contiguous * - status is equal */ static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) { if (ext4_es_type(es1) != ext4_es_type(es2)) return 0; if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { pr_warn("ES assertion failed when merging extents. 
" "The sum of lengths of es1 (%d) and es2 (%d) " "is bigger than allowed file size (%d)\n", es1->es_len, es2->es_len, EXT_MAX_BLOCKS); WARN_ON(1); return 0; } if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) return 0; if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) return 1; if (ext4_es_is_hole(es1)) return 1; /* we need to check delayed extent */ if (ext4_es_is_delayed(es1)) return 1; return 0; } static struct extent_status * ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct extent_status *es1; struct rb_node *node; node = rb_prev(&es->rb_node); if (!node) return es; es1 = rb_entry(node, struct extent_status, rb_node); if (ext4_es_can_be_merged(es1, es)) { es1->es_len += es->es_len; if (ext4_es_is_referenced(es)) ext4_es_set_referenced(es1); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); es = es1; } return es; } static struct extent_status * ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct extent_status *es1; struct rb_node *node; node = rb_next(&es->rb_node); if (!node) return es; es1 = rb_entry(node, struct extent_status, rb_node); if (ext4_es_can_be_merged(es, es1)) { es->es_len += es1->es_len; if (ext4_es_is_referenced(es1)) ext4_es_set_referenced(es); rb_erase(node, &tree->root); ext4_es_free_extent(inode, es1); } return es; } #ifdef ES_AGGRESSIVE_TEST #include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */ static void ext4_es_insert_extent_ext_check(struct inode *inode, struct extent_status *es) { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; ext4_lblk_t ee_block; ext4_fsblk_t ee_start; unsigned short ee_len; int depth, ee_status, es_status; path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return; depth = 
	ext_depth(inode);
	ex = path[depth].p_ext;

	if (ex) {

		ee_block = le32_to_cpu(ex->ee_block);
		ee_start = ext4_ext_pblock(ex);
		ee_len = ext4_ext_get_actual_len(ex);

		/* reduce both sides to 1 (unwritten) or 0 for comparison */
		ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
		es_status = ext4_es_is_unwritten(es) ? 1 : 0;

		/*
		 * Make sure ex and es are not overlap when we try to insert
		 * a delayed/hole extent.
		 */
		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
			if (in_range(es->es_lblk, ee_block, ee_len)) {
				pr_warn("ES insert assertion failed for "
					"inode: %llu we can find an extent "
					"at block [%d/%d/%llu/%c], but we "
					"want to add a delayed/hole extent "
					"[%d/%d/%llu/%x]\n",
					inode->i_ino, ee_block, ee_len,
					ee_start, ee_status ? 'u' : 'w',
					es->es_lblk, es->es_len,
					ext4_es_pblock(es), ext4_es_status(es));
			}
			goto out;
		}

		/*
		 * We don't check ee_block == es->es_lblk, etc. because es
		 * might be a part of whole extent, vice versa.
		 */
		if (es->es_lblk < ee_block ||
		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
			pr_warn("ES insert assertion failed for inode: %llu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
			goto out;
		}

		/* same location, but written/unwritten state disagrees */
		if (ee_status ^ es_status) {
			pr_warn("ES insert assertion failed for inode: %llu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
		}
	} else {
		/*
		 * We can't find an extent on disk.  So we need to make sure
		 * that we don't want to add an written/unwritten extent.
		 */
		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
			pr_warn("ES insert assertion failed for inode: %llu "
				"can't find an extent at block %d but we want "
				"to add a written/unwritten extent "
				"[%d/%d/%llu/%x]\n", inode->i_ino,
				es->es_lblk, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
		}
	}
out:
	ext4_free_ext_path(path);
}

/*
 * Debug-only check for inodes without the EXTENTS flag: look the range up
 * through ext4_ind_map_blocks() and warn when the result disagrees with the
 * status extent about to be inserted.
 */
static void ext4_es_insert_extent_ind_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_map_blocks map;
	int retval;

	/*
	 * Here we call ext4_ind_map_blocks to lookup a block mapping because
	 * 'Indirect' structure is defined in indirect.c.  So we couldn't
	 * access direct/indirect tree from outside.  It is too dirty to define
	 * this function in indirect.c file.
	 */

	map.m_lblk = es->es_lblk;
	map.m_len = es->es_len;

	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
	if (retval > 0) {
		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
			/*
			 * We want to add a delayed/hole extent but this
			 * block has been allocated.
			 */
			pr_warn("ES insert assertion failed for inode: %llu "
				"We can find blocks but we want to add a "
				"delayed/hole extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		} else if (ext4_es_is_written(es)) {
			if (retval != es->es_len) {
				pr_warn("ES insert assertion failed for "
					"inode: %llu retval %d != es_len %d\n",
					inode->i_ino, retval, es->es_len);
				return;
			}
			if (map.m_pblk != ext4_es_pblock(es)) {
				pr_warn("ES insert assertion failed for "
					"inode: %llu m_pblk %llu != "
					"es_pblk %llu\n",
					inode->i_ino, map.m_pblk,
					ext4_es_pblock(es));
				return;
			}
		} else {
			/*
			 * We don't need to check unwritten extent because
			 * indirect-based file doesn't have it.
			 */
			BUG();
		}
	} else if (retval == 0) {
		if (ext4_es_is_written(es)) {
			pr_warn("ES insert assertion failed for inode: %llu "
				"We can't find the block but we want to add "
				"a written extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		}
	}
}

/*
 * Debug-only entry point: run the extent-tree or the indirect-map
 * consistency check, depending on the inode's EXT4_INODE_EXTENTS flag.
 */
static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
	/*
	 * We don't need to worry about the race condition because
	 * caller takes i_data_sem locking.
	 */
	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		ext4_es_insert_extent_ext_check(inode, es);
	else
		ext4_es_insert_extent_ind_check(inode, es);
}
#else
/* Without ES_AGGRESSIVE_TEST the insertion check is an empty stub. */
static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
}
#endif

/*
 * Add @newes to the inode's extent status tree.
 *
 * While descending, @newes is merged into an existing node when
 * ext4_es_can_be_merged() allows it; otherwise a new node is linked in,
 * taken from @prealloc if that is not NULL and allocated otherwise.
 * The node that ends up holding the range becomes the tree's cached extent.
 *
 * Returns 0 on success and -ENOMEM if a needed allocation fails.  A
 * @newes whose start lies inside an existing extent triggers BUG().
 */
static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
			      struct extent_status *prealloc)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node **p = &tree->root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_status *es;

	while (*p) {
		parent = *p;
		es = rb_entry(parent, struct extent_status, rb_node);

		if (newes->es_lblk < es->es_lblk) {
			if (ext4_es_can_be_merged(newes, es)) {
				/*
				 * Here we can modify es_lblk directly
				 * because it isn't overlapped.
*/
				es->es_lblk = newes->es_lblk;
				es->es_len += newes->es_len;
				/* es now starts at newes, so take its pblk */
				if (ext4_es_is_written(es) ||
				    ext4_es_is_unwritten(es))
					ext4_es_store_pblock(es,
							     newes->es_pblk);
				es = ext4_es_try_to_merge_left(inode, es);
				goto out;
			}
			p = &(*p)->rb_left;
		} else if (newes->es_lblk > ext4_es_end(es)) {
			if (ext4_es_can_be_merged(es, newes)) {
				es->es_len += newes->es_len;
				es = ext4_es_try_to_merge_right(inode, es);
				goto out;
			}
			p = &(*p)->rb_right;
		} else {
			BUG();
			return -EINVAL;
		}
	}

	/* no merge possible: link a new node at the leaf position found */
	if (prealloc)
		es = prealloc;
	else
		es = __es_alloc_extent(false);
	if (!es)
		return -ENOMEM;
	ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len,
			    newes->es_pblk);

	rb_link_node(&es->rb_node, parent, p);
	rb_insert_color(&es->rb_node, &tree->root);

out:
	tree->cache_es = es;
	return 0;
}

/*
 * ext4_es_insert_extent() adds information to an inode's extent
 * status tree. This interface is used for modifying extents. To cache
 * on-disk extents, use ext4_es_cache_extent() instead.
 */
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
			   ext4_lblk_t len, ext4_fsblk_t pblk,
			   unsigned int status, bool delalloc_reserve_used)
{
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;
	int err1 = 0, err2 = 0, err3 = 0;
	int resv_used = 0, pending = 0;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct extent_status *es1 = NULL;
	struct extent_status *es2 = NULL;
	struct pending_reservation *pr = NULL;
	bool revise_pending = false;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return;

	es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %llu\n",
		 lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino);

	if (!len)
		return;

	BUG_ON(end < lblk);
	WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED);

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);

	ext4_es_insert_extent_check(inode, &newes);

	/* pending reservations only need revising for bigalloc + delalloc */
	revise_pending = sbi->s_cluster_ratio > 1 &&
			 test_opt(inode->i_sb, DELALLOC) &&
			 (status & (EXTENT_STATUS_WRITTEN |
				    EXTENT_STATUS_UNWRITTEN));
retry:
	/*
	 * Only reached with an error set after a failed pass: preallocate
	 * (nofail) what the failed step and the steps after it may need,
	 * before taking the lock again.
	 */
	if (err1 && !es1)
		es1 = __es_alloc_extent(true);
	if ((err1 || err2) && !es2)
		es2 = __es_alloc_extent(true);
	if ((err1 || err2 || err3 < 0) && revise_pending && !pr)
		pr = __alloc_pending(true);
	write_lock(&EXT4_I(inode)->i_es_lock);

	err1 = __es_remove_extent(inode, lblk, end, 0, &resv_used, NULL, es1);
	if (err1 != 0)
		goto error;
	/* Free preallocated extent if it didn't get used. */
	if (es1) {
		if (!es1->es_len)
			__es_free_extent(es1);
		es1 = NULL;
	}

	err2 = __es_insert_extent(inode, &newes, es2);
	/* an extent that need not be kept may simply be left out on ENOMEM */
	if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
		err2 = 0;
	if (err2 != 0)
		goto error;
	/* Free preallocated extent if it didn't get used. */
	if (es2) {
		if (!es2->es_len)
			__es_free_extent(es2);
		es2 = NULL;
	}

	if (revise_pending) {
		err3 = __revise_pending(inode, lblk, len, &pr);
		if (err3 < 0)
			goto error;
		if (pr) {
			__free_pending(pr);
			pr = NULL;
		}
		/* a non-negative result is added to resv_used below */
		pending = err3;
	}
	ext4_es_inc_seq(inode);
error:
	write_unlock(&EXT4_I(inode)->i_es_lock);
	/*
	 * Reduce the reserved cluster count to reflect successful deferred
	 * allocation of delayed allocated clusters or direct allocation of
	 * clusters discovered to be delayed allocated.  Once allocated, a
	 * cluster is not included in the reserved count.
	 *
	 * When direct allocating (from fallocate, filemap, DIO, or clusters
	 * allocated when delalloc has been disabled by ext4_nonda_switch())
	 * an extent either 1) contains delayed blocks but start with
	 * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed
	 * allocated blocks which belong to delayed allocated clusters when
	 * bigalloc feature is enabled, quota has already been claimed by
	 * ext4_mb_new_blocks(), so release the quota reservations made for
	 * any previously delayed allocated clusters instead of claim them
	 * again.
	 */
	resv_used += pending;
	if (resv_used)
		ext4_da_update_reserve_space(inode, resv_used,
					     delalloc_reserve_used);

	if (err1 || err2 || err3 < 0)
		goto retry;

	trace_ext4_es_insert_extent(inode, &newes);
	ext4_es_print_tree(inode);
	return;
}

/*
 * ext4_es_cache_extent() inserts information into the extent status tree
 * only if there is no existing information about the specified range or
 * if the existing extents have the same status.
 *
 * Note that this interface is only used for caching on-disk extent
 * information and cannot be used to convert existing extents in the extent
 * status tree. To convert existing extents, use ext4_es_insert_extent()
 * instead.
 */
void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len, ext4_fsblk_t pblk,
			  unsigned int status)
{
	struct extent_status *es;
	struct extent_status chkes, newes;
	ext4_lblk_t end = lblk + len - 1;
	bool conflict = false;
	int err;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return;

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);

	if (!len)
		return;

	BUG_ON(end < lblk);

	write_lock(&EXT4_I(inode)->i_es_lock);
	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
	if (es && es->es_lblk <= end) {
		/* Found an extent that covers the entire range. */
		if (es->es_lblk <= lblk && es->es_lblk + es->es_len > end) {
			if (__es_check_extent_status(es, status, &chkes))
				conflict = true;
			goto unlock;
		}

		/* Check and remove all extents in range. */
		err = __es_remove_extent(inode, lblk, end, status, NULL,
					 &chkes, NULL);
		if (err) {
			if (err == -EINVAL)
				conflict = true;
			goto unlock;
		}
	}
	/* the return value is not checked: a failed insert is ignored */
	__es_insert_extent(inode, &newes, NULL);
	trace_ext4_es_cache_extent(inode, &newes);
	ext4_es_print_tree(inode);
unlock:
	write_unlock(&EXT4_I(inode)->i_es_lock);
	if (!conflict)
		return;
	/*
	 * A hole in the on-disk extent but a delayed extent in the extent
	 * status tree, is allowed.
*/
	if (status == EXTENT_STATUS_HOLE &&
	    ext4_es_type(&chkes) == EXTENT_STATUS_DELAYED)
		return;

	ext4_warning_inode(inode,
			   "ES cache extent failed: add [%d,%d,%llu,0x%x] conflict with existing [%d,%d,%llu,0x%x]\n",
			   lblk, len, pblk, status, chkes.es_lblk, chkes.es_len,
			   ext4_es_pblock(&chkes), ext4_es_status(&chkes));
}

/*
 * ext4_es_lookup_extent() looks up an extent in extent status tree.
 *
 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
 *
 * On a hit the extent is copied to @es; if @next_lblk is not NULL it
 * receives the start of the following extent (0 if there is none), and if
 * @pseq is not NULL it receives the inode's current i_es_seq.  On a miss
 * @es is left zeroed.
 *
 * Return: 1 on found, 0 on not
 */
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t *next_lblk, struct extent_status *es,
			  u64 *pseq)
{
	struct ext4_es_tree *tree;
	struct ext4_es_stats *stats;
	struct extent_status *es1 = NULL;
	struct rb_node *node;
	int found = 0;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return 0;

	trace_ext4_es_lookup_extent_enter(inode, lblk);
	es_debug("lookup extent in block %u\n", lblk);

	tree = &EXT4_I(inode)->i_es_tree;
	read_lock(&EXT4_I(inode)->i_es_lock);

	/* find extent in cache firstly */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	es1 = READ_ONCE(tree->cache_es);
	if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
		es_debug("%u cached by [%u/%u)\n",
			 lblk, es1->es_lblk, es1->es_len);
		found = 1;
		goto out;
	}

	/* exact-containment search; unlike __es_tree_search, no "next" */
	node = tree->root.rb_node;
	while (node) {
		es1 = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es1->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es1))
			node = node->rb_right;
		else {
			found = 1;
			break;
		}
	}

out:
	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
	if (found) {
		BUG_ON(!es1);
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
		if (!ext4_es_is_referenced(es1))
			ext4_es_set_referenced(es1);
		percpu_counter_inc(&stats->es_stats_cache_hits);
		if (next_lblk) {
			node = rb_next(&es1->rb_node);
			if (node) {
				es1 = rb_entry(node, struct extent_status,
					       rb_node);
				*next_lblk = es1->es_lblk;
			} else
				*next_lblk = 0;
		}
		if (pseq)
			*pseq = EXT4_I(inode)->i_es_seq;
	} else {
		percpu_counter_inc(&stats->es_stats_cache_misses);
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_lookup_extent_exit(inode, es, found);
	return found;
}

/*
 * Running state used by init_rsvd()/count_rsvd()/get_rsvd() while a block
 * range is removed from the extent status tree.
 */
struct rsvd_count {
	int ndelayed;			/* running tally of delayed blocks/clusters */
	bool first_do_lblk_found;	/* first_do_lblk has been recorded */
	ext4_lblk_t first_do_lblk;	/* first block of first delayed extent seen */
	ext4_lblk_t last_do_lblk;	/* last delayed block seen so far */
	struct extent_status *left_es;	/* extent left of the range, or NULL */
	bool partial;			/* a partial cluster is being tracked */
	ext4_lblk_t lclu;		/* logical cluster of the tracked partial */
};

/*
 * init_rsvd - initialize reserved count data before removing block range
 *	       in file from extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @es - pointer to first extent in range
 * @rc - pointer to reserved count data
 *
 * Assumes es is not NULL
 */
static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
		      struct extent_status *es, struct rsvd_count *rc)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct rb_node *node;

	rc->ndelayed = 0;

	/*
	 * for bigalloc, note the first delayed block in the range has not
	 * been found, record the extent containing the block to the left of
	 * the region to be removed, if any, and note that there's no partial
	 * cluster to track
	 */
	if (sbi->s_cluster_ratio > 1) {
		rc->first_do_lblk_found = false;
		if (lblk > es->es_lblk) {
			rc->left_es = es;
		} else {
			node = rb_prev(&es->rb_node);
			rc->left_es = node ?
				rb_entry(node, struct extent_status,
					 rb_node) : NULL;
		}
		rc->partial = false;
	}
}

/*
 * count_rsvd - count the clusters containing delayed blocks in a range
 *	        within an extent and add to the running tally in rsvd_count
 *
 * @inode - file containing extent
 * @lblk - first block in range
 * @len - length of range in blocks
 * @es - pointer to extent containing clusters to be counted
 * @rc - pointer to reserved count data
 *
 * Tracks partial clusters found at the beginning and end of extents so
 * they aren't overcounted when they span adjacent extents
 */
static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
		       struct extent_status *es, struct rsvd_count *rc)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_lblk_t i, end, nclu;

	/* only delayed extents contribute to the tally */
	if (!ext4_es_is_delayed(es))
		return;

	WARN_ON(len <= 0);

	/* one block per cluster: the block count is the answer */
	if (sbi->s_cluster_ratio == 1) {
		rc->ndelayed += (int) len;
		return;
	}

	/* bigalloc */

	/* clamp [lblk, lblk + len - 1] to the blocks actually inside es */
	i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
	end = lblk + (ext4_lblk_t) len - 1;
	end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;

	/* record the first block of the first delayed extent seen */
	if (!rc->first_do_lblk_found) {
		rc->first_do_lblk = i;
		rc->first_do_lblk_found = true;
	}

	/* update the last lblk in the region seen so far */
	rc->last_do_lblk = end;

	/*
	 * if we're tracking a partial cluster and the current extent
	 * doesn't start with it, count it and stop tracking
	 */
	if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
		rc->ndelayed++;
		rc->partial = false;
	}

	/*
	 * if the first cluster doesn't start on a cluster boundary but
	 * ends on one, count it
	 */
	if (EXT4_LBLK_COFF(sbi, i) != 0) {
		if (end >= EXT4_LBLK_CFILL(sbi, i)) {
			rc->ndelayed++;
			rc->partial = false;
			i = EXT4_LBLK_CFILL(sbi, i) + 1;
		}
	}

	/*
	 * if the current cluster starts on a cluster boundary, count the
	 * number of whole delayed clusters in the extent
	 */
	if ((i + sbi->s_cluster_ratio - 1) <= end) {
		nclu = (end - i + 1) >> sbi->s_cluster_bits;
		rc->ndelayed += nclu;
		i += nclu << sbi->s_cluster_bits;
	}

	/*
	 * start tracking a partial cluster if there's a partial at the end
	 * of the current extent and we're not already tracking one
	 */
	if (!rc->partial && i <= end) {
		rc->partial = true;
		rc->lclu = EXT4_B2C(sbi, i);
	}
}

/*
 * __pr_tree_search - search for a pending cluster reservation
 *
 * @root - root of pending reservation tree
 * @lclu - logical cluster to search for
 *
 * Returns the pending reservation for the cluster identified by @lclu
 * if found.  If not, returns a reservation for the next cluster if any,
 * and if not, returns NULL.
 */
static struct pending_reservation *__pr_tree_search(struct rb_root *root,
						    ext4_lblk_t lclu)
{
	struct rb_node *node = root->rb_node;
	struct pending_reservation *pr = NULL;

	while (node) {
		pr = rb_entry(node, struct pending_reservation, rb_node);
		if (lclu < pr->lclu)
			node = node->rb_left;
		else if (lclu > pr->lclu)
			node = node->rb_right;
		else
			return pr;
	}
	/* no exact hit: pr is the last node visited on the way down, if any */
	if (pr && lclu < pr->lclu)
		return pr;
	if (pr && lclu > pr->lclu) {
		node = rb_next(&pr->rb_node);
		return node ? rb_entry(node, struct pending_reservation,
				       rb_node) : NULL;
	}
	return NULL;
}

/*
 * get_rsvd - calculates and returns the number of cluster reservations to be
 *	      released when removing a block range from the extent status tree
 *	      and releases any pending reservations within the range
 *
 * @inode - file containing block range
 * @end - last block in range
 * @right_es - pointer to extent containing next block beyond end or NULL
 * @rc - pointer to reserved count data
 *
 * The number of reservations to be released is equal to the number of
 * clusters containing delayed blocks within the range, minus the number of
 * clusters still containing delayed blocks at the ends of the range, and
 * minus the number of pending reservations within the range.
 */
static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
			     struct extent_status *right_es,
			     struct rsvd_count *rc)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct pending_reservation *pr;
	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
	struct rb_node *node;
	ext4_lblk_t first_lclu, last_lclu;
	bool left_delayed, right_delayed, count_pending;
	struct extent_status *es;

	/* with a cluster ratio of 1, rc->ndelayed is returned unadjusted */
	if (sbi->s_cluster_ratio > 1) {
		/* count any remaining partial cluster */
		if (rc->partial)
			rc->ndelayed++;

		if (rc->ndelayed == 0)
			return 0;

		first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
		last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);

		/*
		 * decrease the delayed count by the number of clusters at the
		 * ends of the range that still contain delayed blocks -
		 * these clusters still need to be reserved
		 */
		left_delayed = right_delayed = false;

		/* walk left while extents still reach into the first cluster */
		es = rc->left_es;
		while (es && ext4_es_end(es) >=
		       EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
			if (ext4_es_is_delayed(es)) {
				rc->ndelayed--;
				left_delayed = true;
				break;
			}
			node = rb_prev(&es->rb_node);
			if (!node)
				break;
			es = rb_entry(node, struct extent_status, rb_node);
		}
		/*
		 * skip the right side when the range sits in one cluster
		 * that the left side has already discounted
		 */
		if (right_es && (!left_delayed || first_lclu != last_lclu)) {
			if (end < ext4_es_end(right_es)) {
				es = right_es;
			} else {
				node = rb_next(&right_es->rb_node);
				es = node ? rb_entry(node, struct extent_status,
						     rb_node) : NULL;
			}
			/* walk right while extents start in the last cluster */
			while (es && es->es_lblk <=
			       EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
				if (ext4_es_is_delayed(es)) {
					rc->ndelayed--;
					right_delayed = true;
					break;
				}
				node = rb_next(&es->rb_node);
				if (!node)
					break;
				es = rb_entry(node, struct extent_status,
					      rb_node);
			}
		}

		/*
		 * Determine the block range that should be searched for
		 * pending reservations, if any.  Clusters on the ends of the
		 * original removed range containing delayed blocks are
		 * excluded.  They've already been accounted for and it's not
		 * possible to determine if an associated pending reservation
		 * should be released with the information available in the
		 * extents status tree.
		 */
		if (first_lclu == last_lclu) {
			if (left_delayed | right_delayed)
				count_pending = false;
			else
				count_pending = true;
		} else {
			if (left_delayed)
				first_lclu++;
			if (right_delayed)
				last_lclu--;
			if (first_lclu <= last_lclu)
				count_pending = true;
			else
				count_pending = false;
		}

		/*
		 * a pending reservation found between first_lclu and last_lclu
		 * represents an allocated cluster that contained at least one
		 * delayed block, so the delayed total must be reduced by one
		 * for each pending reservation found and released
		 */
		if (count_pending) {
			pr = __pr_tree_search(&tree->root, first_lclu);
			while (pr && pr->lclu <= last_lclu) {
				rc->ndelayed--;
				/* fetch the successor before pr is erased and freed */
				node = rb_next(&pr->rb_node);
				rb_erase(&pr->rb_node, &tree->root);
				__free_pending(pr);
				if (!node)
					break;
				pr = rb_entry(node, struct pending_reservation,
					      rb_node);
			}
		}
	}
	return rc->ndelayed;
}


/*
 * __es_remove_extent - removes block range from extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @end - last block in range
 * @status - the extent status to be checked
 * @reserved - number of cluster reservations released
 * @res - return the extent if the status is not match
 * @prealloc - pre-allocated es to avoid memory allocation failures
 *
 * If @reserved is not NULL and delayed allocation is
enabled, counts
 * block/cluster reservations freed by removing range and if bigalloc
 * enabled cancels pending reservations as needed. If @status is not
 * zero, check extent status type while removing extent, return -EINVAL
 * and pass out the extent through @res if it does not match. Returns 0 on
 * success, error code on failure.
 */
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end, unsigned int status,
			      int *reserved, struct extent_status *res,
			      struct extent_status *prealloc)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node *node;
	struct extent_status *es;
	struct extent_status orig_es;
	ext4_lblk_t len1, len2;
	ext4_fsblk_t block;
	int err;
	bool count_reserved = true;
	struct rsvd_count rc;

	/* reservations are only counted for callers that ask, with delalloc on */
	if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
		count_reserved = false;

	/* a zero @status means "accept any type" */
	if (status == 0)
		status = ES_TYPE_MASK;

	es = __es_tree_search(&tree->root, lblk);
	if (!es)
		return 0;
	/* first candidate extent starts beyond the range: nothing to remove */
	if (es->es_lblk > end)
		return 0;

	err = __es_check_extent_status(es, status, res);
	if (err)
		return err;

	/* Simply invalidate cache_es. */
	tree->cache_es = NULL;
	if (count_reserved)
		init_rsvd(inode, lblk, es, &rc);

	/* snapshot of the first extent, needed for splitting and rollback */
	orig_es.es_lblk = es->es_lblk;
	orig_es.es_len = es->es_len;
	orig_es.es_pblk = es->es_pblk;

	/*
	 * len1: blocks of the first extent that lie before @lblk (kept)
	 * len2: blocks of the first extent that lie after @end (kept)
	 */
	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
	if (len1 > 0)
		es->es_len = len1;
	if (len2 > 0) {
		if (len1 > 0) {
			/*
			 * the range is strictly inside this extent: es keeps
			 * the left part and a new extent is inserted for the
			 * right part
			 */
			struct extent_status newes;

			newes.es_lblk = end + 1;
			newes.es_len = len2;
			/*
			 * placeholder physical block, replaced below when the
			 * original extent is written or unwritten
			 */
			block = 0x7FDEADBEEFULL;
			if (ext4_es_is_written(&orig_es) ||
			    ext4_es_is_unwritten(&orig_es))
				block = ext4_es_pblock(&orig_es) +
					orig_es.es_len - len2;
			ext4_es_store_pblock_status(&newes, block,
						    ext4_es_status(&orig_es));
			err = __es_insert_extent(inode, &newes, prealloc);
			if (err) {
				/*
				 * if the right part need not be kept, it is
				 * simply dropped and success is reported
				 */
				if (!ext4_es_must_keep(&newes))
					return 0;

				/* otherwise undo the trim and report the error */
				es->es_lblk = orig_es.es_lblk;
				es->es_len = orig_es.es_len;
				return err;
			}
		} else {
			/* range covers the head of the extent: keep the tail */
			es->es_lblk = end + 1;
			es->es_len = len2;
			if (ext4_es_is_written(es) ||
			    ext4_es_is_unwritten(es)) {
				block = orig_es.es_pblk + orig_es.es_len - len2;
				ext4_es_store_pblock(es, block);
			}
		}
		if (count_reserved)
			count_rsvd(inode, orig_es.es_lblk + len1,
				   orig_es.es_len - len1 - len2, &orig_es, &rc);
		goto out;
	}

	if (len1 > 0) {
		/* first extent was trimmed on the right; move to the next one */
		if (count_reserved)
			count_rsvd(inode, lblk, orig_es.es_len - len1,
				   &orig_es, &rc);
		node = rb_next(&es->rb_node);
		if (node)
			es = rb_entry(node, struct extent_status, rb_node);
		else
			es = NULL;
	}

	/* remove every extent that lies completely inside the range */
	while (es && ext4_es_end(es) <= end) {
		err = __es_check_extent_status(es, status, res);
		if (err)
			return err;
		if (count_reserved)
			count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
		/* fetch the successor before es is erased and freed */
		node = rb_next(&es->rb_node);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		if (!node) {
			es = NULL;
			break;
		}
		es = rb_entry(node, struct extent_status, rb_node);
	}

	/* a last extent may start inside the range and end beyond it */
	if (es && es->es_lblk < end + 1) {
		ext4_lblk_t orig_len = es->es_len;

		err = __es_check_extent_status(es, status, res);
		if (err)
			return err;

		/* here len1 is reused as the number of blocks kept after @end */
		len1 = ext4_es_end(es) - end;
		if (count_reserved)
			count_rsvd(inode, es->es_lblk, orig_len - len1,
				   es, &rc);
		es->es_lblk = end + 1;
		es->es_len = len1;
		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
			block = es->es_pblk + orig_len - len1;
			ext4_es_store_pblock(es, block);
		}
	}

out:
	if (count_reserved)
		*reserved = get_rsvd(inode, end, es, &rc);
	return 0;
}

/*
 * ext4_es_remove_extent - removes
block range from extent status tree * * @inode - file containing range * @lblk - first block in range * @len - number of blocks to remove * * Reduces block/cluster reservation count and for bigalloc cancels pending * reservations as needed. */ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { ext4_lblk_t end; int err = 0; int reserved = 0; struct extent_status *es = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; es_debug("remove [%u/%u) from extent status tree of inode %llu\n", lblk, len, inode->i_ino); if (!len) return; end = lblk + len - 1; BUG_ON(end < lblk); retry: if (err && !es) es = __es_alloc_extent(true); /* * ext4_clear_inode() depends on us taking i_es_lock unconditionally * so that we are sure __es_shrink() is done with the inode before it * is reclaimed. */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end, 0, &reserved, NULL, es); if (err) goto error; /* Free preallocated extent if it didn't get used. 
*/ if (es) { if (!es->es_len) __es_free_extent(es); es = NULL; } ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err) goto retry; trace_ext4_es_remove_extent(inode, lblk, len); ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); } static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei) { struct ext4_inode_info *ei; struct ext4_es_stats *es_stats; ktime_t start_time; u64 scan_time; int nr_to_walk; int nr_shrunk = 0; int retried = 0, nr_skipped = 0; es_stats = &sbi->s_es_stats; start_time = ktime_get(); retry: spin_lock(&sbi->s_es_lock); nr_to_walk = sbi->s_es_nr_inode; while (nr_to_walk-- > 0) { if (list_empty(&sbi->s_es_list)) { spin_unlock(&sbi->s_es_lock); goto out; } ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, i_es_list); /* Move the inode to the tail */ list_move_tail(&ei->i_es_list, &sbi->s_es_list); /* * Normally we try hard to avoid shrinking precached inodes, * but we will as a last resort. */ if (!retried && ext4_test_inode_state(&ei->vfs_inode, EXT4_STATE_EXT_PRECACHED)) { nr_skipped++; continue; } if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) { nr_skipped++; continue; } /* * Now we hold i_es_lock which protects us from inode reclaim * freeing inode under us */ spin_unlock(&sbi->s_es_lock); nr_shrunk += es_reclaim_extents(ei, &nr_to_scan); write_unlock(&ei->i_es_lock); if (nr_to_scan <= 0) goto out; spin_lock(&sbi->s_es_lock); } spin_unlock(&sbi->s_es_lock); /* * If we skipped any inodes, and we weren't able to make any * forward progress, try again to scan precached inodes. 
*/ if ((nr_shrunk == 0) && nr_skipped && !retried) { retried++; goto retry; } if (locked_ei && nr_shrunk == 0) nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan); out: scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); if (likely(es_stats->es_stats_scan_time)) es_stats->es_stats_scan_time = (scan_time + es_stats->es_stats_scan_time*3) / 4; else es_stats->es_stats_scan_time = scan_time; if (scan_time > es_stats->es_stats_max_scan_time) es_stats->es_stats_max_scan_time = scan_time; if (likely(es_stats->es_stats_shrunk)) es_stats->es_stats_shrunk = (nr_shrunk + es_stats->es_stats_shrunk*3) / 4; else es_stats->es_stats_shrunk = nr_shrunk; trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, nr_skipped, retried); return nr_shrunk; } static unsigned long ext4_es_count(struct shrinker *shrink, struct shrink_control *sc) { unsigned long nr; struct ext4_sb_info *sbi; sbi = shrink->private_data; nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; } static unsigned long ext4_es_scan(struct shrinker *shrink, struct shrink_control *sc) { struct ext4_sb_info *sbi = shrink->private_data; int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); return nr_shrunk; } int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v) { struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private); struct ext4_es_stats *es_stats = &sbi->s_es_stats; struct ext4_inode_info *ei, *max = NULL; unsigned int inode_cnt = 0; if (v != SEQ_START_TOKEN) return 0; /* here we just find an inode that has the max nr. 
of objects */ spin_lock(&sbi->s_es_lock); list_for_each_entry(ei, &sbi->s_es_list, i_es_list) { inode_cnt++; if (max && max->i_es_all_nr < ei->i_es_all_nr) max = ei; else if (!max) max = ei; } spin_unlock(&sbi->s_es_lock); seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); seq_printf(seq, " %lld/%lld cache hits/misses\n", percpu_counter_sum_positive(&es_stats->es_stats_cache_hits), percpu_counter_sum_positive(&es_stats->es_stats_cache_misses)); if (inode_cnt) seq_printf(seq, " %d inodes on list\n", inode_cnt); seq_printf(seq, "average:\n %llu us scan time\n", div_u64(es_stats->es_stats_scan_time, 1000)); seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); if (inode_cnt) seq_printf(seq, "maximum:\n %llu inode (%u objects, %u reclaimable)\n" " %llu us max scan time\n", max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, div_u64(es_stats->es_stats_max_scan_time, 1000)); return 0; } int ext4_es_register_shrinker(struct ext4_sb_info *sbi) { int err; /* Make sure we have enough bits for physical block number */ BUILD_BUG_ON(ES_SHIFT < 48); INIT_LIST_HEAD(&sbi->s_es_list); sbi->s_es_nr_inode = 0; spin_lock_init(&sbi->s_es_lock); sbi->s_es_stats.es_stats_shrunk = 0; err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0, GFP_KERNEL); if (err) return err; err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0, GFP_KERNEL); if (err) goto err1; sbi->s_es_stats.es_stats_scan_time = 0; sbi->s_es_stats.es_stats_max_scan_time = 0; err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); if (err) goto err2; err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); if (err) goto err3; sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id); if (!sbi->s_es_shrinker) { err = -ENOMEM; goto err4; } sbi->s_es_shrinker->scan_objects = ext4_es_scan; 
sbi->s_es_shrinker->count_objects = ext4_es_count; sbi->s_es_shrinker->private_data = sbi; shrinker_register(sbi->s_es_shrinker); return 0; err4: percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); err3: percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); err2: percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); err1: percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); return err; } void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); shrinker_free(sbi->s_es_shrinker); } /* * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at * most *nr_to_scan extents, update *nr_to_scan accordingly. * * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan. * Increment *nr_shrunk by the number of reclaimed extents. Also update * ei->i_es_shrink_lblk to where we should continue scanning. 
*/ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, int *nr_to_scan, int *nr_shrunk) { struct inode *inode = &ei->vfs_inode; struct ext4_es_tree *tree = &ei->i_es_tree; struct extent_status *es; struct rb_node *node; es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); if (!es) goto out_wrap; while (*nr_to_scan > 0) { if (es->es_lblk > end) { ei->i_es_shrink_lblk = end + 1; return 0; } (*nr_to_scan)--; node = rb_next(&es->rb_node); if (ext4_es_must_keep(es)) goto next; if (ext4_es_is_referenced(es)) { ext4_es_clear_referenced(es); goto next; } rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); (*nr_shrunk)++; next: if (!node) goto out_wrap; es = rb_entry(node, struct extent_status, rb_node); } ei->i_es_shrink_lblk = es->es_lblk; return 1; out_wrap: ei->i_es_shrink_lblk = 0; return 0; } static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) { struct inode *inode = &ei->vfs_inode; int nr_shrunk = 0; ext4_lblk_t start = ei->i_es_shrink_lblk; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); if (ei->i_es_shk_nr == 0) return 0; if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && __ratelimit(&_rs)) ext4_warning(inode->i_sb, "forced shrink of precached extents"); if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) && start != 0) es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk); ei->i_es_tree.cache_es = NULL; return nr_shrunk; } /* * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove * discretionary entries from the extent status cache. (Some entries * must be present for proper operations.) 
*/ void ext4_clear_inode_es(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct extent_status *es; struct ext4_es_tree *tree; struct rb_node *node; write_lock(&ei->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; tree->cache_es = NULL; node = rb_first(&tree->root); while (node) { es = rb_entry(node, struct extent_status, rb_node); node = rb_next(node); if (!ext4_es_must_keep(es)) { rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); } } ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED); write_unlock(&ei->i_es_lock); } #ifdef ES_DEBUG__ static void ext4_print_pending_tree(struct inode *inode) { struct ext4_pending_tree *tree; struct rb_node *node; struct pending_reservation *pr; printk(KERN_DEBUG "pending reservations for inode %llu:", inode->i_ino); tree = &EXT4_I(inode)->i_pending_tree; node = rb_first(&tree->root); while (node) { pr = rb_entry(node, struct pending_reservation, rb_node); printk(KERN_DEBUG " %u", pr->lclu); node = rb_next(node); } printk(KERN_DEBUG "\n"); } #else #define ext4_print_pending_tree(inode) #endif int __init ext4_init_pending(void) { ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT); if (ext4_pending_cachep == NULL) return -ENOMEM; return 0; } void ext4_exit_pending(void) { kmem_cache_destroy(ext4_pending_cachep); } void ext4_init_pending_tree(struct ext4_pending_tree *tree) { tree->root = RB_ROOT; } /* * __get_pending - retrieve a pointer to a pending reservation * * @inode - file containing the pending cluster reservation * @lclu - logical cluster of interest * * Returns a pointer to a pending reservation if it's a member of * the set, and NULL if not. Must be called holding i_es_lock. 
*/
static struct pending_reservation *__get_pending(struct inode *inode,
						 ext4_lblk_t lclu)
{
	struct rb_node *cur = EXT4_I(inode)->i_pending_tree.root.rb_node;

	while (cur) {
		struct pending_reservation *candidate;

		candidate = rb_entry(cur, struct pending_reservation, rb_node);
		if (lclu == candidate->lclu)
			return candidate;

		/* descend towards the side that can hold @lclu */
		cur = (lclu < candidate->lclu) ? cur->rb_left : cur->rb_right;
	}

	return NULL;
}

/*
 * __insert_pending - adds a pending cluster reservation to the set of
 *                    pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the cluster to be added
 * @prealloc - preallocated pending entry
 *
 * Returns 1 when a new reservation was inserted, 0 when a reservation for
 * the cluster is already in the set (nothing is changed), and -ENOMEM when
 * a new entry could not be allocated.  A non-NULL *@prealloc is consumed
 * (and set to NULL) only when it is actually linked into the tree.
 */
static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
			    struct pending_reservation **prealloc)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
	struct rb_node **link = &tree->root.rb_node;
	struct rb_node *parent = NULL;
	struct pending_reservation *entry;
	ext4_lblk_t lclu = EXT4_B2C(sbi, lblk);

	/* find the empty link where the new node belongs */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct pending_reservation, rb_node);

		/* pending reservation already inserted */
		if (lclu == entry->lclu)
			return 0;

		link = (lclu < entry->lclu) ? &parent->rb_left :
					      &parent->rb_right;
	}

	if (likely(*prealloc == NULL)) {
		entry = __alloc_pending(false);
		if (!entry)
			return -ENOMEM;
	} else {
		/* take ownership of the caller's preallocated entry */
		entry = *prealloc;
		*prealloc = NULL;
	}

	entry->lclu = lclu;
	rb_link_node(&entry->rb_node, parent, link);
	rb_insert_color(&entry->rb_node, &tree->root);

	return 1;
}

/*
 * __remove_pending - removes a pending cluster reservation from the set
 *                    of pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the pending cluster reservation to be
removed * * Returns successfully if pending reservation is not a member of the set. */ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct pending_reservation *pr; struct ext4_pending_tree *tree; pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); if (pr != NULL) { tree = &EXT4_I(inode)->i_pending_tree; rb_erase(&pr->rb_node, &tree->root); __free_pending(pr); } } /* * ext4_remove_pending - removes a pending cluster reservation from the set * of pending reservations * * @inode - file containing the cluster * @lblk - logical block in the pending cluster reservation to be removed * * Locking for external use of __remove_pending. */ void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk) { struct ext4_inode_info *ei = EXT4_I(inode); write_lock(&ei->i_es_lock); __remove_pending(inode, lblk); write_unlock(&ei->i_es_lock); } /* * ext4_is_pending - determine whether a cluster has a pending reservation * on it * * @inode - file containing the cluster * @lblk - logical block in the cluster * * Returns true if there's a pending reservation for the cluster in the * set of pending reservations, and false if not. */ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); bool ret; read_lock(&ei->i_es_lock); ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL); read_unlock(&ei->i_es_lock); return ret; } /* * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents * status tree, adding a pending reservation * where needed * * @inode - file containing the newly added block * @lblk - start logical block to be added * @len - length of blocks to be added * @lclu_allocated/end_allocated - indicates whether a physical cluster has * been allocated for the logical cluster * that contains the start/end block. 
Note that
 *                                 end_allocated should always be set to false
 *                                 if the start and the end block are in the
 *                                 same cluster
 */
void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
				   ext4_lblk_t len, bool lclu_allocated,
				   bool end_allocated)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;
	/* err1: remove step, err2: insert step, err3: pending insert step */
	int err1 = 0, err2 = 0, err3 = 0;
	struct extent_status *es1 = NULL;
	struct extent_status *es2 = NULL;
	struct pending_reservation *pr1 = NULL;
	struct pending_reservation *pr2 = NULL;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return;

	es_debug("add [%u/%u) delayed to extent status tree of inode %llu\n",
		 lblk, len, inode->i_ino);
	if (!len)
		return;

	/* end_allocated must not be set for a range within one cluster */
	WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) &&
		     end_allocated);

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);

	ext4_es_insert_extent_check(inode, &newes);

retry:
	/*
	 * On a retry, preallocate outside the lock whatever the failed
	 * attempt may have been missing.
	 * NOTE(review): the `true` argument presumably requests an
	 * allocation that cannot fail - confirm against the helpers.
	 */
	if (err1 && !es1)
		es1 = __es_alloc_extent(true);
	if ((err1 || err2) && !es2)
		es2 = __es_alloc_extent(true);
	if (err1 || err2 || err3 < 0) {
		if (lclu_allocated && !pr1)
			pr1 = __alloc_pending(true);
		if (end_allocated && !pr2)
			pr2 = __alloc_pending(true);
	}
	write_lock(&EXT4_I(inode)->i_es_lock);

	/* drop whatever is currently cached for the range */
	err1 = __es_remove_extent(inode, lblk, end, 0, NULL, NULL, es1);
	if (err1 != 0)
		goto error;
	/* Free preallocated extent if it didn't get used. */
	if (es1) {
		if (!es1->es_len)
			__es_free_extent(es1);
		es1 = NULL;
	}

	err2 = __es_insert_extent(inode, &newes, es2);
	if (err2 != 0)
		goto error;
	/* Free preallocated extent if it didn't get used. */
	if (es2) {
		if (!es2->es_len)
			__es_free_extent(es2);
		es2 = NULL;
	}

	if (lclu_allocated) {
		err3 = __insert_pending(inode, lblk, &pr1);
		if (err3 < 0)
			goto error;
		/* __insert_pending() clears pr1 if it consumed the entry */
		if (pr1) {
			__free_pending(pr1);
			pr1 = NULL;
		}
	}
	if (end_allocated) {
		err3 = __insert_pending(inode, end, &pr2);
		if (err3 < 0)
			goto error;
		/* __insert_pending() clears pr2 if it consumed the entry */
		if (pr2) {
			__free_pending(pr2);
			pr2 = NULL;
		}
	}
	ext4_es_inc_seq(inode);
error:
	write_unlock(&EXT4_I(inode)->i_es_lock);
	/* any failed step restarts the whole sequence with preallocations */
	if (err1 || err2 || err3 < 0)
		goto retry;

	trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
					    end_allocated);
	ext4_es_print_tree(inode);
	ext4_print_pending_tree(inode);
	return;
}

/*
 * __revise_pending - makes, cancels, or leaves unchanged pending cluster
 *                    reservations for a specified block range depending
 *                    upon the presence or absence of delayed blocks
 *                    outside the range within clusters at the ends of the
 *                    range
 *
 * @inode - file containing the range
 * @lblk - logical block defining the start of range
 * @len - length of range in blocks
 * @prealloc - preallocated pending entry
 *
 * Used after a newly allocated extent is added to the extents status tree.
 * Requires that the extents in the range have either written or unwritten
 * status.  Must be called while holding i_es_lock.
 *
 * Returns the number of pending reservations that were newly inserted
 * (0 if reservations were only removed or left unchanged, or if @len is 0),
 * or the negative error code from __insert_pending() (-ENOMEM) on failure.
 */
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
			    ext4_lblk_t len,
			    struct pending_reservation **prealloc)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_lblk_t end = lblk + len - 1;
	ext4_lblk_t first, last;
	/* delayed blocks found before lblk / after end in the end clusters */
	bool f_del = false, l_del = false;
	int pendings = 0;
	int ret = 0;

	if (len == 0)
		return 0;

	/*
	 * Two cases - block range within single cluster and block range
	 * spanning two or more clusters.  Note that a cluster belonging
	 * to a range starting and/or ending on a cluster boundary is treated
	 * as if it does not contain a delayed extent.  The new range may
	 * have allocated space for previously delayed blocks out to the
	 * cluster boundary, requiring that any pre-existing pending
	 * reservation be canceled.  Because this code only looks at blocks
	 * outside the range, it should revise pending reservations
	 * correctly even if the extent represented by the range can't be
	 * inserted in the extents status tree due to ENOSPC.
	 */

	if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
		/*
		 * single cluster: look before the range first, and only if
		 * nothing delayed is found there look after it
		 */
		first = EXT4_LBLK_CMASK(sbi, lblk);
		if (first != lblk)
			f_del = __es_scan_range(inode, &ext4_es_is_delayed,
						first, lblk - 1);
		if (f_del) {
			ret = __insert_pending(inode, first, prealloc);
			if (ret < 0)
				goto out;
			pendings += ret;
		} else {
			last = EXT4_LBLK_CMASK(sbi, end) +
			       sbi->s_cluster_ratio - 1;
			if (last != end)
				l_del = __es_scan_range(inode,
							&ext4_es_is_delayed,
							end + 1, last);
			if (l_del) {
				ret = __insert_pending(inode, last, prealloc);
				if (ret < 0)
					goto out;
				pendings += ret;
			} else
				__remove_pending(inode, last);
		}
	} else {
		/* several clusters: first and last cluster are independent */
		first = EXT4_LBLK_CMASK(sbi, lblk);
		if (first != lblk)
			f_del = __es_scan_range(inode, &ext4_es_is_delayed,
						first, lblk - 1);
		if (f_del) {
			ret = __insert_pending(inode, first, prealloc);
			if (ret < 0)
				goto out;
			pendings += ret;
		} else
			__remove_pending(inode, first);

		last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
		if (last != end)
			l_del = __es_scan_range(inode, &ext4_es_is_delayed,
						end + 1, last);
		if (l_del) {
			ret = __insert_pending(inode, last, prealloc);
			if (ret < 0)
				goto out;
			pendings += ret;
		} else
			__remove_pending(inode, last);
	}
out:
	return (ret < 0) ? ret : pendings;
}
16 1 1 2 12 3 1 1 2 1 8 6 2 8 3 8 1 5 4 1 6 6 4 3 6 7 7 7 7 7 7 7 58 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/act_sample.c - Packet sampling tc action * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/gfp.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_sample.h> #include <net/tc_act/tc_sample.h> #include <net/psample.h> 
#include <net/pkt_cls.h>
#include <net/tc_wrapper.h>
#include <linux/if_arp.h>

/* Tentative definition so that tcf_sample_init() and the pernet helpers can
 * refer to the ops (and its net_id) before the initialized definition below.
 */
static struct tc_action_ops act_sample_ops;

/* Netlink policy applied to the nested sample attributes in
 * tcf_sample_init().
 */
static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
	[TCA_SAMPLE_PARMS]		= { .len = sizeof(struct tc_sample) },
	[TCA_SAMPLE_RATE]		= { .type = NLA_U32 },
	[TCA_SAMPLE_TRUNC_SIZE]		= { .type = NLA_U32 },
	[TCA_SAMPLE_PSAMPLE_GROUP]	= { .type = NLA_U32 },
};

/* Create a sample action, or update an existing one, from netlink attributes.
 *
 * Returns ACT_P_BOUND when the action already exists and is only being bound,
 * ACT_P_CREATED when a new action was created, the (non-negative) result of
 * the attribute parse when an existing action was replaced, or a negative
 * errno on failure.
 */
static int tcf_sample_init(struct net *net, struct nlattr *nla,
			   struct nlattr *est, struct tc_action **a,
			   struct tcf_proto *tp,
			   u32 flags, struct netlink_ext_ack *extack)
{
	struct tc_action_net *tn = net_generic(net, act_sample_ops.net_id);
	bool bind = flags & TCA_ACT_FLAGS_BIND;
	struct nlattr *tb[TCA_SAMPLE_MAX + 1];
	struct psample_group *psample_group;
	u32 psample_group_num, rate, index;
	struct tcf_chain *goto_ch = NULL;
	struct tc_sample *parm;
	struct tcf_sample *s;
	bool exists = false;
	int ret, err;

	if (!nla)
		return -EINVAL;
	ret = nla_parse_nested_deprecated(tb, TCA_SAMPLE_MAX, nla,
					  sample_policy, NULL);
	if (ret < 0)
		return ret;

	if (!tb[TCA_SAMPLE_PARMS])
		return -EINVAL;

	parm = nla_data(tb[TCA_SAMPLE_PARMS]);
	index = parm->index;
	/* Negative: error; otherwise the result is used as the "exists" flag. */
	err = tcf_idr_check_alloc(tn, &index, a, bind);
	if (err < 0)
		return err;
	exists = err;
	if (exists && bind)
		return ACT_P_BOUND;

	if (!exists) {
		ret = tcf_idr_create(tn, index, est, a,
				     &act_sample_ops, bind, true, flags);
		if (ret) {
			tcf_idr_cleanup(tn, index);
			return ret;
		}
		ret = ACT_P_CREATED;
	} else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
		/* Existing action and the caller did not ask to replace it. */
		tcf_idr_release(*a, bind);
		return -EEXIST;
	}

	/* From here on every failure must drop the action via release_idr. */
	if (!tb[TCA_SAMPLE_RATE] || !tb[TCA_SAMPLE_PSAMPLE_GROUP]) {
		NL_SET_ERR_MSG(extack, "sample rate and group are required");
		err = -EINVAL;
		goto release_idr;
	}

	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
	if (err < 0)
		goto release_idr;

	rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
	if (!rate) {
		NL_SET_ERR_MSG(extack, "invalid sample rate");
		err = -EINVAL;
		goto put_chain;
	}
	psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
	psample_group = psample_group_get(net, psample_group_num);
	if (!psample_group) {
		err = -ENOMEM;
		goto put_chain;
	}

	s = to_sample(*a);

	/* Publish the new configuration under tcf_lock. After the swaps below,
	 * goto_ch and psample_group hold the *previous* values (if any), which
	 * are released once the lock has been dropped.
	 */
	spin_lock_bh(&s->tcf_lock);
	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
	s->rate = rate;
	s->psample_group_num = psample_group_num;
	psample_group = rcu_replace_pointer(s->psample_group, psample_group,
					    lockdep_is_held(&s->tcf_lock));

	/* Truncation settings are only written when the attribute is given. */
	if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
		s->truncate = true;
		s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
	}
	spin_unlock_bh(&s->tcf_lock);

	if (psample_group)
		psample_group_put(psample_group);
	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);

	return ret;
put_chain:
	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);
release_idr:
	tcf_idr_release(*a, bind);
	return err;
}

/* .cleanup callback: clear the action's psample group pointer and drop the
 * reference it held.
 */
static void tcf_sample_cleanup(struct tc_action *a)
{
	struct tcf_sample *s = to_sample(a);
	struct psample_group *psample_group;

	/* last reference to action, no need to lock */
	psample_group = rcu_dereference_protected(s->psample_group, 1);
	RCU_INIT_POINTER(s->psample_group, NULL);
	if (psample_group)
		psample_group_put(psample_group);
}

/* Return false for the device types listed below and true for every other
 * type. tcf_sample_act() uses the result to decide whether to push/pull
 * skb->mac_len bytes around sampling on ingress.
 */
static bool tcf_sample_dev_ok_push(struct net_device *dev)
{
	switch (dev->type) {
	case ARPHRD_TUNNEL:
	case ARPHRD_TUNNEL6:
	case ARPHRD_SIT:
	case ARPHRD_IPGRE:
	case ARPHRD_IP6GRE:
	case ARPHRD_VOID:
	case ARPHRD_NONE:
		return false;
	default:
		return true;
	}
}

/* .act callback: update the action's last-use time and byte/packet stats,
 * and, when a psample group is set and the random draw below s->rate yields
 * zero, hand the packet to psample_sample_packet().
 *
 * Returns the action's configured control action (s->tcf_action), whether or
 * not the packet was sampled.
 */
TC_INDIRECT_SCOPE int tcf_sample_act(struct sk_buff *skb,
				     const struct tc_action *a,
				     struct tcf_result *res)
{
	struct tcf_sample *s = to_sample(a);
	struct psample_group *psample_group;
	u8 cookie_data[TC_COOKIE_MAX_SIZE];
	struct psample_metadata md = {};
	struct tc_cookie *user_cookie;
	int retval;

	tcf_lastuse_update(&s->tcf_tm);
	bstats_update(this_cpu_ptr(s->common.cpu_bstats), skb);
	retval = READ_ONCE(s->tcf_action);

	psample_group = rcu_dereference_bh(s->psample_group);

	/* randomly sample packets according to rate */
	if (psample_group && (get_random_u32_below(s->rate) == 0)) {
		/* Not at ingress: report both interfaces; at ingress only the
		 * input interface is filled in.
		 */
		if (!skb_at_tc_ingress(skb)) {
			md.in_ifindex = skb->skb_iif;
			md.out_ifindex = skb->dev->ifindex;
		} else {
			md.in_ifindex = skb->dev->ifindex;
		}

		/* on ingress, the mac header gets popped, so push it back */
		if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
			skb_push(skb, skb->mac_len);

		/* Copy the user cookie into a local buffer while inside the RCU
		 * read-side section; md then points at the local copy.
		 */
		rcu_read_lock();
		user_cookie = rcu_dereference(a->user_cookie);
		if (user_cookie) {
			memcpy(cookie_data, user_cookie->data,
			       user_cookie->len);
			md.user_cookie = cookie_data;
			md.user_cookie_len = user_cookie->len;
		}
		rcu_read_unlock();

		md.trunc_size = s->truncate ? s->trunc_size : skb->len;
		psample_sample_packet(psample_group, skb, s->rate, &md);

		/* Undo the push done before sampling. */
		if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
			skb_pull(skb, skb->mac_len);
	}

	return retval;
}

/* .stats_update callback: forward the counters to
 * tcf_action_update_stats() and advance the stored last-use time to the
 * larger of its current value and @lastuse.
 */
static void tcf_sample_stats_update(struct tc_action *a, u64 bytes, u64 packets,
				    u64 drops, u64 lastuse, bool hw)
{
	struct tcf_sample *s = to_sample(a);
	struct tcf_t *tm = &s->tcf_tm;

	tcf_action_update_stats(a, bytes, packets, drops, hw);
	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}

/* .dump callback: emit the action's parameters, timestamps, rate, optional
 * truncation size and group number as netlink attributes.
 *
 * Returns skb->len on success. On any attribute failure the skb is trimmed
 * back to its tail position at entry and -1 is returned.
 */
static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
			   int bind, int ref)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tcf_sample *s = to_sample(a);
	struct tc_sample opt = {
		.index      = s->tcf_index,
		.refcnt     = refcount_read(&s->tcf_refcnt) - ref,
		.bindcnt    = atomic_read(&s->tcf_bindcnt) - bind,
	};
	struct tcf_t t;

	/* The remaining fields are read under tcf_lock. */
	spin_lock_bh(&s->tcf_lock);
	opt.action = s->tcf_action;
	if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;

	tcf_tm_dump(&t, &s->tcf_tm);
	if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t,
			  TCA_SAMPLE_PAD))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate))
		goto nla_put_failure;

	/* The truncation size is only dumped when truncation is enabled. */
	if (s->truncate)
		if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size))
			goto nla_put_failure;

	if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num))
		goto nla_put_failure;
	spin_unlock_bh(&s->tcf_lock);

	return skb->len;

nla_put_failure:
	spin_unlock_bh(&s->tcf_lock);
	nlmsg_trim(skb, b);
	return -1;
}

/* Destructor handed out by tcf_sample_get_group(): drops the reference on
 * the psample group passed as @priv.
 */
static void tcf_psample_group_put(void *priv)
{
	struct psample_group *group = priv;

	psample_group_put(group);
}

/* .get_psample_group callback: return the action's psample group (or NULL).
 * When a group is present, a reference is taken on it and *@destructor is
 * set to the function that drops that reference. The dereference is
 * annotated as being protected by s->tcf_lock.
 */
static struct psample_group *
tcf_sample_get_group(const struct tc_action *a,
		     tc_action_priv_destructor *destructor)
{
	struct tcf_sample *s = to_sample(a);
	struct psample_group *group;

	group = rcu_dereference_protected(s->psample_group,
					  lockdep_is_held(&s->tcf_lock));
	if (group) {
		psample_group_take(group);
		*destructor = tcf_psample_group_put;
	}

	return group;
}

/* Fill the psample group and its destructor/destructor_priv into a flow
 * action entry, using the action's get_psample_group op.
 */
static void tcf_offload_sample_get_group(struct flow_action_entry *entry,
					 const struct tc_action *act)
{
	entry->sample.psample_group =
		act->ops->get_psample_group(act, &entry->destructor);
	entry->destructor_priv = entry->sample.psample_group;
}

/* .offload_act_setup callback. With @bind set, @entry_data is treated as a
 * struct flow_action_entry and fully populated (and *@index_inc is set to 1);
 * otherwise it is treated as a struct flow_offload_action and only its id is
 * set. Always returns 0.
 */
static int tcf_sample_offload_act_setup(struct tc_action *act, void *entry_data,
					u32 *index_inc, bool bind,
					struct netlink_ext_ack *extack)
{
	if (bind) {
		struct flow_action_entry *entry = entry_data;

		entry->id = FLOW_ACTION_SAMPLE;
		entry->sample.trunc_size = tcf_sample_trunc_size(act);
		entry->sample.truncate = tcf_sample_truncate(act);
		entry->sample.rate = tcf_sample_rate(act);
		tcf_offload_sample_get_group(entry, act);
		*index_inc = 1;
	} else {
		struct flow_offload_action *fl_action = entry_data;

		fl_action->id = FLOW_ACTION_SAMPLE;
	}

	return 0;
}

/* Operations table registered for the "sample" action kind. */
static struct tc_action_ops act_sample_ops = {
	.kind	  = "sample",
	.id	  = TCA_ID_SAMPLE,
	.owner	  = THIS_MODULE,
	.act	  = tcf_sample_act,
	.stats_update = tcf_sample_stats_update,
	.dump	  = tcf_sample_dump,
	.init	  = tcf_sample_init,
	.cleanup  = tcf_sample_cleanup,
	.get_psample_group = tcf_sample_get_group,
	.offload_act_setup    = tcf_sample_offload_act_setup,
	.size	  = sizeof(struct tcf_sample),
};
MODULE_ALIAS_NET_ACT("sample");

/* Per-netns init: set up this namespace's tc_action_net for the sample ops. */
static __net_init int sample_init_net(struct net *net)
{
	struct tc_action_net *tn = net_generic(net, act_sample_ops.net_id);

	return tc_action_net_init(net, tn, &act_sample_ops);
}

/* Per-netns batched exit: hand the list of namespaces to
 * tc_action_net_exit().
 */
static void __net_exit sample_exit_net(struct list_head *net_list)
{
	tc_action_net_exit(net_list, act_sample_ops.net_id);
}

/* Pernet operations; .id points at the net_id stored in act_sample_ops. */
static struct pernet_operations sample_net_ops = {
	.init = sample_init_net,
	.exit_batch = sample_exit_net,
	.id   = &act_sample_ops.net_id,
	.size = sizeof(struct tc_action_net),
};

/* Module entry point: register the action ops together with the pernet ops. */
static int __init sample_init_module(void)
{
	return tcf_register_action(&act_sample_ops, &sample_net_ops);
}

/* Module exit point: unregister what sample_init_module() registered. */
static void __exit sample_cleanup_module(void)
{
	tcf_unregister_action(&act_sample_ops, &sample_net_ops);
}

module_init(sample_init_module);
module_exit(sample_cleanup_module);

MODULE_AUTHOR("Yotam Gigi <yotam.gi@gmail.com>");
MODULE_DESCRIPTION("Packet sampling action");
MODULE_LICENSE("GPL v2");
1 2 1 3 1 1 1 1 2 2 2 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
 */

#include "devl_internal.h"

/* One line card of a devlink instance. Instances are linked into
 * devlink->linecard_list through @list.
 */
struct devlink_linecard {
	struct list_head list;
	struct devlink *devlink;
	unsigned int index;
	const struct devlink_linecard_ops *ops;
	void *priv;
	enum devlink_linecard_state state;
	struct mutex state_lock; /* Protects state */
	const char *type;
	struct devlink_linecard_type *types; /* array of types_count entries */
	unsigned int types_count;
	u32 rel_index;
};

/* Return the index the line card was created with. */
unsigned int devlink_linecard_index(struct devlink_linecard *linecard)
{
	return linecard->index;
}

/* Walk devlink->linecard_list and return the line card whose index equals
 * @linecard_index, or NULL when there is none.
 */
static struct devlink_linecard *
devlink_linecard_get_by_index(struct devlink *devlink,
			      unsigned int linecard_index)
{
	struct devlink_linecard *devlink_linecard;

	list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) {
		if (devlink_linecard->index == linecard_index)
			return devlink_linecard;
	}
	return NULL;
}

/* True when a line card with @linecard_index is on the devlink's list. */
static bool devlink_linecard_index_exists(struct devlink *devlink,
					  unsigned int linecard_index)
{
	return devlink_linecard_get_by_index(devlink, linecard_index);
}

/* Resolve the line card named by DEVLINK_ATTR_LINECARD_INDEX in @attrs.
 *
 * Returns the line card, ERR_PTR(-ENODEV) when no line card has that index,
 * or ERR_PTR(-EINVAL) when the attribute is absent.
 */
static struct devlink_linecard *
devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs)
{
	if (attrs[DEVLINK_ATTR_LINECARD_INDEX]) {
		u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]);
		struct devlink_linecard *linecard;

		linecard = devlink_linecard_get_by_index(devlink, linecard_index);
		if (!linecard)
			return ERR_PTR(-ENODEV);
		return linecard;
	}
	return ERR_PTR(-EINVAL);
}

/* Same as devlink_linecard_get_from_attrs(), using the attributes of @info. */
static struct devlink_linecard *
devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info)
{
	return devlink_linecard_get_from_attrs(devlink, info->attrs);
}

/* One supported line card type, as filled in by ops->types_get(). */
struct devlink_linecard_type {
	const char *type;
	const void *priv;
};

/* Fill one generic netlink message describing @linecard: handle, index,
 * state, current type (if set), the nested list of supported types (if any)
 * and the nested devlink handle for linecard->rel_index.
 *
 * Returns 0 on success or -EMSGSIZE when the header or any attribute could
 * not be added; in the attribute case the message is cancelled first.
 */
static int devlink_nl_linecard_fill(struct sk_buff *msg,
				    struct devlink *devlink,
				    struct devlink_linecard *linecard,
				    enum devlink_command cmd, u32 portid,
				    u32 seq, int flags,
				    struct netlink_ext_ack *extack)
{
	struct devlink_linecard_type *linecard_type;
	struct nlattr *attr;
	void *hdr;
	int i;

	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
	if (!hdr)
		return -EMSGSIZE;

	if (devlink_nl_put_handle(msg, devlink))
		goto nla_put_failure;
	if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index))
		goto nla_put_failure;
	if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state))
		goto nla_put_failure;
	if (linecard->type &&
	    nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type))
		goto nla_put_failure;

	if (linecard->types_count) {
		attr = nla_nest_start(msg,
				      DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES);
		if (!attr)
			goto nla_put_failure;
		for (i = 0; i < linecard->types_count; i++) {
			linecard_type = &linecard->types[i];
			if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE,
					   linecard_type->type)) {
				nla_nest_cancel(msg, attr);
				goto nla_put_failure;
			}
		}
		nla_nest_end(msg, attr);
	}

	if (devlink_rel_devlink_handle_put(msg, devlink,
					   linecard->rel_index,
					   DEVLINK_ATTR_NESTED_DEVLINK,
					   NULL))
		goto nla_put_failure;

	genlmsg_end(msg, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(msg, hdr);
	return -EMSGSIZE;
}

/* Build a message for @linecard with command @cmd and pass it to
 * devlink_nl_notify_send(). Warns when @cmd is neither LINECARD_NEW nor
 * LINECARD_DEL. Returns without sending when either the registered check or
 * the notify-need check fails, or when allocating or filling the message
 * fails.
 */
static void devlink_linecard_notify(struct devlink_linecard *linecard,
				    enum devlink_command cmd)
{
	struct devlink *devlink = linecard->devlink;
	struct sk_buff *msg;
	int err;

	WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW &&
		cmd != DEVLINK_CMD_LINECARD_DEL);

	if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
		return;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return;

	err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0,
				       NULL);
	if (err) {
		nlmsg_free(msg);
		return;
	}

	devlink_nl_notify_send(devlink, msg);
}

/* Send a LINECARD_NEW notification for every line card, in list order. */
void devlink_linecards_notify_register(struct devlink *devlink)
{
	struct devlink_linecard *linecard;

	list_for_each_entry(linecard, &devlink->linecard_list, list)
		devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
}

/* Send a LINECARD_DEL notification for every line card, in reverse list
 * order.
 */
void devlink_linecards_notify_unregister(struct devlink *devlink)
{
	struct devlink_linecard *linecard;

	list_for_each_entry_reverse(linecard, &devlink->linecard_list, list)
		devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL);
}

/* Netlink doit handler: reply with the description of the line card
 * selected by the request attributes. The message is filled with
 * state_lock held.
 */
int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct devlink *devlink = info->user_ptr[0];
	struct devlink_linecard *linecard;
	struct sk_buff *msg;
	int err;

	linecard = devlink_linecard_get_from_info(devlink, info);
	if (IS_ERR(linecard))
		return PTR_ERR(linecard);

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	mutex_lock(&linecard->state_lock);
	err = devlink_nl_linecard_fill(msg, devlink, linecard,
				       DEVLINK_CMD_LINECARD_NEW,
				       info->snd_portid, info->snd_seq, 0,
				       info->extack);
	mutex_unlock(&linecard->state_lock);
	if (err) {
		nlmsg_free(msg);
		return err;
	}

	return genlmsg_reply(msg, info);
}

/* Dump helper for one devlink instance: skip the first state->idx line
 * cards, then fill a message per remaining line card. On a fill error the
 * position of the failing line card is stored in state->idx and the error
 * is returned; otherwise returns 0.
 */
static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg,
					    struct devlink *devlink,
					    struct netlink_callback *cb,
					    int flags)
{
	struct devlink_nl_dump_state *state = devlink_dump_state(cb);
	struct devlink_linecard *linecard;
	int idx = 0;
	int err = 0;

	list_for_each_entry(linecard, &devlink->linecard_list, list) {
		if (idx < state->idx) {
			idx++;
			continue;
		}
		mutex_lock(&linecard->state_lock);
		err = devlink_nl_linecard_fill(msg, devlink, linecard,
					       DEVLINK_CMD_LINECARD_NEW,
					       NETLINK_CB(cb->skb).portid,
					       cb->nlh->nlmsg_seq, flags,
					       cb->extack);
		mutex_unlock(&linecard->state_lock);
		if (err) {
			state->idx = idx;
			break;
		}
		idx++;
	}

	return err;
}

/* Netlink dumpit handler: delegate to devlink_nl_dumpit() with the per-devlink
 * helper above.
 */
int devlink_nl_linecard_get_dumpit(struct sk_buff *skb,
				   struct netlink_callback *cb)
{
	return devlink_nl_dumpit(skb, cb, devlink_nl_linecard_get_dump_one);
}

/* Find the supported-type entry whose name equals @type (strcmp), or NULL. */
static struct devlink_linecard_type *
devlink_linecard_type_lookup(struct devlink_linecard *linecard,
			     const char *type)
{
	struct devlink_linecard_type *linecard_type;
	int i;

	for (i = 0; i < linecard->types_count; i++) {
		linecard_type = &linecard->types[i];
		if (!strcmp(type, linecard_type->type))
			return linecard_type;
	}
	return NULL;
}

/* Start provisioning @linecard as @type.
 *
 * Returns -EBUSY while a provision/unprovision is in flight or when the card
 * is already provisioned (0 in that last case if ops->same_provision()
 * reports a match), -EINVAL for an unsupported type, otherwise the result of
 * ops->provision(). The provision op is called with state_lock released.
 */
static int devlink_linecard_type_set(struct devlink_linecard *linecard,
				     const char *type,
				     struct netlink_ext_ack *extack)
{
	const struct devlink_linecard_ops *ops = linecard->ops;
	struct devlink_linecard_type *linecard_type;
	int err;

	mutex_lock(&linecard->state_lock);
	if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) {
		NL_SET_ERR_MSG(extack, "Line card is currently being provisioned");
		err = -EBUSY;
		goto out;
	}
	if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) {
		NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned");
		err = -EBUSY;
		goto out;
	}

	linecard_type = devlink_linecard_type_lookup(linecard, type);
	if (!linecard_type) {
		NL_SET_ERR_MSG(extack, "Unsupported line card type provided");
		err = -EINVAL;
		goto out;
	}

	if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED &&
	    linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) {
		NL_SET_ERR_MSG(extack, "Line card already provisioned");
		err = -EBUSY;
		/* Check if the line card is provisioned in the same
		 * way the user asks. In case it is, make the operation
		 * to return success.
		 */
		if (ops->same_provision &&
		    ops->same_provision(linecard, linecard->priv,
					linecard_type->type,
					linecard_type->priv))
			err = 0;
		goto out;
	}

	/* Record and announce the PROVISIONING state before dropping the lock
	 * and calling into the driver.
	 */
	linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING;
	linecard->type = linecard_type->type;
	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
	mutex_unlock(&linecard->state_lock);
	err = ops->provision(linecard, linecard->priv, linecard_type->type,
			     linecard_type->priv, extack);
	if (err) {
		/* Provisioning failed. Assume the linecard is unprovisioned
		 * for future operations.
		 */
		mutex_lock(&linecard->state_lock);
		linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
		linecard->type = NULL;
		devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
		mutex_unlock(&linecard->state_lock);
	}
	return err;

out:
	mutex_unlock(&linecard->state_lock);
	return err;
}

/* Start unprovisioning @linecard.
 *
 * Returns -EBUSY while a provision/unprovision is in flight, 0 when the card
 * was in PROVISIONING_FAILED (it is reset to UNPROVISIONED) or already
 * UNPROVISIONED (an extack message is still set in that case), otherwise the
 * result of ops->unprovision(), which is called with state_lock released.
 */
static int devlink_linecard_type_unset(struct devlink_linecard *linecard,
				       struct netlink_ext_ack *extack)
{
	int err;

	mutex_lock(&linecard->state_lock);
	if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) {
		NL_SET_ERR_MSG(extack, "Line card is currently being provisioned");
		err = -EBUSY;
		goto out;
	}
	if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) {
		NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned");
		err = -EBUSY;
		goto out;
	}
	if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) {
		linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
		linecard->type = NULL;
		devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
		err = 0;
		goto out;
	}

	if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) {
		NL_SET_ERR_MSG(extack, "Line card is not provisioned");
		err = 0;
		goto out;
	}
	linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING;
	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
	mutex_unlock(&linecard->state_lock);
	err = linecard->ops->unprovision(linecard, linecard->priv,
					 extack);
	if (err) {
		/* Unprovisioning failed. Assume the linecard is unprovisioned
		 * for future operations.
		 */
		mutex_lock(&linecard->state_lock);
		linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
		linecard->type = NULL;
		devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
		mutex_unlock(&linecard->state_lock);
	}
	return err;

out:
	mutex_unlock(&linecard->state_lock);
	return err;
}

/* Netlink doit handler for line card set. When DEVLINK_ATTR_LINECARD_TYPE is
 * present, a non-empty string requests provisioning as that type and an
 * empty string requests unprovisioning. Without the attribute nothing is
 * changed and 0 is returned.
 */
int devlink_nl_linecard_set_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct netlink_ext_ack *extack = info->extack;
	struct devlink *devlink = info->user_ptr[0];
	struct devlink_linecard *linecard;
	int err;

	linecard = devlink_linecard_get_from_info(devlink, info);
	if (IS_ERR(linecard))
		return PTR_ERR(linecard);

	if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) {
		const char *type;

		type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]);
		if (strcmp(type, "")) {
			err = devlink_linecard_type_set(linecard, type, extack);
			if (err)
				return err;
		} else {
			err = devlink_linecard_type_unset(linecard, extack);
			if (err)
				return err;
		}
	}

	return 0;
}

/* Query the driver for its supported types and cache them in
 * linecard->types / linecard->types_count.
 *
 * Returns 0 on success or -ENOMEM when the array allocation fails.
 */
static int devlink_linecard_types_init(struct devlink_linecard *linecard)
{
	struct devlink_linecard_type *linecard_type;
	unsigned int count;
	int i;

	count = linecard->ops->types_count(linecard, linecard->priv);
	/* NOTE(review): presumably allocates an array of @count entries. */
	linecard->types = kmalloc_objs(*linecard_type, count);
	if (!linecard->types)
		return -ENOMEM;
	linecard->types_count = count;

	for (i = 0; i < count; i++) {
		linecard_type = &linecard->types[i];
		linecard->ops->types_get(linecard, linecard->priv, i,
					 &linecard_type->type,
					 &linecard_type->priv);
	}
	return 0;
}

/* Free the array allocated by devlink_linecard_types_init(). */
static void devlink_linecard_types_fini(struct devlink_linecard *linecard)
{
	kfree(linecard->types);
}

/**
 *	devl_linecard_create - Create devlink linecard
 *
 *	@devlink: devlink
 *	@linecard_index: driver-specific numerical identifier of the linecard
 *	@ops: linecards ops
 *	@priv: user priv pointer
 *
 *	Create devlink linecard instance with provided linecard index.
 *	Caller can use any indexing, even hw-related one.
 *
 *	Return: Line card structure or an ERR_PTR() encoded error code.
 */
struct devlink_linecard *
devl_linecard_create(struct devlink *devlink, unsigned int linecard_index,
		     const struct devlink_linecard_ops *ops, void *priv)
{
	struct devlink_linecard *linecard;
	int err;

	/* These ops are mandatory; same_provision is the only optional one
	 * used in this file.
	 */
	if (WARN_ON(!ops || !ops->provision || !ops->unprovision ||
		    !ops->types_count || !ops->types_get))
		return ERR_PTR(-EINVAL);

	if (devlink_linecard_index_exists(devlink, linecard_index))
		return ERR_PTR(-EEXIST);

	linecard = kzalloc_obj(*linecard);
	if (!linecard)
		return ERR_PTR(-ENOMEM);

	linecard->devlink = devlink;
	linecard->index = linecard_index;
	linecard->ops = ops;
	linecard->priv = priv;
	linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
	mutex_init(&linecard->state_lock);

	err = devlink_linecard_types_init(linecard);
	if (err) {
		mutex_destroy(&linecard->state_lock);
		kfree(linecard);
		return ERR_PTR(err);
	}

	list_add_tail(&linecard->list, &devlink->linecard_list);
	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
	return linecard;
}
EXPORT_SYMBOL_GPL(devl_linecard_create);

/**
 *	devl_linecard_destroy - Destroy devlink linecard
 *
 *	@linecard: devlink linecard
 */
void devl_linecard_destroy(struct devlink_linecard *linecard)
{
	/* Announce the removal before unlinking and freeing the line card. */
	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL);
	list_del(&linecard->list);
	devlink_linecard_types_fini(linecard);
	mutex_destroy(&linecard->state_lock);
	kfree(linecard);
}
EXPORT_SYMBOL_GPL(devl_linecard_destroy);

/**
 *	devlink_linecard_provision_set - Set provisioning on linecard
 *
 *	@linecard: devlink linecard
 *	@type: linecard type
 *
 *	This is either called directly from the provision() op call or
 *	as a result of the provision() op call asynchronously.
 */
void devlink_linecard_provision_set(struct devlink_linecard *linecard,
				    const char *type)
{
	mutex_lock(&linecard->state_lock);
	/* Warn when a type is already recorded and differs from @type. */
	WARN_ON(linecard->type && strcmp(linecard->type, type));
	linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED;
	linecard->type = type;
	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
	mutex_unlock(&linecard->state_lock);
}
EXPORT_SYMBOL_GPL(devlink_linecard_provision_set);

/**
 *	devlink_linecard_provision_clear - Clear provisioning on linecard
 *
 *	@linecard: devlink linecard
 *
 *	This is either called directly from the unprovision() op call or
 *	as a result of the unprovision() op call asynchronously.
 */
void devlink_linecard_provision_clear(struct devlink_linecard *linecard)
{
	mutex_lock(&linecard->state_lock);
	linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
	linecard->type = NULL;
	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
	mutex_unlock(&linecard->state_lock);
}
EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear);

/**
 *	devlink_linecard_provision_fail - Fail provisioning on linecard
 *
 *	@linecard: devlink linecard
 *
 *	This is either called directly from the provision() op call or
 *	as a result of the provision() op call asynchronously.
 */
void devlink_linecard_provision_fail(struct devlink_linecard *linecard)
{
	mutex_lock(&linecard->state_lock);
	/* Only the state changes here; linecard->type is left as it is. */
	linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED;
	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
	mutex_unlock(&linecard->state_lock);
}
EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail);

/**
 *	devlink_linecard_activate - Set linecard active
 *
 *	@linecard: devlink linecard
 */
void devlink_linecard_activate(struct devlink_linecard *linecard)
{
	mutex_lock(&linecard->state_lock);
	/* Warns, but still proceeds, when the card is not PROVISIONED. */
	WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED);
	linecard->state = DEVLINK_LINECARD_STATE_ACTIVE;
	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
	mutex_unlock(&linecard->state_lock);
}
EXPORT_SYMBOL_GPL(devlink_linecard_activate);

/**
 *	devlink_linecard_deactivate - Set linecard inactive
 *
 *	@linecard: devlink linecard
 */
void devlink_linecard_deactivate(struct devlink_linecard *linecard)
{
	mutex_lock(&linecard->state_lock);
	switch (linecard->state) {
	case DEVLINK_LINECARD_STATE_ACTIVE:
		linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED;
		devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
		break;
	case DEVLINK_LINECARD_STATE_UNPROVISIONING:
		/* Line card is being deactivated as part
		 * of unprovisioning flow.
		 */
		break;
	default:
		WARN_ON(1);
		break;
	}
	mutex_unlock(&linecard->state_lock);
}
EXPORT_SYMBOL_GPL(devlink_linecard_deactivate);

/* Callback passed to devlink_rel_nested_in_add(): re-send a LINECARD_NEW
 * notification for the line card with @linecard_index, if it still exists.
 */
static void devlink_linecard_rel_notify_cb(struct devlink *devlink,
					   u32 linecard_index)
{
	struct devlink_linecard *linecard;

	linecard = devlink_linecard_get_by_index(devlink, linecard_index);
	if (!linecard)
		return;
	devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
}

/* Callback passed to devlink_rel_nested_in_add(): reset the line card's
 * rel_index to 0, but only if the card exists and still refers to
 * @rel_index.
 */
static void devlink_linecard_rel_cleanup_cb(struct devlink *devlink,
					    u32 linecard_index, u32 rel_index)
{
	struct devlink_linecard *linecard;

	linecard = devlink_linecard_get_by_index(devlink, linecard_index);
	if (linecard && linecard->rel_index == rel_index)
		linecard->rel_index = 0;
}

/**
 * devlink_linecard_nested_dl_set - Attach/detach nested devlink
 *				    instance to linecard.
 *
 * @linecard: devlink linecard
 * @nested_devlink: devlink instance to attach or NULL to detach
 */
int devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
				   struct devlink *nested_devlink)
{
	return devlink_rel_nested_in_add(&linecard->rel_index,
					 linecard->devlink->index,
					 linecard->index,
					 devlink_linecard_rel_notify_cb,
					 devlink_linecard_rel_cleanup_cb,
					 nested_devlink);
}
EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set);
16 15 3 13 15 1 15 9 8 8 8 8 8 4 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 // SPDX-License-Identifier: GPL-2.0-only #include "common.h" #include "netlink.h" struct linkinfo_req_info { struct ethnl_req_info base; }; struct linkinfo_reply_data { struct ethnl_reply_data base; struct ethtool_link_ksettings ksettings; struct ethtool_link_settings *lsettings; }; #define LINKINFO_REPDATA(__reply_base) \ container_of(__reply_base, struct linkinfo_reply_data, base) const struct nla_policy ethnl_linkinfo_get_policy[] = { [ETHTOOL_A_LINKINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int linkinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; data->lsettings = &data->ksettings.base; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = __ethtool_get_link_ksettings(dev, &data->ksettings); if (ret < 0) GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); ethnl_ops_complete(dev); return ret; } static int linkinfo_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u8)) /* LINKINFO_PORT */ + nla_total_size(sizeof(u8)) /* LINKINFO_PHYADDR */ + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX */ + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX_CTRL */ + nla_total_size(sizeof(u8)) /* LINKINFO_TRANSCEIVER */ + 0; } static int linkinfo_fill_reply(struct sk_buff *skb, const 
struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); if (nla_put_u8(skb, ETHTOOL_A_LINKINFO_PORT, data->lsettings->port) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_PHYADDR, data->lsettings->phy_address) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX, data->lsettings->eth_tp_mdix) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, data->lsettings->eth_tp_mdix_ctrl) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_TRANSCEIVER, data->lsettings->transceiver)) return -EMSGSIZE; return 0; } /* LINKINFO_SET */ const struct nla_policy ethnl_linkinfo_set_policy[] = { [ETHTOOL_A_LINKINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_U8 }, [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_U8 }, [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 }, }; static int ethnl_set_linkinfo_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; if (!ops->get_link_ksettings || !ops->set_link_ksettings) return -EOPNOTSUPP; return 1; } static int ethnl_set_linkinfo(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_link_ksettings ksettings = {}; struct ethtool_link_settings *lsettings; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; int ret; ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); return ret; } lsettings = &ksettings.base; ethnl_update_u8(&lsettings->port, tb[ETHTOOL_A_LINKINFO_PORT], &mod); ethnl_update_u8(&lsettings->phy_address, tb[ETHTOOL_A_LINKINFO_PHYADDR], &mod); ethnl_update_u8(&lsettings->eth_tp_mdix_ctrl, tb[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &mod); if (!mod) return 0; ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "link settings update failed"); return ret; } return 1; } const struct 
ethnl_request_ops ethnl_linkinfo_request_ops = { .request_cmd = ETHTOOL_MSG_LINKINFO_GET, .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY, .hdr_attr = ETHTOOL_A_LINKINFO_HEADER, .req_info_size = sizeof(struct linkinfo_req_info), .reply_data_size = sizeof(struct linkinfo_reply_data), .prepare_data = linkinfo_prepare_data, .reply_size = linkinfo_reply_size, .fill_reply = linkinfo_fill_reply, .set_validate = ethnl_set_linkinfo_validate, .set = ethnl_set_linkinfo, .set_ntf_cmd = ETHTOOL_MSG_LINKINFO_NTF, };
626 624 767 1 737 44 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 
1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 // SPDX-License-Identifier: GPL-2.0-or-later /* * KVM paravirt_ops implementation * * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright IBM Corporation, 2007 * Authors: Anthony Liguori <aliguori@us.ibm.com> */ #define pr_fmt(fmt) "kvm-guest: " fmt #include <linux/context_tracking.h> #include <linux/init.h> #include <linux/irq.h> #include <linux/kernel.h> #include <linux/kvm_para.h> #include <linux/cpu.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hardirq.h> #include <linux/notifier.h> #include <linux/reboot.h> #include <linux/hash.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/kprobes.h> #include <linux/nmi.h> #include <linux/swait.h> #include <linux/syscore_ops.h> #include <linux/cc_platform.h> #include <linux/efi.h> #include <linux/kvm_types.h> #include <linux/sched/cputime.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> #include <asm/desc.h> #include <asm/tlbflush.h> #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> #include <asm/mtrr.h> #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> 
#include <asm/msr.h> #include <asm/ptrace.h> #include <asm/reboot.h> #include <asm/svm.h> #include <asm/e820/api.h> DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled); static int kvmapf = 1; static int __init parse_no_kvmapf(char *arg) { kvmapf = 0; return 0; } early_param("no-kvmapf", parse_no_kvmapf); static int steal_acc = 1; static int __init parse_no_stealacc(char *arg) { steal_acc = 0; return 0; } early_param("no-steal-acc", parse_no_stealacc); static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled); static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; static int has_guest_poll = 0; #define KVM_TASK_SLEEP_HASHBITS 8 #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) struct kvm_task_sleep_node { struct hlist_node link; struct swait_queue_head wq; u32 token; int cpu; bool dummy; }; static struct kvm_task_sleep_head { raw_spinlock_t lock; struct hlist_head list; } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, u32 token) { struct hlist_node *p; hlist_for_each(p, &b->list) { struct kvm_task_sleep_node *n = hlist_entry(p, typeof(*n), link); if (n->token == token) return n; } return NULL; } static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node *e; raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { struct kvm_task_sleep_node *dummy = NULL; /* * The entry can either be a 'dummy' entry (which is put on the * list when wake-up happens ahead of APF handling completion) * or a token from another task which should not be touched. 
*/ if (e->dummy) { hlist_del(&e->link); dummy = e; } raw_spin_unlock(&b->lock); kfree(dummy); return false; } n->token = token; n->cpu = smp_processor_id(); n->dummy = false; init_swait_queue_head(&n->wq); hlist_add_head(&n->link, &b->list); raw_spin_unlock(&b->lock); return true; } /* * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled * @token: Token to identify the sleep node entry * * Invoked from the async pagefault handling code or from the VM exit page * fault handler. In both cases RCU is watching. */ void kvm_async_pf_task_wait_schedule(u32 token) { struct kvm_task_sleep_node n; DECLARE_SWAITQUEUE(wait); lockdep_assert_irqs_disabled(); if (!kvm_async_pf_queue_task(token, &n)) return; for (;;) { prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; local_irq_enable(); schedule(); local_irq_disable(); } finish_swait(&n.wq, &wait); } EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); if (swq_has_sleeper(&n->wq)) swake_up_one(&n->wq); } static void apf_task_wake_all(void) { int i; for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; struct kvm_task_sleep_node *n; struct hlist_node *p, *next; raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } raw_spin_unlock(&b->lock); } } static void kvm_async_pf_task_wake(u32 token) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node *n, *dummy = NULL; if (token == ~0) { apf_task_wake_all(); return; } again: raw_spin_lock(&b->lock); n = _find_apf_task(b, token); if (!n) { /* * Async #PF not yet handled, add a dummy entry for the token. 
* Allocating the token must be down outside of the raw lock * as the allocator is preemptible on PREEMPT_RT kernels. */ if (!dummy) { raw_spin_unlock(&b->lock); dummy = kzalloc_obj(*dummy, GFP_ATOMIC); /* * Continue looping on allocation failure, eventually * the async #PF will be handled and allocating a new * node will be unnecessary. */ if (!dummy) cpu_relax(); /* * Recheck for async #PF completion before enqueueing * the dummy token to avoid duplicate list entries. */ goto again; } dummy->token = token; dummy->cpu = smp_processor_id(); dummy->dummy = true; init_swait_queue_head(&dummy->wq); hlist_add_head(&dummy->link, &b->list); dummy = NULL; } else { apf_task_wake_one(n); } raw_spin_unlock(&b->lock); /* A dummy token might be allocated and ultimately not used. */ kfree(dummy); } noinstr u32 kvm_read_and_reset_apf_flags(void) { u32 flags = 0; if (__this_cpu_read(async_pf_enabled)) { flags = __this_cpu_read(apf_reason.flags); __this_cpu_write(apf_reason.flags, 0); } return flags; } EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags); noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { u32 flags = kvm_read_and_reset_apf_flags(); irqentry_state_t state; if (!flags) return false; state = irqentry_enter(regs); instrumentation_begin(); /* * If the host managed to inject an async #PF into an interrupt * disabled region, then die hard as this is not going to end well * and the host side is seriously broken. */ if (unlikely(!(regs->flags & X86_EFLAGS_IF))) panic("Host injected async #PF in interrupt disabled region\n"); if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { if (unlikely(!(user_mode(regs)))) panic("Host injected async #PF in kernel mode\n"); /* Page is swapped out by the host. 
*/ kvm_async_pf_task_wait_schedule(token); } else { WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags); } instrumentation_end(); irqentry_exit(regs, state); return true; } DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); u32 token; apic_eoi(); inc_irq_stat(irq_hv_callback_count); if (__this_cpu_read(async_pf_enabled)) { token = __this_cpu_read(apf_reason.token); kvm_async_pf_task_wake(token); __this_cpu_write(apf_reason.token, 0); wrmsrq(MSR_KVM_ASYNC_PF_ACK, 1); } set_irq_regs(old_regs); } static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_info.io_delay = false; #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif } static void kvm_register_steal_time(void) { int cpu = smp_processor_id(); struct kvm_steal_time *st = &per_cpu(steal_time, cpu); if (!has_steal_clock) return; wrmsrq(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); pr_debug("stealtime: cpu %d, msr %llx\n", cpu, (unsigned long long) slow_virt_to_phys(st)); } static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; static notrace __maybe_unused void kvm_guest_apic_eoi_write(void) { /** * This relies on __test_and_clear_bit to modify the memory * in a way that is atomic with respect to the local CPU. * The hypervisor only accesses this memory from the local CPU so * there's no need for lock or memory barriers. * An optimization barrier is implied in apic write. 
*/ if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) return; apic_native_eoi(); } static void kvm_guest_cpu_init(void) { if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { u64 pa; WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; wrmsrq(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); wrmsrq(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(async_pf_enabled, true); pr_debug("setup async PF for cpu %d\n", smp_processor_id()); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { unsigned long pa; /* Size alignment is implied but just to make it explicit. */ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); __this_cpu_write(kvm_apic_eoi, 0); pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi)) | KVM_MSR_ENABLED; wrmsrq(MSR_KVM_PV_EOI_EN, pa); } if (has_steal_clock) kvm_register_steal_time(); } static void kvm_pv_disable_apf(void) { if (!__this_cpu_read(async_pf_enabled)) return; wrmsrq(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(async_pf_enabled, false); pr_debug("disable async PF for cpu %d\n", smp_processor_id()); } static void kvm_disable_steal_time(void) { if (!has_steal_clock) return; wrmsrq(MSR_KVM_STEAL_TIME, 0); } static u64 kvm_steal_clock(int cpu) { u64 steal; struct kvm_steal_time *src; int version; src = &per_cpu(steal_time, cpu); do { version = src->version; virt_rmb(); steal = src->steal; virt_rmb(); } while ((version & 1) || (version != src->version)); return steal; } static inline __init void __set_percpu_decrypted(void *ptr, unsigned long size) { early_set_memory_decrypted((unsigned long) ptr, size); } /* * Iterate through all possible CPUs and map the memory region pointed * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once. 
* * Note: we iterate through all possible CPUs to ensure that CPUs * hotplugged will have their per-cpu variable already mapped as * decrypted. */ static void __init sev_map_percpu_data(void) { int cpu; if (cc_vendor != CC_VENDOR_AMD || !cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; for_each_possible_cpu(cpu) { __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason)); __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time)); __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi)); } } static void kvm_guest_cpu_offline(bool shutdown) { kvm_disable_steal_time(); if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) wrmsrq(MSR_KVM_PV_EOI_EN, 0); if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) wrmsrq(MSR_KVM_MIGRATION_CONTROL, 0); kvm_pv_disable_apf(); if (!shutdown) apf_task_wake_all(); kvmclock_disable(); } static int kvm_cpu_online(unsigned int cpu) { unsigned long flags; local_irq_save(flags); kvm_guest_cpu_init(); local_irq_restore(flags); return 0; } #ifdef CONFIG_SMP static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); static bool pv_tlb_flush_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && !boot_cpu_has(X86_FEATURE_MWAIT) && (num_possible_cpus() != 1)); } static bool pv_ipi_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) && (num_possible_cpus() != 1)); } static bool pv_sched_yield_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && !boot_cpu_has(X86_FEATURE_MWAIT) && (num_possible_cpus() != 1)); } #define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) static void __send_ipi_mask(const struct cpumask *mask, int vector) { unsigned long flags; int cpu, min = 0, max = 0; #ifdef CONFIG_X86_64 __uint128_t ipi_bitmap = 0; #else u64 ipi_bitmap = 0; #endif u32 apic_id, 
icr; long ret; if (cpumask_empty(mask)) return; local_irq_save(flags); switch (vector) { default: icr = APIC_DM_FIXED | vector; break; case NMI_VECTOR: icr = APIC_DM_NMI; break; } for_each_cpu(cpu, mask) { apic_id = per_cpu(x86_cpu_to_apicid, cpu); if (!ipi_bitmap) { min = max = apic_id; } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { ipi_bitmap <<= min - apic_id; min = apic_id; } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) { max = apic_id < max ? max : apic_id; } else { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", ret); min = max = apic_id; ipi_bitmap = 0; } __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap); } if (ipi_bitmap) { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", ret); } local_irq_restore(flags); } static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) { __send_ipi_mask(mask, vector); } static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) { unsigned int this_cpu = smp_processor_id(); struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); const struct cpumask *local_mask; cpumask_copy(new_mask, mask); cpumask_clear_cpu(this_cpu, new_mask); local_mask = new_mask; __send_ipi_mask(local_mask, vector); } static int __init setup_efi_kvm_sev_migration(void) { efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled"; efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID; efi_status_t status; unsigned long size; bool enabled; if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) || !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) return 0; if (!efi_enabled(EFI_BOOT)) return 0; if (!efi_enabled(EFI_RUNTIME_SERVICES)) { pr_info("%s : EFI runtime services are not enabled\n", 
__func__); return 0; } size = sizeof(enabled); /* Get variable contents into buffer */ status = efi.get_variable(efi_sev_live_migration_enabled, &efi_variable_guid, NULL, &size, &enabled); if (status == EFI_NOT_FOUND) { pr_info("%s : EFI live migration variable not found\n", __func__); return 0; } if (status != EFI_SUCCESS) { pr_info("%s : EFI variable retrieval failed\n", __func__); return 0; } if (enabled == 0) { pr_info("%s: live migration disabled in EFI\n", __func__); return 0; } pr_info("%s : live migration enabled in EFI\n", __func__); wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); return 1; } late_initcall(setup_efi_kvm_sev_migration); /* * Set the IPI entry points */ static __init void kvm_setup_pv_ipi(void) { apic_update_callback(send_IPI_mask, kvm_send_ipi_mask); apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself); pr_info("setup PV IPIs\n"); } static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) { int cpu; native_send_call_func_ipi(mask); /* Make sure other vCPUs get a chance to run if they need to. */ for_each_cpu(cpu, mask) { if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) { kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); break; } } } static void kvm_flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info) { u8 state; int cpu; struct kvm_steal_time *src; struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); cpumask_copy(flushmask, cpumask); /* * We have to call flush only on online vCPUs. And * queue flush_on_enter for pre-empted vCPUs */ for_each_cpu(cpu, flushmask) { /* * The local vCPU is never preempted, so we do not explicitly * skip check for local vCPU - it will never be cleared from * flushmask. 
*/ src = &per_cpu(steal_time, cpu); state = READ_ONCE(src->preempted); if ((state & KVM_VCPU_PREEMPTED)) { if (try_cmpxchg(&src->preempted, &state, state | KVM_VCPU_FLUSH_TLB)) __cpumask_clear_cpu(cpu, flushmask); } } native_flush_tlb_multi(flushmask, info); } static __init int kvm_alloc_cpumask(void) { int cpu; if (!kvm_para_available() || nopv) return 0; if (pv_tlb_flush_supported() || pv_ipi_supported()) for_each_possible_cpu(cpu) { zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), GFP_KERNEL, cpu_to_node(cpu)); } return 0; } arch_initcall(kvm_alloc_cpumask); static void __init kvm_smp_prepare_boot_cpu(void) { /* * Map the per-cpu variables as decrypted before kvm_guest_cpu_init() * shares the guest physical address with the hypervisor. */ sev_map_percpu_data(); kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); kvm_spinlock_init(); } static int kvm_cpu_down_prepare(unsigned int cpu) { unsigned long flags; local_irq_save(flags); kvm_guest_cpu_offline(false); local_irq_restore(flags); return 0; } #endif static int kvm_suspend(void *data) { u64 val = 0; kvm_guest_cpu_offline(false); #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) rdmsrq(MSR_KVM_POLL_CONTROL, val); has_guest_poll = !(val & 1); #endif return 0; } static void kvm_resume(void *data) { kvm_cpu_online(raw_smp_processor_id()); #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) wrmsrq(MSR_KVM_POLL_CONTROL, 0); #endif } static const struct syscore_ops kvm_syscore_ops = { .suspend = kvm_suspend, .resume = kvm_resume, }; static struct syscore kvm_syscore = { .ops = &kvm_syscore_ops, }; static void kvm_pv_guest_cpu_reboot(void *unused) { kvm_guest_cpu_offline(true); } static int kvm_pv_reboot_notify(struct notifier_block *nb, unsigned long code, void *unused) { if (code == SYS_RESTART) on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); return NOTIFY_DONE; } static struct notifier_block kvm_pv_reboot_nb = { 
.notifier_call = kvm_pv_reboot_notify, }; /* * After a PV feature is registered, the host will keep writing to the * registered memory location. If the guest happens to shutdown, this memory * won't be valid. In cases like kexec, in which you install a new kernel, this * means a random memory location will be kept being written. */ #ifdef CONFIG_CRASH_DUMP static void kvm_crash_shutdown(struct pt_regs *regs) { kvm_guest_cpu_offline(true); native_machine_crash_shutdown(regs); } #endif #if defined(CONFIG_X86_32) || !defined(CONFIG_SMP) bool __kvm_vcpu_is_preempted(long cpu); __visible bool __kvm_vcpu_is_preempted(long cpu) { struct kvm_steal_time *src = &per_cpu(steal_time, cpu); return !!(src->preempted & KVM_VCPU_PREEMPTED); } PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); #else #include <asm/asm-offsets.h> extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); /* * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and * restoring to/from the stack. */ #define PV_VCPU_PREEMPTED_ASM \ "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \ "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \ "setne %al\n\t" DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted, PV_VCPU_PREEMPTED_ASM, .text); #endif static void __init kvm_guest_init(void) { int i; paravirt_ops_setup(); register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) raw_spin_lock_init(&async_pf_sleepers[i].lock); if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; static_call_update(pv_steal_clock, kvm_steal_clock); #ifdef CONFIG_PARAVIRT_SPINLOCKS pv_ops_lock.vcpu_is_preempted = PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); #endif } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_update_callback(eoi, kvm_guest_apic_eoi_write); if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt); } 
#ifdef CONFIG_SMP if (pv_tlb_flush_supported()) { pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; pr_info("KVM setup pv remote TLB flush\n"); } smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; if (pv_sched_yield_supported()) { smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; pr_info("setup PV sched yield\n"); } if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", kvm_cpu_online, kvm_cpu_down_prepare) < 0) pr_err("failed to install cpu hotplug callbacks\n"); #else sev_map_percpu_data(); kvm_guest_cpu_init(); #endif #ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = kvm_crash_shutdown; #endif register_syscore(&kvm_syscore); /* * Hard lockup detection is enabled by default. Disable it, as guests * can get false positives too easily, for example if the host is * overcommitted. */ hardlockup_detector_disable(); } static noinline uint32_t __kvm_cpuid_base(void) { if (boot_cpu_data.cpuid_level < 0) return 0; /* So we don't blow up on old processors */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return cpuid_base_hypervisor(KVM_SIGNATURE, 0); return 0; } static inline uint32_t kvm_cpuid_base(void) { static int kvm_cpuid_base = -1; if (kvm_cpuid_base == -1) kvm_cpuid_base = __kvm_cpuid_base(); return kvm_cpuid_base; } bool kvm_para_available(void) { return kvm_cpuid_base() != 0; } EXPORT_SYMBOL_GPL(kvm_para_available); unsigned int kvm_arch_para_features(void) { return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES); } unsigned int kvm_arch_para_hints(void) { return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES); } EXPORT_SYMBOL_GPL(kvm_arch_para_hints); static uint32_t __init kvm_detect(void) { return kvm_cpuid_base(); } static void __init kvm_apic_init(void) { #ifdef CONFIG_SMP if (pv_ipi_supported()) kvm_setup_pv_ipi(); #endif } static bool __init kvm_msi_ext_dest_id(void) { return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID); } static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc) { 
kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages, KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); } static void __init kvm_init_platform(void) { u64 tolud = PFN_PHYS(e820__end_of_low_ram_pfn()); /* * Note, hardware requires variable MTRR ranges to be power-of-2 sized * and naturally aligned. But when forcing guest MTRR state, Linux * doesn't program the forced ranges into hardware. Don't bother doing * the math to generate a technically-legal range. */ struct mtrr_var_range pci_hole = { .base_lo = tolud | X86_MEMTYPE_UC, .mask_lo = (u32)(~(SZ_4G - tolud - 1)) | MTRR_PHYSMASK_V, .mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32, }; if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) { unsigned long nr_pages; int i; pv_ops.mmu.notify_page_enc_status_changed = kvm_sev_hc_page_enc_status; /* * Reset the host's shared pages list related to kernel * specific page encryption status settings before we load a * new kernel by kexec. Reset the page encryption status * during early boot instead of just before kexec to avoid SMP * races during kvm_pv_guest_cpu_reboot(). * NOTE: We cannot reset the complete shared pages list * here as we need to retain the UEFI/OVMF firmware * specific settings. */ for (i = 0; i < e820_table->nr_entries; i++) { struct e820_entry *entry = &e820_table->entries[i]; if (entry->type != E820_TYPE_RAM) continue; nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE); kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr, nr_pages, KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); } /* * Ensure that _bss_decrypted section is marked as decrypted in the * shared pages list. */ early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted, __end_bss_decrypted - __start_bss_decrypted, 0); /* * If not booted using EFI, enable Live migration support. 
*/ if (!efi_enabled(EFI_BOOT)) wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); } kvmclock_init(); x86_platform.apic_post_init = kvm_apic_init; /* * Set WB as the default cache mode for SEV-SNP and TDX, with a single * UC range for the legacy PCI hole, e.g. so that devices that expect * to get UC/WC mappings don't get surprised with WB. */ guest_force_mtrr_state(&pci_hole, 1, MTRR_TYPE_WRBACK); } #if defined(CONFIG_AMD_MEM_ENCRYPT) static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) { /* RAX and CPL are already in the GHCB */ ghcb_set_rbx(ghcb, regs->bx); ghcb_set_rcx(ghcb, regs->cx); ghcb_set_rdx(ghcb, regs->dx); ghcb_set_rsi(ghcb, regs->si); } static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) { /* No checking of the return state needed */ return true; } #endif const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, .init.msi_ext_dest_id = kvm_msi_ext_dest_id, .init.init_platform = kvm_init_platform, #if defined(CONFIG_AMD_MEM_ENCRYPT) .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare, .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish, #endif }; static __init int activate_jump_labels(void) { if (has_steal_clock) { static_key_slow_inc(&paravirt_steal_enabled); if (steal_acc) static_key_slow_inc(&paravirt_steal_rq_enabled); } return 0; } arch_initcall(activate_jump_labels); #ifdef CONFIG_PARAVIRT_SPINLOCKS /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ static void kvm_kick_cpu(int cpu) { unsigned long flags = 0; u32 apicid; apicid = per_cpu(x86_cpu_to_apicid, cpu); kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); } #include <asm/qspinlock.h> static void kvm_wait(u8 *ptr, u8 val) { if (in_nmi()) return; /* * halt until it's our turn and kicked. 
Note that we do safe halt * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. */ if (irqs_disabled()) { if (READ_ONCE(*ptr) == val) halt(); } else { local_irq_disable(); /* safe_halt() will enable IRQ */ if (READ_ONCE(*ptr) == val) safe_halt(); else local_irq_enable(); } } /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ void __init kvm_spinlock_init(void) { /* * Disable PV spinlocks and use native qspinlock when dedicated pCPUs * are available. */ if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n"); goto out; } if (num_possible_cpus() == 1) { pr_info("PV spinlocks disabled, single CPU\n"); goto out; } if (nopvspin) { pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n"); goto out; } /* * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is * preferred over native qspinlock when vCPU is preempted. */ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { pr_info("PV spinlocks disabled, no host support\n"); return; } pr_info("PV spinlocks enabled\n"); __pv_init_lock_hash(); pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_ops_lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_ops_lock.wait = kvm_wait; pv_ops_lock.kick = kvm_kick_cpu; /* * When PV spinlock is enabled which is preferred over * virt_spin_lock(), virt_spin_lock_key's value is meaningless. * Just disable it anyway. 
*/ out: static_branch_disable(&virt_spin_lock_key); } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL static void kvm_disable_host_haltpoll(void *i) { wrmsrq(MSR_KVM_POLL_CONTROL, 0); } static void kvm_enable_host_haltpoll(void *i) { wrmsrq(MSR_KVM_POLL_CONTROL, 1); } void arch_haltpoll_enable(unsigned int cpu) { if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { pr_err_once("host does not support poll control\n"); pr_err_once("host upgrade recommended\n"); return; } /* Enable guest halt poll disables host halt poll */ smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1); } EXPORT_SYMBOL_GPL(arch_haltpoll_enable); void arch_haltpoll_disable(unsigned int cpu) { if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) return; /* Disable guest halt poll enables host halt poll */ smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1); } EXPORT_SYMBOL_GPL(arch_haltpoll_disable); #endif
6766 6786 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic Timer-queue * * Manages a simple queue of timers, ordered by expiration time. * Uses rbtrees for quick list adds and expiration. * * NOTE: All of the following functions need to be serialized * to avoid races. No locking is done by this library code. */ #include <linux/bug.h> #include <linux/timerqueue.h> #include <linux/rbtree.h> #include <linux/export.h> #define __node_2_tq(_n) \ rb_entry((_n), struct timerqueue_node, node) static inline bool __timerqueue_less(struct rb_node *a, const struct rb_node *b) { return __node_2_tq(a)->expires < __node_2_tq(b)->expires; } /** * timerqueue_add - Adds timer to timerqueue. * * @head: head of timerqueue * @node: timer node to be added * * Adds the timer node to the timerqueue, sorted by the node's expires * value. Returns true if the newly added timer is the first expiring timer in * the queue. */ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { /* Make sure we don't add nodes that are already added */ WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); return rb_add_cached(&node->node, &head->rb_root, __timerqueue_less); } EXPORT_SYMBOL_GPL(timerqueue_add); /** * timerqueue_del - Removes a timer from the timerqueue. * * @head: head of timerqueue * @node: timer node to be removed * * Removes the timer node from the timerqueue. Returns true if the queue is * not empty after the remove. 
*/ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); rb_erase_cached(&node->node, &head->rb_root); RB_CLEAR_NODE(&node->node); return !RB_EMPTY_ROOT(&head->rb_root.rb_root); } EXPORT_SYMBOL_GPL(timerqueue_del); /** * timerqueue_iterate_next - Returns the timer after the provided timer * * @node: Pointer to a timer. * * Provides the timer that is after the given node. This is used, when * necessary, to iterate through the list of timers in a timer list * without modifying the list. */ struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) { struct rb_node *next; if (!node) return NULL; next = rb_next(&node->node); if (!next) return NULL; return container_of(next, struct timerqueue_node, node); } EXPORT_SYMBOL_GPL(timerqueue_iterate_next); #define __node_2_tq_linked(_n) \ container_of(rb_entry((_n), struct rb_node_linked, node), struct timerqueue_linked_node, node) static __always_inline bool __tq_linked_less(struct rb_node *a, const struct rb_node *b) { return __node_2_tq_linked(a)->expires < __node_2_tq_linked(b)->expires; } bool timerqueue_linked_add(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node) { return rb_add_linked(&node->node, &head->rb_root, __tq_linked_less); } EXPORT_SYMBOL_GPL(timerqueue_linked_add);
3 6 6 2 5 5 5 1 4 11 11 9 8 8 4 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 // SPDX-License-Identifier: GPL-2.0-or-later /* * Glue Code for the AVX assembler implementation of the Cast5 Cipher * * Copyright (C) 2012 Johannes Goetzfried * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> */ #include <crypto/algapi.h> #include <crypto/cast5.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> #include <linux/types.h> #include "ecb_cbc_helpers.h" #define CAST5_PARALLEL_BLOCKS 16 asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return cast5_setkey(&tfm->base, key, keylen); } static int ecb_encrypt(struct skcipher_request *req) { ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_enc_16way); ECB_BLOCK(1, __cast5_encrypt); ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_dec_16way); ECB_BLOCK(1, __cast5_decrypt); ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { CBC_WALK_START(req, CAST5_BLOCK_SIZE, -1); CBC_ENC_BLOCK(__cast5_encrypt); CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { CBC_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); CBC_DEC_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_cbc_dec_16way); 
CBC_DEC_BLOCK(1, __cast5_decrypt); CBC_WALK_END(); } static struct skcipher_alg cast5_algs[] = { { .base.cra_name = "ecb(cast5)", .base.cra_driver_name = "ecb-cast5-avx", .base.cra_priority = 200, .base.cra_blocksize = CAST5_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast5_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CAST5_MIN_KEY_SIZE, .max_keysize = CAST5_MAX_KEY_SIZE, .setkey = cast5_setkey_skcipher, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base.cra_name = "cbc(cast5)", .base.cra_driver_name = "cbc-cast5-avx", .base.cra_priority = 200, .base.cra_blocksize = CAST5_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast5_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CAST5_MIN_KEY_SIZE, .max_keysize = CAST5_MAX_KEY_SIZE, .ivsize = CAST5_BLOCK_SIZE, .setkey = cast5_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, } }; static int __init cast5_init(void) { const char *feature_name; if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return crypto_register_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs)); } static void __exit cast5_exit(void) { crypto_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs)); } module_init(cast5_init); module_exit(cast5_exit); MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("cast5");
964 131 78 2140 123 2135 304 2070 123 304 1 304 88 88 1098 304 304 301 241 216 88 1976 126 76 21 96 19 15 4 303 109 1974 1975 362 1977 1976 964 205 103 107 22 34 3 17 1 192 52 44 383 86 21 1 22 188 186 46 46 46 2 1924 2000 1999 233 88 1097 15 11 4 1106 1104 221 1998 2000 248 11 249 248 349 66 64 72 53 253 187 46 6 189 3 13 9 10 37 21 28 39 156 108 11 6 44 72 44 365 46 46 259 120 120 1 108 72 72 3 8 58 78 6 44 75 6 383 383 381 6 6 379 383 365 389 389 389 363 382 1 383 314 314 314 313 314 104 57 7 161 314 220 78 17 2 3 22 305 58 10 96 249 203 69 69 69 44 1231 1230 1232 935 1232 893 60 193 1997 344 1676 1995 1193 2000 1999 73 249 248 246 22 249 25 69 15 54 86 63 150 329 329 134 284 301 19 180 241 288 287 131 130 131 243 314 1678 1232 1231 1231 1231 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 
339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 
839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 
1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/filter.h> #define verbose(env, fmt, args...) 
bpf_verifier_log_write(env, fmt, ##args) #define BPF_COMPLEXITY_LIMIT_STATES 64 static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx) { return bpf_is_may_goto_insn(&env->prog->insnsi[insn_idx]); } static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) { return env->insn_aux_data[insn_idx].is_iter_next; } static void update_peak_states(struct bpf_verifier_env *env) { u32 cur_states; cur_states = env->explored_states_size + env->free_list_size + env->num_backedges; env->peak_states = max(env->peak_states, cur_states); } /* struct bpf_verifier_state->parent refers to states * that are in either of env->{expored_states,free_list}. * In both cases the state is contained in struct bpf_verifier_state_list. */ static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st) { if (st->parent) return container_of(st->parent, struct bpf_verifier_state_list, state); return NULL; } static bool incomplete_read_marks(struct bpf_verifier_env *env, struct bpf_verifier_state *st); /* A state can be freed if it is no longer referenced: * - is in the env->free_list; * - has no children states; */ static void maybe_free_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state_list *sl) { if (!sl->in_free_list || sl->state.branches != 0 || incomplete_read_marks(env, &sl->state)) return; list_del(&sl->node); bpf_free_verifier_state(&sl->state, false); kfree(sl); env->free_list_size--; } /* For state @st look for a topmost frame with frame_insn_idx() in some SCC, * if such frame exists form a corresponding @callchain as an array of * call sites leading to this frame and SCC id. * E.g.: * * void foo() { A: loop {... SCC#1 ...}; } * void bar() { B: loop { C: foo(); ... SCC#2 ... } * D: loop { E: foo(); ... SCC#3 ... } } * void main() { F: bar(); } * * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending * on @st frame call sites being (F,C,A) or (F,E,A). 
*/ static bool compute_scc_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st, struct bpf_scc_callchain *callchain) { u32 i, scc, insn_idx; memset(callchain, 0, sizeof(*callchain)); for (i = 0; i <= st->curframe; i++) { insn_idx = bpf_frame_insn_idx(st, i); scc = env->insn_aux_data[insn_idx].scc; if (scc) { callchain->scc = scc; break; } else if (i < st->curframe) { callchain->callsites[i] = insn_idx; } else { return false; } } return true; } /* Check if bpf_scc_visit instance for @callchain exists. */ static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain) { struct bpf_scc_info *info = env->scc_info[callchain->scc]; struct bpf_scc_visit *visits = info->visits; u32 i; if (!info) return NULL; for (i = 0; i < info->num_visits; i++) if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0) return &visits[i]; return NULL; } /* Allocate a new bpf_scc_visit instance corresponding to @callchain. * Allocated instances are alive for a duration of the do_check_common() * call and are freed by free_states(). */ static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain) { struct bpf_scc_visit *visit; struct bpf_scc_info *info; u32 scc, num_visits; u64 new_sz; scc = callchain->scc; info = env->scc_info[scc]; num_visits = info ? 
info->num_visits : 0; new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1); info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT); if (!info) return NULL; env->scc_info[scc] = info; info->num_visits = num_visits + 1; visit = &info->visits[num_visits]; memset(visit, 0, sizeof(*visit)); memcpy(&visit->callchain, callchain, sizeof(*callchain)); return visit; } /* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */ static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain) { char *buf = env->tmp_str_buf; int i, delta = 0; delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "("); for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) { if (!callchain->callsites[i]) break; delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,", callchain->callsites[i]); } delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc); return env->tmp_str_buf; } /* If callchain for @st exists (@st is in some SCC), ensure that * bpf_scc_visit instance for this callchain exists. * If instance does not exist or is empty, assign visit->entry_state to @st. */ static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; if (!compute_scc_callchain(env, st, callchain)) return 0; visit = scc_visit_lookup(env, callchain); visit = visit ?: scc_visit_alloc(env, callchain); if (!visit) return -ENOMEM; if (!visit->entry_state) { visit->entry_state = st; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "SCC enter %s\n", format_callchain(env, callchain)); } return 0; } static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit); /* If callchain for @st exists (@st is in some SCC), make it empty: * - set visit->entry_state to NULL; * - flush accumulated backedges. 
*/ static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; if (!compute_scc_callchain(env, st, callchain)) return 0; visit = scc_visit_lookup(env, callchain); if (!visit) { /* * If path traversal stops inside an SCC, corresponding bpf_scc_visit * must exist for non-speculative paths. For non-speculative paths * traversal stops when: * a. Verification error is found, maybe_exit_scc() is not called. * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member * of any SCC. * c. A checkpoint is reached and matched. Checkpoints are created by * is_state_visited(), which calls maybe_enter_scc(), which allocates * bpf_scc_visit instances for checkpoints within SCCs. * (c) is the only case that can reach this point. */ if (!st->speculative) { verifier_bug(env, "scc exit: no visit info for call chain %s", format_callchain(env, callchain)); return -EFAULT; } return 0; } if (visit->entry_state != st) return 0; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "SCC exit %s\n", format_callchain(env, callchain)); visit->entry_state = NULL; env->num_backedges -= visit->num_backedges; visit->num_backedges = 0; update_peak_states(env); return propagate_backedges(env, visit); } /* Lookup an bpf_scc_visit instance corresponding to @st callchain * and add @backedge to visit->backedges. @st callchain must exist. 
*/ static int add_scc_backedge(struct bpf_verifier_env *env, struct bpf_verifier_state *st, struct bpf_scc_backedge *backedge) { struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; if (!compute_scc_callchain(env, st, callchain)) { verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d", st->insn_idx); return -EFAULT; } visit = scc_visit_lookup(env, callchain); if (!visit) { verifier_bug(env, "add backedge: no visit info for call chain %s", format_callchain(env, callchain)); return -EFAULT; } if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "SCC backedge %s\n", format_callchain(env, callchain)); backedge->next = visit->backedges; visit->backedges = backedge; visit->num_backedges++; env->num_backedges++; update_peak_states(env); return 0; } /* bpf_reg_state->live marks for registers in a state @st are incomplete, * if state @st is in some SCC and not all execution paths starting at this * SCC are fully explored. */ static bool incomplete_read_marks(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; if (!compute_scc_callchain(env, st, callchain)) return false; visit = scc_visit_lookup(env, callchain); if (!visit) return false; return !!visit->backedges; } int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_verifier_state_list *sl = NULL, *parent_sl; struct bpf_verifier_state *parent; int err; while (st) { u32 br = --st->branches; /* verifier_bug_if(br > 1, ...) 
technically makes sense here, * but see comment in push_stack(), hence: */ verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br); if (br) break; err = maybe_exit_scc(env, st); if (err) return err; parent = st->parent; parent_sl = state_parent_as_list(st); if (sl) maybe_free_verifier_state(env, sl); st = parent; sl = parent_sl; } return 0; } /* check %cur's range satisfies %old's */ static bool range_within(const struct bpf_reg_state *old, const struct bpf_reg_state *cur) { return old->umin_value <= cur->umin_value && old->umax_value >= cur->umax_value && old->smin_value <= cur->smin_value && old->smax_value >= cur->smax_value && old->u32_min_value <= cur->u32_min_value && old->u32_max_value >= cur->u32_max_value && old->s32_min_value <= cur->s32_min_value && old->s32_max_value >= cur->s32_max_value; } /* If in the old state two registers had the same id, then they need to have * the same id in the new state as well. But that id could be different from * the old state, so we need to track the mapping from old to new ids. * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent * regs with old id 5 must also have new id 9 for the new state to be safe. But * regs with a different old id could still have new id 9, we don't care about * that. * So we look through our idmap to see if this old id has been seen before. If * so, we require the new id to match; otherwise, we add the id pair to the map. 
*/ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) { struct bpf_id_pair *map = idmap->map; unsigned int i; /* either both IDs should be set or both should be zero */ if (!!old_id != !!cur_id) return false; if (old_id == 0) /* cur_id == 0 as well */ return true; for (i = 0; i < idmap->cnt; i++) { if (map[i].old == old_id) return map[i].cur == cur_id; if (map[i].cur == cur_id) return false; } /* Reached the end of known mappings; haven't seen this id before */ if (idmap->cnt < BPF_ID_MAP_SIZE) { map[idmap->cnt].old = old_id; map[idmap->cnt].cur = cur_id; idmap->cnt++; return true; } /* We ran out of idmap slots, which should be impossible */ WARN_ON_ONCE(1); return false; } /* * Compare scalar register IDs for state equivalence. * * When old_id == 0, the old register is independent - not linked to any * other register. Any linking in the current state only adds constraints, * making it more restrictive. Since the old state didn't rely on any ID * relationships for this register, it's always safe to accept cur regardless * of its ID. Hence, return true immediately. * * When old_id != 0 but cur_id == 0, we need to ensure that different * independent registers in cur don't incorrectly satisfy the ID matching * requirements of linked registers in old. * * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0 * and r7.id=0 (both independent), without temp IDs both would map old_id=X * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map * X->temp2, but X is already mapped to temp1, so the check fails correctly. * * When old_id has BPF_ADD_CONST set, the compound id (base | flag) and the * base id (flag stripped) must both map consistently. Example: old has * r2.id=A, r3.id=A|flag (r3 = r2 + delta), cur has r2.id=B, r3.id=C|flag * (r3 derived from unrelated r4). Without the base check, idmap gets two * independent entries A->B and A|flag->C|flag, missing that A->C conflicts * with A->B. 
The base ID cross-check catches this. */ static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) { if (!old_id) return true; cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen; if (!check_ids(old_id, cur_id, idmap)) return false; if (old_id & BPF_ADD_CONST) { old_id &= ~BPF_ADD_CONST; cur_id &= ~BPF_ADD_CONST; if (!check_ids(old_id, cur_id, idmap)) return false; } return true; } static void __clean_func_state(struct bpf_verifier_env *env, struct bpf_func_state *st, u16 live_regs, int frame) { int i, j; for (i = 0; i < BPF_REG_FP; i++) { /* liveness must not touch this register anymore */ if (!(live_regs & BIT(i))) /* since the register is unused, clear its state * to make further comparison simpler */ bpf_mark_reg_not_init(env, &st->regs[i]); } /* * Clean dead 4-byte halves within each SPI independently. * half_spi 2*i → lower half: slot_type[0..3] (closer to FP) * half_spi 2*i+1 → upper half: slot_type[4..7] (farther from FP) */ for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { bool lo_live = bpf_stack_slot_alive(env, frame, i * 2); bool hi_live = bpf_stack_slot_alive(env, frame, i * 2 + 1); if (!hi_live || !lo_live) { int start = !lo_live ? 0 : BPF_REG_SIZE / 2; int end = !hi_live ? BPF_REG_SIZE : BPF_REG_SIZE / 2; u8 stype = st->stack[i].slot_type[7]; /* * Don't clear special slots. * destroy_if_dynptr_stack_slot() needs STACK_DYNPTR to * detect overwrites and invalidate associated data slices. * is_iter_reg_valid_uninit() and is_irq_flag_reg_valid_uninit() * check for their respective slot types to detect double-create. */ if (stype == STACK_DYNPTR || stype == STACK_ITER || stype == STACK_IRQ_FLAG) continue; /* * Only destroy spilled_ptr when hi half is dead. * If hi half is still live with STACK_SPILL, the * spilled_ptr metadata is needed for correct state * comparison in stacksafe(). 
* is_spilled_reg() is using slot_type[7], but * is_spilled_scalar_after() check either slot_type[0] or [4] */ if (!hi_live) { struct bpf_reg_state *spill = &st->stack[i].spilled_ptr; if (lo_live && stype == STACK_SPILL) { u8 val = STACK_MISC; /* * 8 byte spill of scalar 0 where half slot is dead * should become STACK_ZERO in lo 4 bytes. */ if (bpf_register_is_null(spill)) val = STACK_ZERO; for (j = 0; j < 4; j++) { u8 *t = &st->stack[i].slot_type[j]; if (*t == STACK_SPILL) *t = val; } } bpf_mark_reg_not_init(env, spill); } for (j = start; j < end; j++) st->stack[i].slot_type[j] = STACK_POISON; } } } static int clean_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { int i, err; err = bpf_live_stack_query_init(env, st); if (err) return err; for (i = 0; i <= st->curframe; i++) { u32 ip = bpf_frame_insn_idx(st, i); u16 live_regs = env->insn_aux_data[ip].live_regs_before; __clean_func_state(env, st->frame[i], live_regs, i); } return 0; } static bool regs_exact(const struct bpf_reg_state *rold, const struct bpf_reg_state *rcur, struct bpf_idmap *idmap) { return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_ids(rold->id, rcur->id, idmap) && check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); } enum exact_level { NOT_EXACT, EXACT, RANGE_WITHIN }; /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct bpf_idmap *idmap, enum exact_level exact) { if (exact == EXACT) return regs_exact(rold, rcur, idmap); if (rold->type == NOT_INIT) /* explored state can't have used this */ return true; /* Enforce that register types have to match exactly, including their * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general * rule. 
* * One can make a point that using a pointer register as unbounded * SCALAR would be technically acceptable, but this could lead to * pointer leaks because scalars are allowed to leak while pointers * are not. We could make this safe in special cases if root is * calling us, but it's probably not worth the hassle. * * Also, register types that are *not* MAYBE_NULL could technically be * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point * to the same map). * However, if the old MAYBE_NULL register then got NULL checked, * doing so could have affected others with the same id, and we can't * check for that because we lost the id when we converted to * a non-MAYBE_NULL variant. * So, as a general rule we don't allow mixing MAYBE_NULL and * non-MAYBE_NULL registers as well. */ if (rold->type != rcur->type) return false; switch (base_type(rold->type)) { case SCALAR_VALUE: if (env->explore_alu_limits) { /* explore_alu_limits disables tnum_in() and range_within() * logic and requires everything to be strict */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_scalar_ids(rold->id, rcur->id, idmap); } if (!rold->precise && exact == NOT_EXACT) return true; /* * Linked register tracking uses rold->id to detect relationships. * When rold->id == 0, the register is independent and any linking * in rcur only adds constraints. When rold->id != 0, we must verify * id mapping and (for BPF_ADD_CONST) offset consistency. * * +------------------+-----------+------------------+---------------+ * | | rold->id | rold + ADD_CONST | rold->id == 0 | * |------------------+-----------+------------------+---------------| * | rcur->id | range,ids | false | range | * | rcur + ADD_CONST | false | range,ids,off | range | * | rcur->id == 0 | range,ids | false | range | * +------------------+-----------+------------------+---------------+ * * Why check_ids() for scalar registers? 
* * Consider the following BPF code: * 1: r6 = ... unbound scalar, ID=a ... * 2: r7 = ... unbound scalar, ID=b ... * 3: if (r6 > r7) goto +1 * 4: r6 = r7 * 5: if (r6 > X) goto ... * 6: ... memory operation using r7 ... * * First verification path is [1-6]: * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7; * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark * r7 <= X, because r6 and r7 share same id. * Next verification path is [1-4, 6]. * * Instruction (6) would be reached in two states: * I. r6{.id=b}, r7{.id=b} via path 1-6; * II. r6{.id=a}, r7{.id=b} via path 1-4, 6. * * Use check_ids() to distinguish these states. * --- * Also verify that new value satisfies old value range knowledge. */ /* * ADD_CONST flags must match exactly: BPF_ADD_CONST32 and * BPF_ADD_CONST64 have different linking semantics in * sync_linked_regs() (alu32 zero-extends, alu64 does not), * so pruning across different flag types is unsafe. */ if (rold->id && (rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST)) return false; /* Both have offset linkage: offsets must match */ if ((rold->id & BPF_ADD_CONST) && rold->delta != rcur->delta) return false; if (!check_scalar_ids(rold->id, rcur->id, idmap)) return false; return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: case PTR_TO_MEM: case PTR_TO_BUF: case PTR_TO_TP_BUFFER: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off) && check_ids(rold->id, rcur->id, idmap) && check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are * still safe. 
This is true even if old range < old off, * since someone could have accessed through (ptr - k), or * even done ptr -= k in a register, to get a safe access. */ if (rold->range < 0 || rcur->range < 0) { /* special case for [BEYOND|AT]_PKT_END */ if (rold->range != rcur->range) return false; } else if (rold->range > rcur->range) { return false; } /* id relations must be preserved */ if (!check_ids(rold->id, rcur->id, idmap)) return false; /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); case PTR_TO_STACK: /* two stack pointers are equal only if they're pointing to * the same stack frame, since fp-8 in foo != fp-8 in bar */ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; case PTR_TO_ARENA: return true; case PTR_TO_INSN: return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); default: return regs_exact(rold, rcur, idmap); } } static struct bpf_reg_state unbound_reg; static __init int unbound_reg_init(void) { bpf_mark_reg_unknown_imprecise(&unbound_reg); return 0; } late_initcall(unbound_reg_init); static bool is_spilled_scalar_after(const struct bpf_stack_state *stack, int im) { return stack->slot_type[im] == STACK_SPILL && stack->spilled_ptr.type == SCALAR_VALUE; } static bool is_stack_misc_after(struct bpf_verifier_env *env, struct bpf_stack_state *stack, int im) { u32 i; for (i = im; i < ARRAY_SIZE(stack->slot_type); ++i) { if ((stack->slot_type[i] == STACK_MISC) || ((stack->slot_type[i] == STACK_INVALID || stack->slot_type[i] == STACK_POISON) && env->allow_uninit_stack)) continue; return false; } return true; } static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env, struct bpf_stack_state *stack, int im) { if (is_spilled_scalar_after(stack, im)) return &stack->spilled_ptr; if (is_stack_misc_after(env, stack, im)) return &unbound_reg; return NULL; } static bool 
stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
	  struct bpf_func_state *cur, struct bpf_idmap *idmap,
	  enum exact_level exact)
{
	/*
	 * Compare the stack of the explored frame @old against the stack of
	 * the current frame @cur, byte by byte over old->allocated_stack.
	 * Returns true when every slot the old state relied on is matched by
	 * @cur under the rules below. @idmap is handed to regsafe() and
	 * check_ids() for id comparison. @exact selects the strictness: with
	 * EXACT the slot types must be identical (STACK_POISON is treated as
	 * STACK_INVALID) and @cur must have the slot allocated.
	 */
	int i, spi;

	/* walk slots of the explored stack and ignore any additional
	 * slots in the current stack, since explored(safe) state
	 * didn't use them
	 */
	for (i = 0; i < old->allocated_stack; i++) {
		struct bpf_reg_state *old_reg, *cur_reg;
		int im = i % BPF_REG_SIZE;

		spi = i / BPF_REG_SIZE;

		if (exact == EXACT) {
			u8 old_type = old->stack[spi].slot_type[i % BPF_REG_SIZE];
			u8 cur_type = i < cur->allocated_stack ?
				      cur->stack[spi].slot_type[i % BPF_REG_SIZE] : STACK_INVALID;

			/* STACK_INVALID and STACK_POISON are equivalent for pruning */
			if (old_type == STACK_POISON)
				old_type = STACK_INVALID;
			if (cur_type == STACK_POISON)
				cur_type = STACK_INVALID;
			if (i >= cur->allocated_stack || old_type != cur_type)
				return false;
		}

		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID ||
		    old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_POISON)
			continue;

		if (env->allow_uninit_stack &&
		    old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
			continue;

		/* explored stack has more populated slots than current stack
		 * and these slots were used
		 */
		if (i >= cur->allocated_stack)
			return false;

		/*
		 * 64 and 32-bit scalar spills vs MISC/INVALID slots and vice versa.
		 * Load from MISC/INVALID slots produces unbound scalar.
		 * Construct a fake register for such stack and call
		 * regsafe() to ensure scalar ids are compared.
		 */
		if (im == 0 || im == 4) {
			old_reg = scalar_reg_for_stack(env, &old->stack[spi], im);
			cur_reg = scalar_reg_for_stack(env, &cur->stack[spi], im);
			if (old_reg && cur_reg) {
				if (!regsafe(env, old_reg, cur_reg, idmap, exact))
					return false;
				/* skip the rest of the bytes just compared;
				 * the loop's i++ then moves to the next region
				 */
				i += (im == 0 ? BPF_REG_SIZE - 1 : 3);
				continue;
			}
		}

		/* if old state was safe with misc data in the stack
		 * it will be safe with zero-initialized stack.
		 * The opposite is not true
		 */
		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
		    cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
			continue;
		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
			/* Ex: old explored (safe) state has STACK_SPILL in
			 * this stack slot, but current has STACK_MISC ->
			 * these verifier states are not equivalent,
			 * return false to continue verification of this path
			 */
			return false;
		/* the per-slot checks below run once, on the slot's last byte */
		if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
			continue;
		/* Both old and cur are having same slot_type */
		switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
		case STACK_SPILL:
			/* when explored and current stack slot are both storing
			 * spilled registers, check that stored pointers types
			 * are the same as well.
			 * Ex: explored safe path could have stored
			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
			 * but current path has stored:
			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
			 * such verifier states are not equivalent.
			 * return false to continue verification of this path
			 */
			if (!regsafe(env, &old->stack[spi].spilled_ptr,
				     &cur->stack[spi].spilled_ptr, idmap, exact))
				return false;
			break;
		case STACK_DYNPTR:
			old_reg = &old->stack[spi].spilled_ptr;
			cur_reg = &cur->stack[spi].spilled_ptr;
			if (old_reg->dynptr.type != cur_reg->dynptr.type ||
			    old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
				return false;
			break;
		case STACK_ITER:
			old_reg = &old->stack[spi].spilled_ptr;
			cur_reg = &cur->stack[spi].spilled_ptr;
			/* iter.depth is not compared between states as it
			 * doesn't matter for correctness and would otherwise
			 * prevent convergence; we maintain it only to prevent
			 * infinite loop check triggering, see
			 * iter_active_depths_differ()
			 */
			if (old_reg->iter.btf != cur_reg->iter.btf ||
			    old_reg->iter.btf_id != cur_reg->iter.btf_id ||
			    old_reg->iter.state != cur_reg->iter.state ||
			    /* ignore {old_reg,cur_reg}->iter.depth, see above */
			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
				return false;
			break;
		case STACK_IRQ_FLAG:
			old_reg = &old->stack[spi].spilled_ptr;
			cur_reg = &cur->stack[spi].spilled_ptr;
			if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) ||
			    old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class)
				return false;
			break;
		case STACK_MISC:
		case STACK_ZERO:
		case STACK_INVALID:
		case STACK_POISON:
			continue;
		/* Ensure that new unhandled slot types return false by default */
		default:
			return false;
		}
	}
	return true;
}

/*
 * Compare the reference/lock bookkeeping of two verifier states.
 *
 * Returns true only if the counters (acquired_refs, active_locks,
 * active_preempt_locks, active_rcu_locks) are equal, the active irq and lock
 * ids map onto each other through @idmap, active_lock_ptr is identical, and
 * every acquired reference has a matching id (via check_ids()), the same type
 * and, for the lock-like reference types, the same ptr.
 */
static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur,
		    struct bpf_idmap *idmap)
{
	int i;

	if (old->acquired_refs != cur->acquired_refs)
		return false;

	if (old->active_locks != cur->active_locks)
		return false;

	if (old->active_preempt_locks != cur->active_preempt_locks)
		return false;

	if (old->active_rcu_locks != cur->active_rcu_locks)
		return false;

	if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
		return false;

	if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) ||
	    old->active_lock_ptr != cur->active_lock_ptr)
		return false;

	for (i = 0; i < old->acquired_refs; i++) {
		if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
		    old->refs[i].type != cur->refs[i].type)
			return false;
		switch (old->refs[i].type) {
		case REF_TYPE_PTR:
		case REF_TYPE_IRQ:
			/* id and type already compared above, nothing else to check */
			break;
		case REF_TYPE_LOCK:
		case REF_TYPE_RES_LOCK:
		case REF_TYPE_RES_LOCK_IRQ:
			if (old->refs[i].ptr != cur->refs[i].ptr)
				return false;
			break;
		default:
			WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type);
			return false;
		}
	}

	return true;
}

/* compare two verifier states
 *
 * all states stored in state_list are known to be valid, since
 * verifier reached 'bpf_exit' instruction through them
 *
 * this function is called when verifier exploring different branches of
 * execution popped from the state stack. If it sees an old state that has
 * more strict register state and more strict stack state then this execution
 * branch doesn't need to be explored further, since verifier already
 * concluded that more strict state leads to valid finish.
 *
 * Therefore two states are equivalent if register state is more conservative
 * and explored stack state is more conservative than the current one.
 * Example:
 *       explored                   current
 * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
 * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
 *
 * In other words if current stack state (one being explored) has more
 * valid slots than old one that already passed validation, it means
 * the verifier can stop exploring and conclude that current state is valid too
 *
 * Similarly with registers. If explored state has register type as invalid
 * whereas register type in current state is meaningful, it means that
 * the current state will reach 'bpf_exit' instruction safely
 *
 * Only registers whose bit is set in live_regs_before for @insn_idx are
 * compared; the stack is compared by stacksafe(). Both use
 * env->idmap_scratch for id comparison.
 */
static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
			      struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact)
{
	u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before;
	u16 i;

	if (old->callback_depth > cur->callback_depth)
		return false;

	for (i = 0; i < MAX_BPF_REG; i++)
		if (((1 << i) & live_regs) &&
		    !regsafe(env, &old->regs[i], &cur->regs[i],
			     &env->idmap_scratch, exact))
			return false;

	if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
		return false;

	return true;
}

/*
 * Reset env->idmap_scratch before a new comparison: the temporary id
 * generator is set to env->id_gen and the entry count is cleared.
 */
static void reset_idmap_scratch(struct bpf_verifier_env *env)
{
	struct bpf_idmap *idmap = &env->idmap_scratch;

	idmap->tmp_id_gen = env->id_gen;
	idmap->cnt = 0;
}

/*
 * Compare explored state @old with current state @cur at strictness @exact.
 *
 * Returns false if the frame depth, in_sleepable flag, reference state
 * (refsafe()) or any frame's callsite differs, if @old is speculative while
 * @cur is not, or if any frame fails func_states_equal(). The idmap scratch
 * area is reset first, so the same id mapping is used across refsafe() and
 * all frames of this one comparison.
 */
static bool states_equal(struct bpf_verifier_env *env,
			 struct bpf_verifier_state *old,
			 struct bpf_verifier_state *cur,
			 enum exact_level exact)
{
	u32 insn_idx;
	int i;

	if (old->curframe != cur->curframe)
		return false;

	reset_idmap_scratch(env);

	/* Verification state from speculative execution simulation
	 * must never prune a non-speculative execution one.
	 */
	if (old->speculative && !cur->speculative)
		return false;

	if (old->in_sleepable != cur->in_sleepable)
		return false;

	if (!refsafe(old, cur, &env->idmap_scratch))
		return false;

	/* for states to be equal callsites have to be the same
	 * and all frame states need to be equivalent
	 */
	for (i = 0; i <= old->curframe; i++) {
		insn_idx = bpf_frame_insn_idx(old, i);
		if (old->frame[i]->callsite != cur->frame[i]->callsite)
			return false;
		if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact))
			return false;
	}
	return true;
}

/* find precise scalars in the previous equivalent state and
 * propagate them into the current state
 *
 * Every precise SCALAR_VALUE register (r0 .. BPF_REG_FP - 1) and every
 * precise spilled scalar in each frame of @old is recorded in env->bt, then
 * bpf_mark_chain_precision() is invoked on @cur with @changed passed
 * through. Returns 0 on success or the negative error from
 * bpf_mark_chain_precision().
 */
static int propagate_precision(struct bpf_verifier_env *env,
			       const struct bpf_verifier_state *old,
			       struct bpf_verifier_state *cur,
			       bool *changed)
{
	struct bpf_reg_state *state_reg;
	struct bpf_func_state *state;
	int i, err = 0, fr;
	bool first;

	for (fr = old->curframe; fr >= 0; fr--) {
		state = old->frame[fr];
		state_reg = state->regs;
		/* 'first' only controls how the level-2 log line is formatted */
		first = true;
		for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
			if (state_reg->type != SCALAR_VALUE ||
			    !state_reg->precise)
				continue;
			if (env->log.level & BPF_LOG_LEVEL2) {
				if (first)
					verbose(env, "frame %d: propagating r%d", fr, i);
				else
					verbose(env, ",r%d", i);
			}
			bpf_bt_set_frame_reg(&env->bt, fr, i);
			first = false;
		}

		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
			if (!bpf_is_spilled_reg(&state->stack[i]))
				continue;
			state_reg = &state->stack[i].spilled_ptr;
			if (state_reg->type != SCALAR_VALUE ||
			    !state_reg->precise)
				continue;
			if (env->log.level & BPF_LOG_LEVEL2) {
				if (first)
					verbose(env, "frame %d: propagating fp%d",
						fr, (-i - 1) * BPF_REG_SIZE);
				else
					verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
			}
			bpf_bt_set_frame_slot(&env->bt, fr, i);
			first = false;
		}
		if (!first && (env->log.level & BPF_LOG_LEVEL2))
			verbose(env, "\n");
	}

	err = bpf_mark_chain_precision(env, cur, -1, changed);
	if (err < 0)
		return err;

	return 0;
}

/* Upper bound on the fixed-point passes made by propagate_backedges() */
#define MAX_BACKEDGE_ITERS 64

/* Propagate read and precision marks from visit->backedges[*].state->equal_state
 * to corresponding parent states of visit->backedges[*].state until fixed point is reached,
 * then free visit->backedges.
 * After execution of this function incomplete_read_marks() will return false
 * for all states corresponding to @visit->callchain.
 *
 * If the pass counter exceeds MAX_BACKEDGE_ITERS, all scalars in every
 * backedge state are marked precise instead and iteration stops.
 * Returns 0, or the error returned by propagate_precision().
 */
static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit)
{
	struct bpf_scc_backedge *backedge;
	struct bpf_verifier_state *st;
	bool changed;
	int i, err;

	i = 0;
	do {
		if (i++ > MAX_BACKEDGE_ITERS) {
			if (env->log.level & BPF_LOG_LEVEL2)
				verbose(env, "%s: too many iterations\n", __func__);
			for (backedge = visit->backedges; backedge; backedge = backedge->next)
				bpf_mark_all_scalars_precise(env, &backedge->state);
			break;
		}
		changed = false;
		for (backedge = visit->backedges; backedge; backedge = backedge->next) {
			st = &backedge->state;
			err = propagate_precision(env, st->equal_state, st, &changed);
			if (err)
				return err;
		}
	} while (changed);

	bpf_free_backedges(visit);
	return 0;
}

/*
 * Cheap pre-filter for the infinite loop check: returns true when both
 * states are in the same frame number and, for every register of that frame,
 * the bytes of struct bpf_reg_state preceding the 'frameno' member are
 * identical (plain memcmp, no id remapping).
 */
static bool states_maybe_looping(struct bpf_verifier_state *old,
				 struct bpf_verifier_state *cur)
{
	struct bpf_func_state *fold, *fcur;
	int i, fr = cur->curframe;

	if (old->curframe != fr)
		return false;

	fold = old->frame[fr];
	fcur = cur->frame[fr];
	for (i = 0; i < MAX_BPF_REG; i++)
		if (memcmp(&fold->regs[i], &fcur->regs[i],
			   offsetof(struct bpf_reg_state, frameno)))
			return false;
	return true;
}

/* is_state_visited() handles iter_next() (see process_iter_next_call() for
 * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
 * states to match, which otherwise would look like an infinite loop. So while
 * iter_next() calls are taken care of, we still need to be careful and
 * prevent erroneous and too eager declaration of "infinite loop", when
 * iterators are involved.
 *
 * Here's a situation in pseudo-BPF assembly form:
 *
 *   0: again:                          ; set up iter_next() call args
 *   1:   r1 = &it                      ; <CHECKPOINT HERE>
 *   2:   call bpf_iter_num_next        ; this is iter_next() call
 *   3:   if r0 == 0 goto done
 *   4:   ... something useful here ...
 *   5:   goto again                    ; another iteration
 *   6: done:
 *   7:   r1 = &it
 *   8:   call bpf_iter_num_destroy     ; clean up iter state
 *   9:   exit
 *
 * This is a typical loop. Let's assume that we have a prune point at 1:,
 * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
 * again`, assuming other heuristics don't get in a way).
 *
 * When we first time come to 1:, let's say we have some state X. We proceed
 * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
 * Now we come back to validate that forked ACTIVE state. We proceed through
 * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
 * are converging. But the problem is that we don't know that yet, as this
 * convergence has to happen at iter_next() call site only. So if nothing is
 * done, at 1: verifier will use bounded loop logic and declare infinite
 * looping (and would be *technically* correct, if not for iterator's
 * "eventual sticky NULL" contract, see process_iter_next_call()). But we
 * don't want that. So what we do in process_iter_next_call() when we go on
 * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
 * a different iteration. So when we suspect an infinite loop, we additionally
 * check if any of the *ACTIVE* iterator states depths differ. If yes, we
 * pretend we are not looping and wait for next iter_next() call.
 *
 * This only applies to ACTIVE state. In DRAINED state we don't expect to
 * loop, because that would actually mean infinite loop, as DRAINED state is
 * "sticky", and so we'll keep returning into the same instruction with the
 * same state (at least in one of possible code paths).
 *
 * This approach allows to keep infinite loop heuristic even in the face of
 * active iterator. E.g., C snippet below is and will be detected as
 * infinitely looping:
 *
 *   struct bpf_iter_num it;
 *   int *p, x;
 *
 *   bpf_iter_num_new(&it, 0, 10);
 *   while ((p = bpf_iter_num_next(&it))) {
 *       x = *p;
 *       while (x--) {} // <<-- infinite loop here
 *   }
 *
 */
static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
{
	struct bpf_reg_state *slot, *cur_slot;
	struct bpf_func_state *state;
	int i, fr;

	for (fr = old->curframe; fr >= 0; fr--) {
		state = old->frame[fr];
		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
			if (state->stack[i].slot_type[0] != STACK_ITER)
				continue;

			slot = &state->stack[i].spilled_ptr;
			if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
				continue;

			/* same frame and slot index in the current state */
			cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
			if (cur_slot->iter.depth != slot->iter.depth)
				return true;
		}
	}
	return false;
}

/*
 * Clear the 'precise' flag of every SCALAR_VALUE register
 * (r0 .. BPF_REG_FP - 1) and of every spilled SCALAR_VALUE stack slot, in
 * all frames of @st up to and including st->curframe.
 */
static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
	struct bpf_func_state *func;
	struct bpf_reg_state *reg;
	int i, j;

	for (i = 0; i <= st->curframe; i++) {
		func = st->frame[i];
		for (j = 0; j < BPF_REG_FP; j++) {
			reg = &func->regs[j];
			if (reg->type != SCALAR_VALUE)
				continue;
			reg->precise = false;
		}
		for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
			if (!bpf_is_spilled_reg(&func->stack[j]))
				continue;
			reg = &func->stack[j].spilled_ptr;
			if (reg->type != SCALAR_VALUE)
				continue;
			reg->precise = false;
		}
	}
}

/*
 * Search the explored states recorded for @insn_idx for one that matches
 * env->cur_state, and otherwise possibly record the current state.
 *
 * Return:
 *   1   - a recorded state matched (the "hit" path below); per the
 *         func_states_equal() commentary the current branch then does not
 *         need to be explored further;
 *   0   - no match; a copy of the current state was added to the explored
 *         list unless the heuristics below decided not to add one;
 *   < 0 - -EINVAL when an infinite loop is detected, -ENOMEM on allocation
 *         failure, or an error returned by one of the helpers called here.
 */
int bpf_is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
	struct bpf_verifier_state_list *new_sl;
	struct bpf_verifier_state_list *sl;
	struct bpf_verifier_state *cur = env->cur_state, *new;
	bool force_new_state, add_new_state, loop;
	int n, err, states_cnt = 0;
	struct list_head *pos, *tmp, *head;

	force_new_state = env->test_state_freq || bpf_is_force_checkpoint(env, insn_idx) ||
			  /* Avoid accumulating infinitely long jmp history */
			  cur->jmp_history_cnt > 40;

	/* bpf progs typically have pruning point every 4 instructions
	 * http://vger.kernel.org/bpfconf2019.html#session-1
	 * Do not add new state for future pruning if the verifier hasn't seen
	 * at least 2 jumps and at least 8 instructions.
	 * This heuristic helps decrease 'total_states' and 'peak_states' metric.
	 * In tests that amounts to up to 50% reduction into total verifier
	 * memory consumption and 20% verifier time speedup.
	 */
	add_new_state = force_new_state;
	if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
	    env->insn_processed - env->prev_insn_processed >= 8)
		add_new_state = true;

	/* keep cleaning the current state as registers/stack become dead */
	err = clean_verifier_state(env, cur);
	if (err)
		return err;

	loop = false;
	head = bpf_explored_state(env, insn_idx);
	list_for_each_safe(pos, tmp, head) {
		sl = container_of(pos, struct bpf_verifier_state_list, node);

		states_cnt++;

		if (sl->state.insn_idx != insn_idx)
			continue;

		if (sl->state.branches) {
			struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];

			if (frame->in_async_callback_fn &&
			    frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
				/* Different async_entry_cnt means that the verifier is
				 * processing another entry into async callback.
				 * Seeing the same state is not an indication of infinite
				 * loop or infinite recursion.
				 * But finding the same state doesn't mean that it's safe
				 * to stop processing the current state. The previous state
				 * hasn't yet reached bpf_exit, since state.branches > 0.
				 * Checking in_async_callback_fn alone is not enough either.
				 * Since the verifier still needs to catch infinite loops
				 * inside async callbacks.
				 */
				goto skip_inf_loop_check;
			}
			/* BPF open-coded iterators loop detection is special.
			 * states_maybe_looping() logic is too simplistic in detecting
			 * states that *might* be equivalent, because it doesn't know
			 * about ID remapping, so don't even perform it.
			 * See process_iter_next_call() and iter_active_depths_differ()
			 * for overview of the logic. When current and one of parent
			 * states are detected as equivalent, it's a good thing: we prove
			 * convergence and can stop simulating further iterations.
			 * It's safe to assume that iterator loop will finish, taking into
			 * account iter_next() contract of eventually returning
			 * sticky NULL result.
			 *
			 * Note, that states have to be compared exactly in this case because
			 * read and precision marks might not be finalized inside the loop.
			 * E.g. as in the program below:
			 *
			 *     1. r7 = -16
			 *     2. r6 = bpf_get_prandom_u32()
			 *     3. while (bpf_iter_num_next(&fp[-8])) {
			 *     4.   if (r6 != 42) {
			 *     5.     r7 = -32
			 *     6.     r6 = bpf_get_prandom_u32()
			 *     7.     continue
			 *     8.   }
			 *     9.   r0 = r10
			 *    10.   r0 += r7
			 *    11.   r8 = *(u64 *)(r0 + 0)
			 *    12.   r6 = bpf_get_prandom_u32()
			 *    13. }
			 *
			 * Here verifier would first visit path 1-3, create a checkpoint at 3
			 * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
			 * not have read or precision mark for r7 yet, thus inexact states
			 * comparison would discard current state with r7=-32
			 * => unsafe memory access at 11 would not be caught.
			 */
			if (is_iter_next_insn(env, insn_idx)) {
				if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
					struct bpf_func_state *cur_frame;
					struct bpf_reg_state *iter_state, *iter_reg;
					int spi;

					cur_frame = cur->frame[cur->curframe];
					/* btf_check_iter_kfuncs() enforces that
					 * iter state pointer is always the first arg
					 */
					iter_reg = &cur_frame->regs[BPF_REG_1];
					/* current state is valid due to states_equal(),
					 * so we can assume valid iter and reg state,
					 * no need for extra (re-)validations
					 */
					spi = bpf_get_spi(iter_reg->var_off.value);
					iter_state = &bpf_func(env, iter_reg)->stack[spi].spilled_ptr;
					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
						loop = true;
						goto hit;
					}
				}
				goto skip_inf_loop_check;
			}
			if (is_may_goto_insn_at(env, insn_idx)) {
				if (sl->state.may_goto_depth != cur->may_goto_depth &&
				    states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
					loop = true;
					goto hit;
				}
			}
			if (bpf_calls_callback(env, insn_idx)) {
				if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
					loop = true;
					goto hit;
				}
				goto skip_inf_loop_check;
			}
			/* attempt to detect infinite loop to avoid unnecessary doomed work */
			if (states_maybe_looping(&sl->state, cur) &&
			    states_equal(env, &sl->state, cur, EXACT) &&
			    !iter_active_depths_differ(&sl->state, cur) &&
			    sl->state.may_goto_depth == cur->may_goto_depth &&
			    sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
				verbose_linfo(env, insn_idx, "; ");
				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
				verbose(env, "cur state:");
				print_verifier_state(env, cur, cur->curframe, true);
				verbose(env, "old state:");
				print_verifier_state(env, &sl->state, cur->curframe, true);
				return -EINVAL;
			}
			/* if the verifier is processing a loop, avoid adding new state
			 * too often, since different loop iterations have distinct
			 * states and may not help future pruning.
			 * This threshold shouldn't be too low to make sure that
			 * a loop with large bound will be rejected quickly.
			 * The most abusive loop will be:
			 * r1 += 1
			 * if r1 < 1000000 goto pc-2
			 * 1M insn_processed limit / 100 == 10k peak states.
			 * This threshold shouldn't be too high either, since states
			 * at the end of the loop are likely to be useful in pruning.
			 */
skip_inf_loop_check:
			if (!force_new_state &&
			    env->jmps_processed - env->prev_jmps_processed < 20 &&
			    env->insn_processed - env->prev_insn_processed < 100)
				add_new_state = false;
			goto miss;
		}
		/* See comments for mark_all_regs_read_and_precise() */
		loop = incomplete_read_marks(env, &sl->state);
		if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) {
hit:
			sl->hit_cnt++;

			/* if previous state reached the exit with precision and
			 * current state is equivalent to it (except precision marks)
			 * the precision needs to be propagated back in
			 * the current state.
			 */
			err = 0;
			if (bpf_is_jmp_point(env, env->insn_idx))
				err = bpf_push_jmp_history(env, cur, 0, 0);
			err = err ? : propagate_precision(env, &sl->state, cur, NULL);
			if (err)
				return err;
			/* When processing iterator based loops above propagate_liveness and
			 * propagate_precision calls are not sufficient to transfer all relevant
			 * read and precision marks. E.g. consider the following case:
			 *
			 *  .-> A --.  Assume the states are visited in the order A, B, C.
			 *  |   |   |  Assume that state B reaches a state equivalent to state A.
			 *  |   v   v  At this point, state C is not processed yet, so state A
			 *  '-- B   C  has not received any read or precision marks from C.
			 *             Thus, marks propagated from A to B are incomplete.
			 *
			 * The verifier mitigates this by performing the following steps:
			 *
			 * - Prior to the main verification pass, strongly connected components
			 *   (SCCs) are computed over the program's control flow graph,
			 *   intraprocedurally.
			 *
			 * - During the main verification pass, `maybe_enter_scc()` checks
			 *   whether the current verifier state is entering an SCC. If so, an
			 *   instance of a `bpf_scc_visit` object is created, and the state
			 *   entering the SCC is recorded as the entry state.
			 *
			 * - This instance is associated not with the SCC itself, but with a
			 *   `bpf_scc_callchain`: a tuple consisting of the call sites leading to
			 *   the SCC and the SCC id. See `compute_scc_callchain()`.
			 *
			 * - When a verification path encounters a `states_equal(...,
			 *   RANGE_WITHIN)` condition, there exists a call chain describing the
			 *   current state and a corresponding `bpf_scc_visit` instance. A copy
			 *   of the current state is created and added to
			 *   `bpf_scc_visit->backedges`.
			 *
			 * - When a verification path terminates, `maybe_exit_scc()` is called
			 *   from `bpf_update_branch_counts()`. For states with `branches == 0`, it
			 *   checks whether the state is the entry state of any `bpf_scc_visit`
			 *   instance. If it is, this indicates that all paths originating from
			 *   this SCC visit have been explored. `propagate_backedges()` is then
			 *   called, which propagates read and precision marks through the
			 *   backedges until a fixed point is reached.
			 *   (In the earlier example, this would propagate marks from A to B,
			 *    from C to A, and then again from A to B.)
			 *
			 * A note on callchains
			 * --------------------
			 *
			 * Consider the following example:
			 *
			 *     void foo() { loop { ... SCC#1 ... } }
			 *     void main() {
			 *       A: foo();
			 *       B: ...
			 *       C: foo();
			 *     }
			 *
			 * Here, there are two distinct callchains leading to SCC#1:
			 * - (A, SCC#1)
			 * - (C, SCC#1)
			 *
			 * Each callchain identifies a separate `bpf_scc_visit` instance that
			 * accumulates backedge states. The `propagate_{liveness,precision}()`
			 * functions traverse the parent state of each backedge state, which
			 * means these parent states must remain valid (i.e., not freed) while
			 * the corresponding `bpf_scc_visit` instance exists.
			 *
			 * Associating `bpf_scc_visit` instances directly with SCCs instead of
			 * callchains would break this invariant:
			 * - States explored during `C: foo()` would contribute backedges to
			 *   SCC#1, but SCC#1 would only be exited once the exploration of
			 *   `A: foo()` completes.
			 * - By that time, the states explored between `A: foo()` and `C: foo()`
			 *   (i.e., `B: ...`) may have already been freed, causing the parent
			 *   links for states from `C: foo()` to become invalid.
			 */
			if (loop) {
				struct bpf_scc_backedge *backedge;

				backedge = kzalloc_obj(*backedge, GFP_KERNEL_ACCOUNT);
				if (!backedge)
					return -ENOMEM;
				err = bpf_copy_verifier_state(&backedge->state, cur);
				backedge->state.equal_state = &sl->state;
				backedge->state.insn_idx = insn_idx;
				/* only register the backedge if the copy succeeded */
				err = err ?: add_scc_backedge(env, &sl->state, backedge);
				if (err) {
					bpf_free_verifier_state(&backedge->state, false);
					kfree(backedge);
					return err;
				}
			}
			return 1;
		}
miss:
		/* when new state is not going to be added do not increase miss count.
		 * Otherwise several loop iterations will remove the state
		 * recorded earlier. The goal of these heuristics is to have
		 * states from some iterations of the loop (some in the beginning
		 * and some at the end) to help pruning.
		 */
		if (add_new_state)
			sl->miss_cnt++;
		/* heuristic to determine whether this state is beneficial
		 * to keep checking from state equivalence point of view.
		 * Higher numbers increase max_states_per_insn and verification time,
		 * but do not meaningfully decrease insn_processed.
		 * 'n' controls how many times state could miss before eviction.
		 * Use bigger 'n' for checkpoints because evicting checkpoint states
		 * too early would hinder iterator convergence.
		 */
		n = bpf_is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
		if (sl->miss_cnt > sl->hit_cnt * n + n) {
			/* the state is unlikely to be useful. Remove it to
			 * speed up verification
			 */
			sl->in_free_list = true;
			list_del(&sl->node);
			list_add(&sl->node, &env->free_list);
			env->free_list_size++;
			env->explored_states_size--;
			maybe_free_verifier_state(env, sl);
		}
	}

	if (env->max_states_per_insn < states_cnt)
		env->max_states_per_insn = states_cnt;

	if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
		return 0;

	if (!add_new_state)
		return 0;

	/* There were no equivalent states, remember the current one.
	 * Technically the current state is not proven to be safe yet,
	 * but it will either reach outer most bpf_exit (which means it's safe)
	 * or it will be rejected. When there are no loops the verifier won't be
	 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
	 * again on the way to bpf_exit.
	 * When looping the sl->state.branches will be > 0 and this state
	 * will not be considered for equivalence until branches == 0.
	 */
	new_sl = kzalloc_obj(struct bpf_verifier_state_list, GFP_KERNEL_ACCOUNT);
	if (!new_sl)
		return -ENOMEM;
	env->total_states++;
	env->explored_states_size++;
	update_peak_states(env);
	env->prev_jmps_processed = env->jmps_processed;
	env->prev_insn_processed = env->insn_processed;

	/* forget precise markings we inherited, see __mark_chain_precision */
	if (env->bpf_capable)
		mark_all_scalars_imprecise(env, cur);

	bpf_clear_singular_ids(env, cur);

	/* add new state to the head of linked list */
	new = &new_sl->state;
	err = bpf_copy_verifier_state(new, cur);
	if (err) {
		bpf_free_verifier_state(new, false);
		kfree(new_sl);
		return err;
	}
	new->insn_idx = insn_idx;
	verifier_bug_if(new->branches != 1, env,
			"%s:branches_to_explore=%d insn %d",
			__func__, new->branches, insn_idx);
	err = maybe_enter_scc(env, new);
	if (err) {
		bpf_free_verifier_state(new, false);
		kfree(new_sl);
		return err;
	}

	/* the recorded copy becomes the parent of the state still being explored */
	cur->parent = new;
	cur->first_insn_idx = insn_idx;
	cur->dfs_depth = new->dfs_depth + 1;
	bpf_clear_jmp_history(cur);
	list_add(&new_sl->node, head);
	return 0;
}
920 937 201 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_NS_H
#define _LINUX_PID_NS_H

#include <linux/sched.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/threads.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/idr.h>

/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32

/* only used through a pointer below, so a forward declaration suffices */
struct fs_pin;

#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
/* modes for vm.memfd_noexec sysctl */
#define MEMFD_NOEXEC_SCOPE_EXEC			0 /* MFD_EXEC implied if unset */
#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL		1 /* MFD_NOEXEC_SEAL implied if unset */
#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED	2 /* same as 1, except MFD_EXEC rejected */
#endif

/*
 * Per PID namespace state. Namespaces are linked to their ancestor through
 * ->parent (walked by pidns_memfd_noexec_scope() below). The sysctl related
 * members exist only with CONFIG_SYSCTL, memfd_noexec_scope additionally
 * needs CONFIG_MEMFD_CREATE, and bacct needs CONFIG_BSD_PROCESS_ACCT.
 * Member order must not be relied upon: the layout is randomized.
 */
struct pid_namespace {
	struct idr idr;
	struct rcu_head rcu;
	unsigned int pid_allocated;
#ifdef CONFIG_SYSCTL
#if defined(CONFIG_MEMFD_CREATE)
	int memfd_noexec_scope;	/* one of the MEMFD_NOEXEC_SCOPE_* modes */
#endif
	struct ctl_table_set	set;
	struct ctl_table_header *sysctls;
#endif
	struct task_struct *child_reaper;
	struct kmem_cache *pid_cachep;
	unsigned int level;
	int pid_max;
	struct pid_namespace *parent;
#ifdef CONFIG_BSD_PROCESS_ACCT
	struct fs_pin *bacct;
#endif
	struct user_namespace *user_ns;
	struct ucounts *ucounts;
	int reboot;	/* group exit code if this pidns was rebooted */
	struct ns_common ns;	/* embedded; see to_pid_ns() */
	struct work_struct work;
} __randomize_layout;

extern struct pid_namespace init_pid_ns;

#define PIDNS_ADDING (1U << 31)

#ifdef CONFIG_PID_NS
/* Map an embedded ns_common back to the pid_namespace that contains it. */
static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
{
	return container_of(ns, struct pid_namespace, ns);
}

/* Take a reference on @ns via ns_ref_inc() and return @ns. */
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
	ns_ref_inc(ns);
	return ns;
}

#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
/*
 * Return the largest memfd_noexec_scope found on the path from @ns up
 * through its ->parent chain, starting from MEMFD_NOEXEC_SCOPE_EXEC.
 * A NULL @ns yields MEMFD_NOEXEC_SCOPE_EXEC.
 */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
	int scope = MEMFD_NOEXEC_SCOPE_EXEC;

	for (; ns; ns = ns->parent)
		scope = max(scope, READ_ONCE(ns->memfd_noexec_scope));

	return scope;
}
#else
/* Without the sysctl or memfd_create support the scope is always 0. */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
	return 0;
}
#endif

extern struct pid_namespace *copy_pid_ns(u64 flags,
	struct user_namespace *user_ns, struct pid_namespace *ns);
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
extern void put_pid_ns(struct pid_namespace *ns);

extern bool pidns_is_ancestor(struct pid_namespace *child,
			      struct pid_namespace *ancestor);

#else /* !CONFIG_PID_NS */
#include <linux/err.h>

/* The functions below are the stubs used when CONFIG_PID_NS is disabled. */

/* No reference counting: returns @ns unchanged. */
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
	return ns;
}

static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
	return 0;
}

/*
 * Returns @ns unchanged, or ERR_PTR(-EINVAL) when @flags requests a new
 * PID namespace (CLONE_NEWPID).
 */
static inline struct pid_namespace *copy_pid_ns(u64 flags,
	struct user_namespace *user_ns, struct pid_namespace *ns)
{
	if (flags & CLONE_NEWPID)
		ns = ERR_PTR(-EINVAL);
	return ns;
}

static inline void put_pid_ns(struct pid_namespace *ns)
{
}

/* Calling this in a !CONFIG_PID_NS build is treated as a bug. */
static inline void zap_pid_ns_processes(struct pid_namespace *ns)
{
	BUG();
}

static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
	return 0;
}

static inline bool pidns_is_ancestor(struct pid_namespace *child,
				     struct pid_namespace *ancestor)
{
	return false;
}
#endif /* CONFIG_PID_NS */

extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pid_idr_init(void);

int register_pidns_sysctls(struct pid_namespace *pidns);
void unregister_pidns_sysctls(struct pid_namespace *pidns);

/* True when task_active_pid_ns(@tsk) is the initial PID namespace. */
static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
{
	return task_active_pid_ns(tsk) == &init_pid_ns;
}

#endif /* _LINUX_PID_NS_H */
1 39 39 1 42 42 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/bitops.h>

#include <asm/word-at-a-time.h>

/*
 * Do a strnlen, return length of string *with* final '\0'.
 * 'count' is the user-supplied count, while 'max' is the
 * address space maximum.
 *
 * Return 0 for exceptions (which includes hitting the address
 * space maximum), or 'count+1' if hitting the user-supplied
 * maximum count.
 *
 * NOTE! We can sometimes overshoot the user-supplied maximum
 * if it fits in an aligned 'long'. The caller needs to check
 * the return value against "> max".
 *
 * The caller must have opened a user read-access window before calling
 * this (both call sites in this file do); a faulting load jumps to the
 * 'efault' label and yields 0.
 */
static __always_inline long do_strnlen_user(const char __user *src, unsigned long count, unsigned long max)
{
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
	unsigned long align, res = 0;
	unsigned long c;

	/*
	 * Do everything aligned. But that means that we
	 * need to also expand the maximum..
	 */
	align = (sizeof(unsigned long) - 1) & (unsigned long)src;
	src -= align;
	max += align;

	unsafe_get_user(c, (unsigned long __user *)src, efault);
	/*
	 * NOTE(review): presumably this forces the 'align' bytes that
	 * precede the real start of the string to be non-zero so they
	 * cannot be mistaken for the terminator -- confirm against
	 * aligned_byte_mask().
	 */
	c |= aligned_byte_mask(align);

	for (;;) {
		unsigned long data;
		if (has_zero(c, &data, &constants)) {
			data = prep_zero_mask(c, data, &constants);
			data = create_zero_mask(data);
			/* '+ 1' counts the NUL; '- align' undoes the alignment bias */
			return res + find_zero(data) + 1 - align;
		}
		res += sizeof(unsigned long);
		/* We already handled 'unsigned long' bytes. Did we do it all? */
		if (unlikely(max <= sizeof(unsigned long)))
			break;
		max -= sizeof(unsigned long);
		unsafe_get_user(c, (unsigned long __user *)(src+res), efault);
	}
	/* 'res' was counted from the aligned-down pointer; rebase it to 'src' */
	res -= align;

	/*
	 * Uhhuh. We hit 'max'. But was that the user-specified maximum
	 * too? If so, return the marker for "too long".
	 */
	if (res >= count)
		return count+1;

	/*
	 * Nope: we hit the address space limit, and we still had more
	 * characters the caller would have wanted. That's 0.
	 */
efault:
	return 0;
}

/**
 * strnlen_user - Get the size of a user string INCLUDING final NUL.
 * @str: The string to measure.
 * @count: Maximum count (including NUL character)
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * Get the size of a NUL-terminated string in user space.
 *
 * Returns the size of the string INCLUDING the terminating NUL.
 * If the string is too long, returns a number larger than @count. User
 * has to check the return value against "> count".
 * On exception (or invalid count), returns 0.
 *
 * NOTE! You should basically never use this function. There is
 * almost never any valid case for using the length of a user space
 * string, since the string can be changed at any time by other
 * threads. Use "strncpy_from_user()" instead to get a stable copy
 * of the string.
 */
long strnlen_user(const char __user *str, long count)
{
	unsigned long max_addr, src_addr;

	/* A non-positive limit is treated as an invalid request. */
	if (unlikely(count <= 0))
		return 0;

	/*
	 * Masked-access path: no explicit address-range computation is
	 * done here, so 'count' is passed as both limits.
	 */
	if (can_do_masked_user_access()) {
		long retval;

		str = masked_user_read_access_begin(str);
		retval = do_strnlen_user(str, count, count);
		user_read_access_end();
		return retval;
	}

	max_addr = TASK_SIZE_MAX;
	src_addr = (unsigned long)untagged_addr(str);
	if (likely(src_addr < max_addr)) {
		/* bytes available between the string start and TASK_SIZE_MAX */
		unsigned long max = max_addr - src_addr;
		long retval;

		/*
		 * Truncate 'max' to the user-specified limit, so that
		 * we only have one limit we need to check in the loop
		 */
		if (max > count)
			max = count;

		if (user_read_access_begin(str, max)) {
			retval = do_strnlen_user(str, count, max);
			user_read_access_end();
			return retval;
		}
	}
	/* start address at/above TASK_SIZE_MAX, or access could not be opened */
	return 0;
}
EXPORT_SYMBOL(strnlen_user);
27222 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CLOCK_INLINED_H
#define _ASM_X86_CLOCK_INLINED_H

#include <asm/tsc.h>

/* Only used as an opaque pointer type below; no definition needed here. */
struct clocksource;

/*
 * Inlined clocksource read for x86: returns the value of rdtsc_ordered()
 * as a u64. The @cs argument is not used by this implementation.
 */
static __always_inline u64 arch_inlined_clocksource_read(struct clocksource *cs)
{
	return (u64)rdtsc_ordered();
}

/* Only used as an opaque pointer type below; no definition needed here. */
struct clock_event_device;

/*
 * Inlined clockevent programming for x86: writes @cycles to the
 * MSR_IA32_TSC_DEADLINE MSR. The @evt argument is not used by this
 * implementation.
 */
static __always_inline void
arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *evt)
{
	native_wrmsrq(MSR_IA32_TSC_DEADLINE, cycles);
}

#endif /* _ASM_X86_CLOCK_INLINED_H */
5 5 8 3 2 2 8 8 8 8 8 8 8 8 8 3 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 // SPDX-License-Identifier: GPL-2.0-or-later /* * 842 Software Decompression * * Copyright (C) 2015 Dan Streetman, IBM Corp * * See 842.h for details of the 842 compressed format. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MODULE_NAME "842_decompress" #include "842.h" #include "842_debugfs.h" /* rolling fifo sizes */ #define I2_FIFO_SIZE (2 * (1 << I2_BITS)) #define I4_FIFO_SIZE (4 * (1 << I4_BITS)) #define I8_FIFO_SIZE (8 * (1 << I8_BITS)) static u8 decomp_ops[OPS_MAX][4] = { { D8, N0, N0, N0 }, { D4, D2, I2, N0 }, { D4, I2, D2, N0 }, { D4, I2, I2, N0 }, { D4, I4, N0, N0 }, { D2, I2, D4, N0 }, { D2, I2, D2, I2 }, { D2, I2, I2, D2 }, { D2, I2, I2, I2 }, { D2, I2, I4, N0 }, { I2, D2, D4, N0 }, { I2, D4, I2, N0 }, { I2, D2, I2, D2 }, { I2, D2, I2, I2 }, { I2, D2, I4, N0 }, { I2, I2, D4, N0 }, { I2, I2, D2, I2 }, { I2, I2, I2, D2 }, { I2, I2, I2, I2 }, { I2, I2, I4, N0 }, { I4, D4, N0, N0 }, { I4, D2, I2, N0 }, { I4, I2, D2, N0 }, { I4, I2, I2, N0 }, { I4, I4, N0, N0 }, { I8, N0, N0, N0 } }; struct sw842_param { u8 *in; u8 bit; u64 ilen; u8 *out; u8 *ostart; u64 olen; }; #define beN_to_cpu(d, s) \ ((s) == 2 ? be16_to_cpu(get_unaligned((__be16 *)d)) : \ (s) == 4 ? be32_to_cpu(get_unaligned((__be32 *)d)) : \ (s) == 8 ? 
be64_to_cpu(get_unaligned((__be64 *)d)) : \ 0) static int next_bits(struct sw842_param *p, u64 *d, u8 n); static int __split_next_bits(struct sw842_param *p, u64 *d, u8 n, u8 s) { u64 tmp = 0; int ret; if (n <= s) { pr_debug("split_next_bits invalid n %u s %u\n", n, s); return -EINVAL; } ret = next_bits(p, &tmp, n - s); if (ret) return ret; ret = next_bits(p, d, s); if (ret) return ret; *d |= tmp << s; return 0; } static int next_bits(struct sw842_param *p, u64 *d, u8 n) { u8 *in = p->in, b = p->bit, bits = b + n; if (n > 64) { pr_debug("next_bits invalid n %u\n", n); return -EINVAL; } /* split this up if reading > 8 bytes, or if we're at the end of * the input buffer and would read past the end */ if (bits > 64) return __split_next_bits(p, d, n, 32); else if (p->ilen < 8 && bits > 32 && bits <= 56) return __split_next_bits(p, d, n, 16); else if (p->ilen < 4 && bits > 16 && bits <= 24) return __split_next_bits(p, d, n, 8); if (DIV_ROUND_UP(bits, 8) > p->ilen) return -EOVERFLOW; if (bits <= 8) *d = *in >> (8 - bits); else if (bits <= 16) *d = be16_to_cpu(get_unaligned((__be16 *)in)) >> (16 - bits); else if (bits <= 32) *d = be32_to_cpu(get_unaligned((__be32 *)in)) >> (32 - bits); else *d = be64_to_cpu(get_unaligned((__be64 *)in)) >> (64 - bits); *d &= GENMASK_ULL(n - 1, 0); p->bit += n; if (p->bit > 7) { p->in += p->bit / 8; p->ilen -= p->bit / 8; p->bit %= 8; } return 0; } static int do_data(struct sw842_param *p, u8 n) { u64 v; int ret; if (n > p->olen) return -ENOSPC; ret = next_bits(p, &v, n * 8); if (ret) return ret; switch (n) { case 2: put_unaligned(cpu_to_be16((u16)v), (__be16 *)p->out); break; case 4: put_unaligned(cpu_to_be32((u32)v), (__be32 *)p->out); break; case 8: put_unaligned(cpu_to_be64((u64)v), (__be64 *)p->out); break; default: return -EINVAL; } p->out += n; p->olen -= n; return 0; } static int __do_index(struct sw842_param *p, u8 size, u8 bits, u64 fsize) { u64 index, offset, total = round_down(p->out - p->ostart, 8); int ret; ret = next_bits(p, 
&index, bits); if (ret) return ret; offset = index * size; /* a ring buffer of fsize is used; correct the offset */ if (total > fsize) { /* this is where the current fifo is */ u64 section = round_down(total, fsize); /* the current pos in the fifo */ u64 pos = total - section; /* if the offset is past/at the pos, we need to * go back to the last fifo section */ if (offset >= pos) section -= fsize; offset += section; } if (offset + size > total) { pr_debug("index%x %lx points past end %lx\n", size, (unsigned long)offset, (unsigned long)total); return -EINVAL; } if (size != 2 && size != 4 && size != 8) WARN(1, "__do_index invalid size %x\n", size); else pr_debug("index%x to %lx off %lx adjoff %lx tot %lx data %lx\n", size, (unsigned long)index, (unsigned long)(index * size), (unsigned long)offset, (unsigned long)total, (unsigned long)beN_to_cpu(&p->ostart[offset], size)); memcpy(p->out, &p->ostart[offset], size); p->out += size; p->olen -= size; return 0; } static int do_index(struct sw842_param *p, u8 n) { switch (n) { case 2: return __do_index(p, 2, I2_BITS, I2_FIFO_SIZE); case 4: return __do_index(p, 4, I4_BITS, I4_FIFO_SIZE); case 8: return __do_index(p, 8, I8_BITS, I8_FIFO_SIZE); default: return -EINVAL; } } static int do_op(struct sw842_param *p, u8 o) { int i, ret = 0; if (o >= OPS_MAX) return -EINVAL; for (i = 0; i < 4; i++) { u8 op = decomp_ops[o][i]; pr_debug("op is %x\n", op); switch (op & OP_ACTION) { case OP_ACTION_DATA: ret = do_data(p, op & OP_AMOUNT); break; case OP_ACTION_INDEX: ret = do_index(p, op & OP_AMOUNT); break; case OP_ACTION_NOOP: break; default: pr_err("Internal error, invalid op %x\n", op); return -EINVAL; } if (ret) return ret; } if (sw842_template_counts) atomic_inc(&template_count[o]); return 0; } /** * sw842_decompress * * Decompress the 842-compressed buffer of length @ilen at @in * to the output buffer @out, using no more than @olen bytes. 
* * The compressed buffer must be only a single 842-compressed buffer, * with the standard format described in the comments in 842.h * Processing will stop when the 842 "END" template is detected, * not the end of the buffer. * * Returns: 0 on success, error on failure. The @olen parameter * will contain the number of output bytes written on success, or * 0 on error. */ int sw842_decompress(const u8 *in, unsigned int ilen, u8 *out, unsigned int *olen) { struct sw842_param p; int ret; u64 op, rep, tmp, bytes, total; u64 crc; p.in = (u8 *)in; p.bit = 0; p.ilen = ilen; p.out = out; p.ostart = out; p.olen = *olen; total = p.olen; *olen = 0; do { ret = next_bits(&p, &op, OP_BITS); if (ret) return ret; pr_debug("template is %lx\n", (unsigned long)op); switch (op) { case OP_REPEAT: ret = next_bits(&p, &rep, REPEAT_BITS); if (ret) return ret; if (p.out == out) /* no previous bytes */ return -EINVAL; /* copy rep + 1 */ rep++; if (rep * 8 > p.olen) return -ENOSPC; while (rep-- > 0) { memcpy(p.out, p.out - 8, 8); p.out += 8; p.olen -= 8; } if (sw842_template_counts) atomic_inc(&template_repeat_count); break; case OP_ZEROS: if (8 > p.olen) return -ENOSPC; memset(p.out, 0, 8); p.out += 8; p.olen -= 8; if (sw842_template_counts) atomic_inc(&template_zeros_count); break; case OP_SHORT_DATA: ret = next_bits(&p, &bytes, SHORT_DATA_BITS); if (ret) return ret; if (!bytes || bytes > SHORT_DATA_BITS_MAX) return -EINVAL; while (bytes-- > 0) { ret = next_bits(&p, &tmp, 8); if (ret) return ret; *p.out = (u8)tmp; p.out++; p.olen--; } if (sw842_template_counts) atomic_inc(&template_short_data_count); break; case OP_END: if (sw842_template_counts) atomic_inc(&template_end_count); break; default: /* use template */ ret = do_op(&p, op); if (ret) return ret; break; } } while (op != OP_END); /* * crc(0:31) is saved in compressed data starting with the * next bit after End of stream template. */ ret = next_bits(&p, &crc, CRC_BITS); if (ret) return ret; /* * Validate CRC saved in compressed data. 
*/ if (crc != (u64)crc32_be(0, out, total - p.olen)) { pr_debug("CRC mismatch for decompression\n"); return -EINVAL; } if (unlikely((total - p.olen) > UINT_MAX)) return -ENOSPC; *olen = total - p.olen; return 0; } EXPORT_SYMBOL_GPL(sw842_decompress); static int __init sw842_init(void) { if (sw842_template_counts) sw842_debugfs_create(); return 0; } module_init(sw842_init); static void __exit sw842_exit(void) { if (sw842_template_counts) sw842_debugfs_remove(); } module_exit(sw842_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Software 842 Decompressor"); MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
12 79 22 80 18 26 22 20 25 15 25 149 113 80 80 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 // SPDX-License-Identifier: GPL-2.0 /* * lib/minmax.c: windowed min/max tracker * * Kathleen Nichols' algorithm for tracking the minimum (or maximum) * value of a data stream over some fixed time interval. (E.g., * the minimum RTT over the past five minutes.) It uses constant * space and constant time per update yet almost always delivers * the same minimum as an implementation that has to keep all the * data in the window. * * The algorithm keeps track of the best, 2nd best & 3rd best min * values, maintaining an invariant that the measurement time of * the n'th best >= n-1'th best. It also makes sure that the three * values are widely separated in the time window since that bounds * the worse case error when that data is monotonically increasing * over the window. * * Upon getting a new min, we can forget everything earlier because * it has no value - the new min is <= everything else in the window * by definition and it's the most recent. So we restart fresh on * every new min and overwrites 2nd & 3rd choices. The same property * holds for 2nd & 3rd best. */ #include <linux/module.h> #include <linux/win_minmax.h> /* As time advances, update the 1st, 2nd, and 3rd choices. */ static u32 minmax_subwin_update(struct minmax *m, u32 win, const struct minmax_sample *val) { u32 dt = val->t - m->s[0].t; if (unlikely(dt > win)) { /* * Passed entire window without a new val so make 2nd * choice the new val & 3rd choice the new 2nd choice. * we may have to iterate this since our 2nd choice * may also be outside the window (we checked on entry * that the third choice was in the window). 
*/ m->s[0] = m->s[1]; m->s[1] = m->s[2]; m->s[2] = *val; if (unlikely(val->t - m->s[0].t > win)) { m->s[0] = m->s[1]; m->s[1] = m->s[2]; m->s[2] = *val; } } else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) { /* * We've passed a quarter of the window without a new val * so take a 2nd choice from the 2nd quarter of the window. */ m->s[2] = m->s[1] = *val; } else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) { /* * We've passed half the window without finding a new val * so take a 3rd choice from the last half of the window */ m->s[2] = *val; } return m->s[0].v; } /* Check if new measurement updates the 1st, 2nd or 3rd choice max. */ u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas) { struct minmax_sample val = { .t = t, .v = meas }; if (unlikely(val.v >= m->s[0].v) || /* found new max? */ unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ return minmax_reset(m, t, meas); /* forget earlier samples */ if (unlikely(val.v >= m->s[1].v)) m->s[2] = m->s[1] = val; else if (unlikely(val.v >= m->s[2].v)) m->s[2] = val; return minmax_subwin_update(m, win, &val); } EXPORT_SYMBOL(minmax_running_max); /* Check if new measurement updates the 1st, 2nd or 3rd choice min. */ u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas) { struct minmax_sample val = { .t = t, .v = meas }; if (unlikely(val.v <= m->s[0].v) || /* found new min? */ unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ return minmax_reset(m, t, meas); /* forget earlier samples */ if (unlikely(val.v <= m->s[1].v)) m->s[2] = m->s[1] = val; else if (unlikely(val.v <= m->s[2].v)) m->s[2] = val; return minmax_subwin_update(m, win, &val); } EXPORT_SYMBOL(minmax_running_min);
1 2 1 1 3 3 182 182 22 22 37313 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
 *
 * Based on the original implementation which is:
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Parts of the original code have been moved to arch/x86/vdso/vma.c
 *
 * This file implements vsyscall emulation.  vsyscalls are a legacy ABI:
 * Userspace can request certain kernel services by calling fixed
 * addresses.  This concept is problematic:
 *
 * - It interferes with ASLR.
 * - It's awkward to write code that lives in kernel addresses but is
 *   callable by userspace at fixed addresses.
 * - The whole concept is impossible for 32-bit compat userspace.
 * - UML cannot easily virtualize a vsyscall.
 *
 * As of mid-2014, I believe that there is no new userspace code that
 * will use a vsyscall if the vDSO is present.  I hope that there will
 * soon be no new userspace code that will ever use a vsyscall.
 *
 * The code in this file emulates vsyscalls when notified of a page
 * fault or a general protection fault to a vsyscall address.
 */

#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/sched/signal.h>
#include <linux/mm_types.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>

#include <asm/vsyscall.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/traps.h>

#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"

/*
 * Active vsyscall policy. The build-time default is NONE or XONLY
 * depending on the Kconfig choice; the "vsyscall=" early parameter
 * (vsyscall_setup() below) can override it, including to EMULATE.
 */
static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
	NONE;
#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
	XONLY;
#else
	#error VSYSCALL config is broken
#endif

/*
 * Parse the "vsyscall=" early parameter ("emulate", "xonly" or "none").
 * Selecting "emulate" while LASS is enabled clears the LASS CPU
 * capability and logs a one-time warning.
 *
 * Returns 0 on success, -EINVAL for a missing or unrecognized value.
 */
static int __init vsyscall_setup(char *str)
{
	if (str) {
		if (!strcmp("emulate", str))
			vsyscall_mode = EMULATE;
		else if (!strcmp("xonly", str))
			vsyscall_mode = XONLY;
		else if (!strcmp("none", str))
			vsyscall_mode = NONE;
		else
			return -EINVAL;

		if (cpu_feature_enabled(X86_FEATURE_LASS) && vsyscall_mode == EMULATE) {
			setup_clear_cpu_cap(X86_FEATURE_LASS);
			pr_warn_once("x86/cpu: Disabling LASS due to vsyscall=emulate\n");
		}

		return 0;
	}

	return -EINVAL;
}
early_param("vsyscall", vsyscall_setup);

/*
 * Emit a rate-limited log line at @level describing a rejected vsyscall:
 * the current task's comm and pid, @message, and a few registers.
 * Does nothing when show_unhandled_signals is clear.
 */
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
			      const char *message)
{
	if (!show_unhandled_signals)
		return;

	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
			   level, current->comm, task_pid_nr(current),
			   message, regs->ip, regs->cs,
			   regs->sp, regs->ax, regs->si, regs->di);
}

/*
 * Map a faulting address to a vsyscall slot number.
 *
 * Only VSYSCALL_ADDR plus a multiple of 0x400 (bits 10-11) is accepted,
 * and of the four possible slots only 0, 1 and 2 are valid.
 *
 * Returns the slot number (0-2) or -EINVAL.
 */
static int addr_to_vsyscall_nr(unsigned long addr)
{
	int nr;

	if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
		return -EINVAL;

	nr = (addr & 0xC00UL) >> 10;
	if (nr >= 3)
		return -EINVAL;

	return nr;
}

/*
 * Check that [@ptr, @ptr + @size) passes access_ok(). On failure, fill
 * in the thread's fault information as for a user-mode write page fault
 * at @ptr, queue SIGSEGV/SEGV_MAPERR, and return false.
 */
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
	if (!access_ok((void __user *)ptr, size)) {
		struct thread_struct *thread = &current->thread;

		thread->error_code	= X86_PF_USER | X86_PF_WRITE;
		thread->cr2		= ptr;
		thread->trap_nr		= X86_TRAP_PF;

		force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);
		return false;
	} else {
		return true;
	}
}

/*
 * Emulate the vsyscall at @address on behalf of the task described by
 * @regs.
 *
 * Returns false when the fault is not handled here (not 64-bit user
 * mode, or vsyscall=none); true otherwise, including the cases where a
 * signal was raised instead of completing the call. On a completed
 * call the result is placed in regs->ax and a "ret" is emulated by
 * loading regs->ip from the user stack and popping 8 bytes.
 */
static bool __emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
	unsigned long caller;
	int vsyscall_nr, syscall_nr, tmp;
	long ret;
	unsigned long orig_dx;

	/* Confirm that the fault happened in 64-bit user mode */
	if (!user_64bit_mode(regs))
		return false;

	if (vsyscall_mode == NONE) {
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall attempted with vsyscall=none");
		return false;
	}

	vsyscall_nr = addr_to_vsyscall_nr(address);

	trace_emulate_vsyscall(vsyscall_nr);

	if (vsyscall_nr < 0) {
		warn_bad_vsyscall(KERN_WARNING, regs,
				  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
		goto sigsegv;
	}

	/* The return address the emulated "ret" will jump to. */
	if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
		warn_bad_vsyscall(KERN_WARNING, regs,
				  "vsyscall with bad stack (exploit attempt?)");
		goto sigsegv;
	}

	/*
	 * Check for access_ok violations and find the syscall nr.
	 *
	 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
	 * 64-bit, so we don't need to special-case it here.  For all the
	 * vsyscalls, NULL means "don't write anything" not "write it at
	 * address 0".
	 *
	 * vsyscall_nr is 0, 1 or 2 here (addr_to_vsyscall_nr() returns
	 * nothing else on success), so syscall_nr is always assigned.
	 */
	switch (vsyscall_nr) {
	case 0:
		if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) ||
		    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_gettimeofday;
		break;

	case 1:
		if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_time;
		break;

	case 2:
		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
		    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_getcpu;
		break;
	}

	/*
	 * Handle seccomp.  regs->ip must be the original value.
	 * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
	 *
	 * We could optimize the seccomp disabled case, but performance
	 * here doesn't matter.
	 */
	regs->orig_ax = syscall_nr;
	regs->ax = -ENOSYS;
	tmp = secure_computing();
	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
		warn_bad_vsyscall(KERN_DEBUG, regs,
				  "seccomp tried to change syscall nr or ip");
		force_exit_sig(SIGSYS);
		return true;
	}
	regs->orig_ax = -1;
	if (tmp)
		goto do_ret;  /* skip requested */

	/*
	 * With a real vsyscall, page faults cause SIGSEGV.
	 */
	ret = -EFAULT;
	switch (vsyscall_nr) {
	case 0:
		/* this decodes regs->di and regs->si on its own */
		ret = __x64_sys_gettimeofday(regs);
		break;

	case 1:
		/* this decodes regs->di on its own */
		ret = __x64_sys_time(regs);
		break;

	case 2:
		/* while we could clobber regs->dx, we didn't in the past... */
		orig_dx = regs->dx;
		regs->dx = 0;
		/* this decodes regs->di, regs->si and regs->dx on its own */
		ret = __x64_sys_getcpu(regs);
		regs->dx = orig_dx;
		break;
	}

check_fault:
	if (ret == -EFAULT) {
		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall fault (exploit attempt?)");
		goto sigsegv;
	}

	regs->ax = ret;

do_ret:
	/* Emulate a ret instruction. */
	regs->ip = caller;
	regs->sp += 8;
	return true;

sigsegv:
	force_sig(SIGSEGV);
	return true;
}

/*
 * Page-fault entry point. Only non-write, user-mode faults whose
 * faulting address equals regs->ip are passed on to the emulation.
 *
 * Returns true if the fault was consumed by the emulation code.
 */
bool emulate_vsyscall_pf(unsigned long error_code, struct pt_regs *regs,
			 unsigned long address)
{
	/* Write faults or kernel-privilege faults never get fixed up. */
	if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
		return false;

	/*
	 * Assume that faults at regs->ip are because of an instruction
	 * fetch. Return early and avoid emulation for faults during
	 * data accesses:
	 */
	if (address != regs->ip) {
		 /* Failed vsyscall read */
		if (vsyscall_mode == EMULATE)
			return false;

		/* User code tried and failed to read the vsyscall page. */
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
		return false;
	}

	/*
	 * X86_PF_INSTR is only set when NX is supported.  When
	 * available, use it to double-check that the emulation code
	 * is only being used for instruction fetches:
	 */
	if (cpu_feature_enabled(X86_FEATURE_NX))
		WARN_ON_ONCE(!(error_code & X86_PF_INSTR));

	return __emulate_vsyscall(regs, address);
}

/*
 * General-protection-fault entry point. Emulation is attempted only
 * when LASS is enabled and regs->ip is a vsyscall address.
 *
 * Returns true if the fault was consumed by the emulation code.
 */
bool emulate_vsyscall_gp(struct pt_regs *regs)
{
	/* Without LASS, vsyscall accesses are expected to generate a #PF */
	if (!cpu_feature_enabled(X86_FEATURE_LASS))
		return false;

	/* Emulate only if the RIP points to the vsyscall address */
	if (!is_vsyscall_vaddr(regs->ip))
		return false;

	return __emulate_vsyscall(regs, regs->ip);
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static const char *gate_vma_name(struct vm_area_struct *vma)
{
	return "[vsyscall]";
}
static const struct vm_operations_struct gate_vma_ops = {
	.name = gate_vma_name,
};
/* One page at VSYSCALL_ADDR; map_vsyscall() drops VM_READ in XONLY mode. */
static struct vm_area_struct gate_vma __ro_after_init = {
	.vm_start	= VSYSCALL_ADDR,
	.vm_end		= VSYSCALL_ADDR + PAGE_SIZE,
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC,
	.vm_ops		= &gate_vma_ops,
};

/*
 * Return the vsyscall pseudo VMA for @mm, or NULL when vsyscalls are
 * disabled (vsyscall=none) or, with CONFIG_COMPAT, when @mm is NULL or
 * does not have MM_CONTEXT_HAS_VSYSCALL set.
 */
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_COMPAT
	if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags))
		return NULL;
#endif
	if (vsyscall_mode == NONE)
		return NULL;
	return &gate_vma;
}

/* Return non-zero if @addr lies inside the gate VMA that applies to @mm. */
int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(mm);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable mm, typically from interrupt
 * context. It is less reliable than using a task's mm and may give
 * false positives.
 */
int in_gate_area_no_mm(unsigned long addr)
{
	return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
}

/*
 * The VSYSCALL page is the only user-accessible page in the kernel address
 * range.  Normally, the kernel page tables can have _PAGE_USER clear, but
 * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
 * are enabled.
 *
 * Some day we may create a "minimal" vsyscall mode in which we emulate
 * vsyscalls but leave the page not present.  If so, we skip calling
 * this.
 */
void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	/* Set _PAGE_USER at every level from the pgd down to the pmd. */
	pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
	p4d = p4d_offset(pgd, VSYSCALL_ADDR);
	set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
	pud = pud_offset(p4d, VSYSCALL_ADDR);
	set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
	pmd = pmd_offset(pud, VSYSCALL_ADDR);
	set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
}

/*
 * Boot-time setup of the vsyscall page according to vsyscall_mode:
 * EMULATE installs a real fixmap mapping and marks the covering page
 * tables user-accessible; XONLY only re-initializes the gate VMA flags
 * to VM_EXEC.
 */
void __init map_vsyscall(void)
{
	extern char __vsyscall_page;
	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);

	/*
	 * For full emulation, the page needs to exist for real.  In
	 * execute-only mode, there is no PTE at all backing the vsyscall
	 * page.
	 */
	if (vsyscall_mode == EMULATE) {
		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
			     PAGE_KERNEL_VVAR);
		set_vsyscall_pgtable_user_bits(swapper_pg_dir);
	}

	if (vsyscall_mode == XONLY)
		vm_flags_init(&gate_vma, VM_EXEC);

	/* The fixmap slot must land exactly on the ABI-fixed address. */
	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
		     (unsigned long)VSYSCALL_ADDR);
}
166 67 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 /* SPDX-License-Identifier: GPL-2.0 */ #und